Example #1
0
def get_all_dossiers(**kwargs):
    for year in range(datetime.date.today().year, 1971, -1):
        tree = fetch(
            'https://oeil.secure.europarl.europa.eu/oeil/widgets/resultwidget.do?lang=en&noHeader=false&q=objectReferenceN:N-%s/*\(*\)'
            % (year))
        tmp = tree.xpath(
            '//span[@class="ep_name" and (starts-with(normalize-space(),"Results found :") or starts-with(normalize-space(),"Result found :"))]/text()'
        )
        if not tmp:
            log(1, "no dossiers found for %d" % year)
            raise ValueError("failed to find number of dossiers for year %d" %
                             year)
        tmp = unws(tmp[0])
        count = int(tmp[tmp.index(":") + 1:])
        log(4, "year %d, count %d" % (year, count))
        #tree=fetch('https://oeil.secure.europarl.europa.eu/oeil/popups/printresultlist.xml?q=objectReferenceN:N-%s/????\(*\)&lang=en&s1&all&limit=%s&lang=en'
        #           % (year, count), prune_xml=True)
        tree = fromstring(
            fetch_raw(
                'https://oeil.secure.europarl.europa.eu/oeil/popups/printresultlist.xml?q=objectReferenceN:N-%s/*\(*\)&lang=en&s1&all&limit=%s&lang=en'
                % (year, count)).encode("utf8"))
        items = tree.xpath('//item')
        i = 0
        for item in items:
            url = html.unescape(
                urljoin(BASE_URL, str(item.xpath('./link/text()')[0])))
            ref = unws(item.xpath('./reference/text()')[0])
            if '*' in ref: ref = ref[:ref.index('*')]
            log(4, 'adding dossier scraping job %s' % url)
            payload = dict(kwargs)
            payload['url'] = url
            add_job('dossier', payload=payload)
            i += 1
        if i != count: log(1, "total %d, expected %d" % (i, count))
Example #2
0
def splitNames(text):
    text = text.split(' on behalf ', 1)[0]
    res = []
    for delim in (', ', ' and ', ' & ', '; ', ','):
        if not res:
            res = list(filter(None, [
                item[:-1] if item[-1] in [',', "'", ';'] else item
                for item in unws(text).split(delim) if item
            ]))
            continue
        res = list(filter(None, [
            item[:-1] if item[-1] in [',', "'", ';'] else item for elem in res
            for item in elem.split(delim) if item
        ]))
    # only for devel.
    # for mep in res:
    #     if mep.startswith('on behalf of'): continue
    #     if mep.endswith('Shadow)'):
    #         logger.info('shadow: %s' % mep)
    res = [
        mep if not mep.endswith('Shadow)') else mep[:mep.rfind(' (')]
        for mep in res if not mep.startswith('on behalf of')
    ]
    res = [unws(y) for x in res for y in mansplits.get(x, [x])]
    return [mepmaps.get(x, x) for x in res]
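# Usage sketch with hypothetical names; assumes the module-level mansplits and
# mepmaps lookup tables have no entries for them, so the names pass through unchanged:
names = splitNames("Jane DOE, John SMITH and Max MUSTERMANN on behalf of the XYZ Group")
print(names)   # expected: ['Jane DOE', 'John SMITH', 'Max MUSTERMANN']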
Example #3
0
def getAddress(root):
    res={}
    for div in root.xpath('../following-sibling::div[@class="boxcontent " or @class="boxcontent nobordertop"]/ul[@class="contact"]'):
        key=unws(''.join(div.xpath('./preceding-sibling::h4/text()')))
        if key not in ['Bruxelles', 'Strasbourg', 'Postal address', 'Luxembourg']:
            continue
        if key=='Bruxelles': key=u'Brussels'
        elif key=='Postal address': key=u'Postal'
        res[key]={}
        if key in ['Brussels', 'Strasbourg', 'Luxembourg']:
            tmp=div.xpath('./following-sibling::ul[@class="link_collection_noborder"]//span[@class="phone"]/text()')
            if tmp:
                res[key][u'Phone'] = unws(tmp[0]).replace('(0)','')
            tmp=div.xpath('./following-sibling::ul[@class="link_collection_noborder"]//span[@class="fax"]/text()')
            if tmp:
                res[key][u'Fax'] = unws(tmp[0]).replace('(0)','')
        tmp=[unws(x) for x in div.xpath('./li[@class="address"]//text()') if len(unws(x))]
        if key=='Strasbourg':
            res[key][u'Address']=dict(zip([u'Organization',u'Building', u'Office', u'Street',u'Zip1', u'Zip2'],tmp))
            res[key][u'Address']['City']=res[key]['Address']['Zip2'].split()[1]
            res[key][u'Address']['Zip2']=res[key]['Address']['Zip2'].split()[0]
            res[key][u'Address']['building_code']=buildings.get(res[key]['Address']['Building'])
        elif key=='Brussels':
            res[key][u'Address']=dict(zip([u'Organization',u'Building', u'Office', u'Street',u'Zip'],tmp))
            res[key][u'Address']['City']=res[key]['Address']['Zip'].split()[1]
            res[key][u'Address']['Zip']=res[key]['Address']['Zip'].split()[0]
            res[key][u'Address']['building_code']=buildings.get(res[key]['Address']['Building'])
        elif key=='Luxembourg':
            res[key][u'Address']=tmp
        elif key=='Postal':
            res[key]=tmp
        else:
            logger.error("wtf %s" % key)
    return res
Example #4
0
def scrape_basic(tree):
    res=form2obj((tree.xpath('//table[@id="technicalInformations"]') or [None])[0],detailsheaders) or {}
    if 'dossier_of_the_committee' in res:
        res['dossier_of_the_committee']=';'.join(sorted((unws(x) for x in res['dossier_of_the_committee'].split(';'))))
    table=(tree.xpath('//table[@id="basic_information"]') or [None])[0]
    if table is None: return res
    res.update({'stage_reached': (table.xpath('.//p[@class="pf_stage"]/text()') or [''])[0].strip(),
                'reference': (table.xpath('.//span[@class="basic_reference"]/text()') or [''])[0].strip(),
                'type': (table.xpath('.//p[@class="basic_procedurefile"]/text()') or [''])[0].strip(),
                'title': (table.xpath('.//p[@class="basic_title"]/text()') or [''])[0].strip(),
                })
    if '' in res:
        del res['']
    if 'legal_basis' in res:
        res[u'legal_basis']=sorted((unws(x) for x in res['legal_basis'].split(';')))
    fields=table.xpath('.//p[@class="basic_content"]/*')
    firstline=u' '.join((table.xpath('.//p[@class="basic_content"]/text()') or [''])[0].split())
    attrib=u'summary'
    if len(firstline):
        if not attrib in res: res[attrib]=[]
        res[attrib]=[firstline]
    for elem in fields:
        if elem.tag=='br' and elem.tail and elem.tail.strip():
            if not attrib in res: res[attrib]=[]
            res[attrib].append(u' '.join(elem.tail.split()))
        elif elem.tag=='strong':
            if attrib in res and res[attrib]:
                res[attrib].sort()
            attrib=u' '.join(elem.xpath('text()')[0].split())
            attrib=detailsheaders.get(attrib,attrib).lower().replace(u" ",u"_")
            if attrib:
                res[attrib]=[]
    return res
Example #5
0
def getdoclist(node):
    txt = [x for x in node.xpath('.//text()') if unws(x)]
    i = 0
    res = []
    while i + 1 < len(txt):
        if unws(txt[i])[-1] == u"\u2013":
            res.append({
                u'type': unws(txt[i])[:-2],
                u'title': unws(txt[i + 1]),
                u'url': urljoin(BASE_URL, txt[i + 1].getparent().get('href'))
            })
            i += 2
        elif len(unws(txt[i]).split(u" \u2013 ")) > 1:
            res.append({
                u'type': unws(txt[i].split(u" \u2013 ")[0]),
                u'title': unws(txt[i].split(u" \u2013 ")[1]
                               if len(txt[i].split(u" \u2013 ")) > 1 else u'')
            })
            i += 1
        else:
            i += 1
    if i < len(txt) and len(unws(txt[i]).split(u" \u2013 ")) > 1:
        res.append({
            u'type': unws(txt[i]).split(u" \u2013 ")[0],
            u'title': unws(txt[i]).split(u" \u2013 ")[1]
        })
    return res
Example #6
0
def parse_hist_date(txt):
    tmp = txt.split(' / ')
    if len(tmp) == 2:
        (start, end) = tmp
    elif len(tmp) == 1:
        start = txt.split()[0]
        end = "31-12-9999"
    else:
        raise ValueError
    return datetime.strptime(unws(start), u"%d-%m-%Y"), datetime.strptime(
        unws(end), u"%d-%m-%Y")
Example #7
0
def parse_addr(root):
    # addresses
    addrs = {}
    for li in root.xpath('//section[@id="contacts"]//div[@class="card-body"]'):
        key = unws(''.join(li.xpath('./div[1]//text()')))
        if key == 'Bruxelles': key = 'Brussels'
        addrs[key] = {}
        if key in ['Brussels', 'Strasbourg']:
            phone = li.xpath(
                './/li/i[@class="erpl_icon erpl_icon-phone"]/../a/@href')
            if phone:
                addrs[key]['Phone'] = phone[0][4:].replace(
                    "+33(0)388", "+333 88").replace("+32(0)228", "+322 28")
            fax = li.xpath(
                './/li/i[@class="erpl_icon erpl_icon-fax"]/../a/@href')
            if fax:
                addrs[key]['Fax'] = fax[0][4:].replace(
                    "+33(0)388", "+333 88").replace("+32(0)228", "+322 28")
        #tmp=[unws(x) for x in li.xpath('.//li[1]//text()') if len(unws(x))]
        tmp = [
            unws(x) for x in li.xpath(
                './/div[@class="erpl_contact-card-list"]/span/text()')
            if len(unws(x))
        ]
        if key == 'Strasbourg':
            addrs[key][u'Address'] = dict(zip(
                [u'Organization', u'Building', u'Office', u'Street', u'Zip1', u'Zip2'],
                tmp))
            addrs[key][u'Address']['City'] = addrs[key]['Address']['Zip2'].split()[1]
            addrs[key][u'Address']['Zip2'] = addrs[key]['Address']['Zip2'].split()[0]
            addrs[key][u'Address']['building_code'] = buildings.get(
                addrs[key]['Address']['Building'])
        elif key == u'Brussels':
            addrs[key][u'Address'] = dict(zip(
                [u'Organization', u'Building', u'Office', u'Street', u'Zip'],
                tmp))
            addrs[key][u'Address']['City'] = addrs[key]['Address']['Zip'].split()[1]
            addrs[key][u'Address']['Zip'] = addrs[key]['Address']['Zip'].split()[0]
            addrs[key][u'Address']['building_code'] = buildings.get(
                addrs[key]['Address']['Building'])
        elif key == 'Luxembourg':
            addrs[key][u'Address'] = tmp
        elif key == 'Postal address':
            addrs['Postal'] = tmp
    return addrs
Example #8
0
def toLinks(node):
    if node is None: return
    for br in node.xpath("br"):
        br.text="\n"
    ret=[]
    for line in node.xpath(".//text()"):
        if len(unws(line))<1:
            continue
        if line.getparent().tag=='a':
            ret.append({u'title': unws(line), 'url': unicode(urljoin(BASE_URL,line.getparent().get('href')),'utf8')})
        else:
            ret.append({u'title': unws(line)})
    return ret
Example #9
0
def scan(d, node):
    """ helper for dump_schema"""
    if not 'types' in node:
        node['types'] = {}
    if isinstance(d, dict):
        for k, v in d.items():
            if not 'items' in node:
                node['items'] = {}
            if not k in node['items']:
                node['items'][k] = {'name': k}
            node['items'][k] = scan(v, node['items'][k])
    elif isinstance(d, list):
        if not 'elems' in node:
            node['elems'] = {}
        for v in d:
            stype = type(v)
            node['elems'][stype] = scan(v, node['elems'].get(stype, {}))
    if isinstance(d, str):
        d = unws(d) or None
    mtype = type(d)
    tmp = node['types'].get(mtype, {'count': 0, 'example': None})
    tmp['count'] += 1
    if d and not tmp['example'] and not isinstance(d, dict):
        tmp['example'] = d
    node['types'][mtype] = tmp
    return node
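# Usage sketch: accumulate a schema over two hypothetical documents
# (assumes scan() above and the module's unws() helper):
schema = {}
for doc in ({'title': 'A ', 'tags': ['x', 'y']}, {'title': 'B', 'tags': []}):
    schema = scan(doc, schema)
# expected, roughly:
#   schema['items']['title']['types'][str]['count'] == 2
#   schema['items']['tags']['elems'][str]['types'][str]['count'] == 2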
Example #10
0
def extract_table(table, url, date=None):
    trs = table.xpath('.//tr')
    header = trs[0]
    tds = header.xpath('.//td')
    if len(tds) < 2:
        log(
            1, "vote table has less than two columns in the header: %s %s" %
            (url, tostring(trs[0])))
        raise ValueError
    type = junws(tds[1])
    if type not in {"+", "-", "0"}:
        log(
            1, "vote header type is unexpected value %s in %s" %
            (repr(type), url))
        raise ValueError
    res = {'total': int(junws(tds[0])), 'type': type, 'meps': []}
    for tr in trs[1:]:
        tds = tr.xpath('.//td')
        if len(tds) < 2:
            log(
                1, "vote table has less than two columns in the body: %s %s" %
                (url, tostring(tr)))
            raise ValueError
        #grp = junws(tds[0]).split()
        for meps in tds[1].xpath(".//p"):
            meps = junws(meps)
            if not meps: continue
            for m in meps.split(','):
                m = unws(m)
                if not m: continue
                mepid = db.getMep(m, date=date)
                if not mepid:
                    log(2, "could not resolve MEP name: %s" % m)
                res['meps'].append(mepid or m)
    return res
Example #11
0
def istype(text):
    # get type
    found=False
    for t in types:
        if unws(text).lower().startswith(t.lower()):
            found=True
            break
    return found
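# Usage sketch; assumes the module-level `types` list contains an entry such as
# 'Draft legislative resolution' (hypothetical) and nothing matching "Justification":
print(istype('Draft legislative resolution on transparency'))  # True
print(istype('Justification'))                                  # False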
Example #12
0
def sidebar_check(root, url):
    sidebar = root.xpath(
        '//div[@id="sectionsNavPositionInitial"]//div[@class="erpl_side-navigation"]/div/ul'
    )
    if len(sidebar) != 1:
        log(1, "sidebar has not 1 element: %s" % url)
        raise ValueError
    for li in sidebar[0].xpath('./li'):
        title = li.xpath('./a/span[@class="t-x"]/text()')
        if len(title) != 1:
            log(1, "title has not 1 element: %s" % url)
            raise ValueError
        title = unws(title[0])
        if title not in known_sidebar:
            log(2, '"%s" not in known_sidebar items, in %s' % (title, url))
        subtitles = li.xpath('.//div/ul/li/a/span[@class="t-x"]/text()')
        for s in subtitles:
            s = unws(s)
            if s not in known_sidebar.get(title, []):
                log(
                    2, '"%s" -> "%s" not in known_sidebar items, in %s' %
                    (title, s, url))
Example #13
0
def addchangednames(mep):
    mepid = mep['UserID']
    m = db.get('ep_meps', mepid)
    if not m: return mep
    prevnames = [
        c['data'][0] for changes in m.get('changes', {}).values()
        for c in changes if c['path'] == ['Name', 'full']
    ]
    aliases = set(mep['Name']['aliases'])
    for name in prevnames:
        aliases |= set(mangleName(name, mepid)['aliases'])
    mep['Name']['aliases'] = sorted(
        [x for x in set(unws(n) for n in aliases) if x])
    return mep
Example #14
0
def mangleName(name, id):
    sur = []
    family = []
    tmp = name.split(' ')
    title = None
    for i, token in enumerate(tmp):
        if ((token.isupper() and not isabbr(token))
                or token in ['de', 'van', 'von', 'del']
                or (token == 'in' and i + 1 < len(tmp) and tmp[i + 1] == "'t")
                or (token[:2] == 'Mc' and token[2:].isupper())):
            family = tmp[i:]
            break
        else:
            sur.append(token)
    sur = u' '.join(sur)
    family = u' '.join(family)
    for t in TITLES:
        if sur.endswith(t):
            sur = sur[:-len(t)]
            title = t
            break
        if sur.startswith(t):
            sur = sur[len(t) + 1:]
            title = t
            break
    res = {u'full': name, u'sur': sur, u'family': family}

    aliases = set(
        [family, name,
         u"%s %s" % (sur, family),
         u"%s %s" % (family, sur)])
    if title:
        res[u'title'] = title
        aliases |= set([(u"%s %s" % (title, family)),
                        (u"%s %s %s" % (title, family, sur)),
                        (u"%s %s %s" % (title, sur, family)),
                        (u"%s %s %s" % (sur, title, family)),
                        (u"%s %s %s" % (sur, family, title)),
                        (u"%s %s %s" % (family, sur, title)),
                        (u"%s %s %s" % (family, title, sur))])
    if id in MEPS_ALIASES:
        aliases |= set(MEPS_ALIASES[id])
    res[u'aliases'] = sorted([x for x in set(unws(n) for n in aliases) if x])
    return res
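# Usage sketch with a hypothetical name and id; assumes isabbr("DOE") is False and
# that TITLES and MEPS_ALIASES contain no matching entries:
print(mangleName("Jane DOE", 0))
# expected, roughly: {'full': 'Jane DOE', 'sur': 'Jane', 'family': 'DOE',
#                     'aliases': ['DOE', 'DOE Jane', 'Jane DOE']}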
Example #15
0
def scrape(url, **kwargs):
    log(3,"scraping %s" % (url))
    root = getXML(url)
    if root is None:
        log(1,"could not get votes for", url)
        return # angrily o/
    log(3, "processing plenary votes xml from %s" % url)
    # root is:
    #PV.RollCallVoteResults EP.Number="PE 533.923" EP.Reference="P7_PV(2014)04-17" Sitting.Date="2014-04-17" Sitting.Identifier="1598443"
    votes=[]
    for vote in root.xpath('//RollCallVote.Result'):
        # hrmpf, some EP seriously used the braindead Y-d-m format sometimes in vote timestamps :/
        time = vote.get('Date')
        if len(time.split()) == 2:
            ts = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
        else:
            ts = datetime.strptime(time, "%Y-%m-%d")
        tmp=vote.get('Identifier')
        if tmp:
            voteid = int(tmp)
        else:
            tmp = vote.get('Number')
            if not tmp:
                log(1, "blimey, could not deduce an id for the vote in %s" % url)
                raise ValueError("no id for vote in %s" % url)
            voteid = "%s-%s" % (ts,tmp)
        title = vote.xpath("RollCallVote.Description.Text")
        if len(title) != 1:
            log(2, "holy ambiguity Batman! This vote doesn't have one title, but %d: %d %s" % (len(title), voteid, url))
            title="!unknown!"
        else:
            title=junws(title[0])
        v={u"ts": ts,
           u"url": url,
           u"voteid": voteid,
           u"title": title,
           'votes':{}}
        v.update(votemeta(v['title'], v['ts']))
        if 'epref' not in v:
            ref = vote.xpath("RollCallVote.Description.Text/a/text()")
            if ref:
                v['epref']=unws(ref[0])
        for type, stype in [('Result.For','+'), ('Result.Against','-'), ('Result.Abstention','0')]:
            type = vote.xpath(type)
            if not type: continue
            if len(type)>1:
                log(2, "[pff] more than one %s entry in vote (id:%d) in %s" % (stype, v['voteid'], url))
            type = type[0]
            v['votes'][stype]={'total': int(type.get('Number')),
                               'groups': {}}
            for group in type.xpath('Result.PoliticalGroup.List'):
                g = str(group.get('Identifier'))
                if not g in v['votes'][stype]['groups']:
                    v['votes'][stype]['groups'][g]=[]
                for tag in ['Member.Name', 'PoliticalGroup.Member.Name']:
                    for mep in group.xpath(tag):
                        m = {}
                        name = junws(mep)
                        mepid = mep.get("PersId")
                        if mepid:
                            mepid = int(mepid)
                        else:
                            mepid = db.getMep(name, v['ts'], abbr=g)
                        if mepid:
                            m['mepid']= mepid
                            #if int(mep.get('MepId')) in ambiguous_meps:
                            #    oid = int(mep.get('MepId'))
                            #    ambiguous_meps.remove(oid)
                            #    log(2,'found mepid for previously ambigous obscure_id: "%s": %s' % (oid, mepid))
                        else:
                            mepid = lost_meps.get(mep.get('MepId'))
                            if mepid:
                                m['mepid']= mepid
                            else:
                                m['name']= name
                                m['obscure_id']=int(mep.get('MepId'))  # it's a totally useless and confusing id that is nowhere else used
                        v['votes'][stype]['groups'][g].append(m)
        # save
        process(v, v['voteid'], db.vote, 'ep_votes', v['title'])
        votes.append(v)
    return votes
Example #16
0
def scrape(id, terms, mepname, **kwargs):
    activity_types = (
        ('plenary-speeches', 'CRE'),
        ('reports', "REPORT"),
        ('reports-shadow', "REPORT-SHADOW"),
        ('opinions', "COMPARL"),
        ('opinions-shadow', "COMPARL-SHADOW"),
        ('motions-instit', "MOTION"),
        ('oral-questions', "OQ"),
        # other activities
        ('written-explanations', 'WEXP'),
        ('major-interpellations', 'MINT'),
        ('written-questions', "WQ"),
        ('motions-indiv', "IMOTION"),
        ('written-declarations', "WDECL"),
    )
    activities = {}
    for type, TYPE in activity_types:
        for term in terms:
            page = 0
            cnt = 20
            url = "http://www.europarl.europa.eu/meps/en/%s/loadmore-activities/%s/%s/?page=%s&count=%s" % (
                id, type, term, page, cnt)
            try:
                root = fetch(url)
            except:
                log(1, "failed to fetch {}".format(url))
                raise ValueError
                #continue
            #print(url, file=sys.stderr)
            while (len(root.xpath('//div[@class="erpl_document"]')) > 0):
                for node in root.xpath('//div[@class="erpl_document"]'):
                    if type == 'written-explanations':
                        item = {
                            'title': unws(''.join(node.xpath(
                                './div/h3/span[@class="t-item"]//text()'))),
                            'date': datetime.strptime(
                                node.xpath('./div[1]/div[1]/span[1]/text()')[0],
                                u"%d-%m-%Y"),
                            'text': unws(''.join(node.xpath('./div[2]/div//text()')))
                        }
                    elif type == 'written-declarations':
                        if len(node.xpath('./div[1]/div')) != 3:
                            log(2, "written decl item has %d divs instead of 3: %s" %
                                (len(node.xpath('./div[1]/div')), url))
                            continue
                        if len(node.xpath('./div[1]/div[1]/span')) != 3:
                            log(2, "written decl item has %d spans in the 1st div instead of 3: %s" %
                                (len(node.xpath('./div[1]/div[1]/span')), url))
                            continue

                        item = {
                            'title': unws(''.join(node.xpath(
                                './div/h3/span[@class="t-item"]//text()'))),
                            'date': datetime.strptime(
                                node.xpath('./div[1]/div[1]/span[1]/text()')[0],
                                u"%d-%m-%Y"),
                            'id': unws(''.join(
                                node.xpath('./div[1]/div[1]/span[2]/text()')[0])),
                            'status': unws(''.join(
                                node.xpath('./div[1]/div[1]/span[3]/text()')[0])),
                            'formats': [{
                                'type': unws(fnode.xpath('./span/text()')[0]),
                                'url': str(fnode.xpath('./@href')[0]),
                                'size': unws(fnode.xpath('./span/span/text()')[0])
                            } for fnode in node.xpath(
                                './div[1]/div[2]/div[@class="d-inline"]/a')],
                            'authors': [{
                                'name': name.strip(),
                                "mepid": db.mepid_by_name(name.strip())
                            } for name in node.xpath('./div[1]/div[3]/span/text()')],
                        }
                        for info in node.xpath('./div[2]/div'):
                            label = unws(''.join(info.xpath('./text()')))[:-2]
                            value = unws(''.join(info.xpath('./span/text()')))
                            if 'date' in label.lower():
                                value = datetime.strptime(value, u"%d-%m-%Y")
                            if label == 'Number of signatories':
                                number, date = value.split(' - ')
                                value = int(number)
                                item["No of sigs date"] = datetime.strptime(
                                    date, u"%d-%m-%Y")
                            item[label] = value
                    else:
                        #from lxml.etree import tostring
                        #print('\n'.join(tostring(e).decode() for e in node.xpath('./div/div[1]')))
                        # all other activities share the following scraper
                        ref = unws(''.join(
                            node.xpath('./div[1]/div[1]/span[2]/text()')))

                        if ref.startswith('- '):
                            ref = ref[2:]
                        if ref.endswith(' -'):
                            ref = ref[:-2]

                        item = {
                            'date': datetime.strptime(
                                node.xpath('./div[1]/div[1]/span[1]/text()')[0],
                                u"%d-%m-%Y"),
                            'reference': ref,
                        }

                        if type not in ['written-questions', 'oral-questions']:
                            ref = unws(''.join(
                                node.xpath('./div[1]/div[1]/span[3]/text()')))
                            if ref:
                                if not pere.match(ref):
                                    log(
                                        2,
                                        "pe, has not expected format: '%s'" %
                                        ref)
                                else:
                                    item['pe'] = ref

                        # opinions don't have title urls... why would they?
                        refurl = node.xpath('./div[1]/h3/a/@href')
                        if refurl: item['url'] = str(refurl[0])

                        item['title'] = unws(''.join(
                            node.xpath(
                                './div/h3//span[@class="t-item"]//text()')))

                        abbr = node.xpath(
                            './div[1]/div[1]/span/span[contains(concat(" ",normalize-space(@class)," ")," erpl_badge-committee ")]/text()'
                        )
                        if len(abbr):
                            item['committee'] = [
                                a for a in [unws(c) for c in abbr] if a
                            ]

                        formats = []
                        for fnode in node.xpath(
                                './div[1]/div[2]/div[@class="d-inline"]/a'):
                            elem = {
                                'type': unws(fnode.xpath('./span/text()')[0]),
                                'url': str(fnode.xpath('./@href')[0])
                            }
                            tmp = fnode.xpath('./span/span/text()')
                            if len(tmp) > 0:
                                elem['size'] = unws(tmp[0])
                            formats.append(elem)
                        if formats:
                            item['formats'] = formats

                        authors = [{
                            'name': name.strip(),
                            "mepid": db.mepid_by_name(name.strip())
                        } for name in node.xpath('./div[1]/div[3]/span/text()')]
                        if authors: item['authors'] = authors

                        if type in ['opinions-shadow', 'opinions']:
                            for f in item['formats']:
                                if f['type'] == 'PDF':
                                    ref = pdf2ref(f['url'])
                                    if ref is not None:
                                        item['dossiers'] = [ref]
                                    break
                        else:
                            # try to deduce dossier from document reference
                            dossiers = db.get('dossiers_by_doc',
                                              item['reference']) or []
                            if len(dossiers) > 0:
                                item['dossiers'] = [
                                    d['procedure']['reference']
                                    for d in dossiers
                                ]
                            elif not '+DOC+PDF+' in item['url']:
                                # try to figure out the associated dossier by making an (expensive) http request to the ep
                                log(
                                    4, "fetching primary activity page %s" %
                                    item['url'])
                                try:
                                    refroot = fetch(item['url'])
                                except:
                                    refroot = None
                                if refroot is not None:
                                    if '/doceo/' in item[
                                            'url']:  # stupid new EP site removed the span with the procedure, bastards.
                                        fulla = refroot.xpath(
                                            '//table[@class="buttondocwin"]//a/img[@src="/doceo/data/img/navi_moredetails.gif"]/..'
                                        )
                                        if fulla:
                                            fullurl = fulla[0].get('href')
                                            if fullurl.endswith('.html'):
                                                if fullurl[-7:-5] != 'EN':
                                                    fullurl = fullurl[:-7] + 'EN.html'
                                                log(
                                                    4,
                                                    'loading activity full text page %s'
                                                    % fullurl)
                                                if fullurl.startswith(
                                                        '/doceo'):
                                                    fullurl = 'https://www.europarl.europa.eu' + fullurl
                                                if fullurl != item['url']:
                                                    refroot = fetch(fullurl)
                                        else:
                                            log(
                                                4, 'no fulla for %s' %
                                                item['url'])
                                    anchor = refroot.xpath(
                                        '//span[@class="contents" and text()="Procedure : " and not(ancestor::div[@style="display:none"])]'
                                    )
                                    if len(anchor) == 1:
                                        dossier = anchor[0].xpath(
                                            "./following-sibling::a/text()")
                                        if len(dossier) == 1:
                                            item['dossiers'] = [
                                                unws(dossier[0])
                                            ]
                                        elif len(dossier) > 1:
                                            log(
                                                2,
                                                "more than one dossier in ep info page: %d %s"
                                                % (len(dossier), item['url']))
                                    elif len(anchor) > 1:
                                        log(
                                            2,
                                            "more than one anchor in ep info page: %d %s"
                                            % (len(anchor), item['url']))

                    item['term'] = term
                    if TYPE not in activities:
                        activities[TYPE] = []
                    activities[TYPE].append(item)
                if len(root.xpath('//div[@class="erpl_document"]')) < cnt:
                    break
                page += 1
                url = "http://www.europarl.europa.eu/meps/en/%s/loadmore-activities/%s/%s/?page=%s&count=%s" % (
                    id, type, term, page, cnt)
                try:
                    root = fetch(url)
                except:
                    log(1, "failed to fetch {}".format(url))
                    #raise ValueError
                    break
                #print(url, file=sys.stderr)
        if TYPE in activities:
            activities[TYPE] = sorted(activities[TYPE],
                                      key=lambda x: x['date'])
    activities['mep_id'] = id
    if len(activities.keys()) > 1:
        process(activities,
                id,
                db.activities,
                'ep_mep_activities',
                mepname,
                nodiff=True)
        return activities
    return {}
Example #17
0
def scrape(url, committee, **kwargs):
    comid = committee
    root = fetch(url)
    lines = [
        x for x in root.xpath('//td[@class="contents"]/div/*')
        if unws(' '.join(x.xpath('.//text()')))
    ]
    lines = [
        x for x in lines if unws(' '.join(x.xpath('.//text()'))) not in
        ['<EPHeader>', '</EPHeader>']
    ]
    if not len(lines): return
    if not unws(' '.join(lines[2].xpath('.//text()'))) in [
            'DRAFT AGENDA', '<TitreType> DRAFT AGENDA </TitreType>'
    ]:
        log(
            3, "not DRAFT AGENDA %s in %s" %
            (unws(' '.join(lines[2].xpath('.//text()'))), url))
    agenda = {
        u'committee': comid,
        u'committee_full': unws(' '.join(lines[0].xpath('.//text()'))),
        u'src': url,
    }
    i = 1
    if unws(' '.join(lines[3].xpath(
            './/text()'))) == "INTERPARLIAMENTARY COMMITTEE MEETING":
        log(2, "skipping interparl com meet")
        return
    if len(lines) >= 7 and unws(' '.join(
            lines[6].xpath('.//text()'))).startswith('Room'):
        agenda.update({
            u'docid':
            unws(' '.join(lines[1].xpath('.//text()'))),
            u'type':
            unws(' '.join(lines[3].xpath('.//text()'))),
            u'time':
            toTime(unws(' '.join(lines[4].xpath('.//text()')))),
            u'city':
            unws(' '.join(lines[5].xpath('.//text()'))),
            u'room':
            unws(' '.join(lines[6].xpath('.//text()')))[6:],
        })
        i = 7
    itemcnt = 0
    item = {}
    schedule = None
    res = []
    while i < len(lines):
        line = lines[i]
        i += 1
        txt = unws(' '.join(line.xpath('.//text()')))
        if txt in ['* * *', '***']:
            continue  # skip end of schedule block

        # 20 December 2011, 16.00 – 16.30
        tmp = toTime(txt)
        if tmp:
            schedule = tmp
            if i < len(lines) and unws(' '.join(
                    lines[i].xpath('.//text()'))) == 'In camera':
                schedule[u'incamera'] = True
                i += 1
            continue

        if line.tag == 'div':
            item[u'actors'] = getactors(line)
            continue
        firsttoken = txt.split()[0]
        # 6. Alternative dispute resolution for consumer disputes and
        #    amending Regulation (EC) No 2006/2004 and Directive
        #    2009/22/EC (Directive on consumer ADR)
        if (firsttoken[-1] == '.' and firsttoken[:-1].isdigit()
                and itemcnt + 1 == int(firsttoken[:-1])):
            if item: res.append(item)
            itemcnt += 1
            item = copy.deepcopy(agenda)
            item.update({
                u'title': ' '.join(txt.split()[1:]),
                u'seq_no': itemcnt,
            })
            if schedule:
                item.update(schedule)
            continue
        # trailing list of "details"
        # · Presentation by the Commission of the proposal & Impact Assessment
        # · Exchange of views
        if firsttoken == u"·":
            if not 'list' in item: item[u'list'] = []
            tmp = ' '.join(txt.split()[1:])
            if tmp.startswith('Deadline for tabling amendments:'):
                try:
                    item[u'tabling_deadline'] = datetime.strptime(
                        tmp.split(':')[1].strip(), "%d %B %Y, %H.%M")
                except ValueError:
                    try:
                        item[u'tabling_deadline'] = datetime.strptime(
                            tmp.split(':')[1].strip(), "%d.%m.%Y at %H.%M")
                    except:
                        log(
                            2, '[$] unknown tabling deadline format %s' %
                            unws(tmp))
            item[u'list'].append(tmp)
            continue
        # committee dossier
        # IMCO/7/08130
        if txt.startswith("%s/7/" % comid) and len(txt) == 12:
            item[u'comdossier'] = txt
            continue
        # ***I    2011/0373(COD)       COM(2011)0793 – C7-0454/2011
        tmp = getdocs(txt)
        if tmp:
            item.update(tmp)
            continue
        # fall-through line
        log(4, "(falltrough) %s %s" % (line.tag, txt.encode('utf8')))
    if item: res.append(item)
    save(res)
    return res
Example #18
0
def parseMember(userid):
    url='http://www.europarl.europa.eu/meps/en/%s/_history.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])

    data = {
        u'active': False,
        u'Photo': unicode(urljoin(BASE_URL,"/mepphoto/%s.jpg" % userid)),
        u'meta': {u'url': url}
        }

    mepdiv=root.xpath('//div[@class="zone_info_mep_transparent_mep_details"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(' '.join(mepdiv.xpath('.//li[@class="mep_name"]//text()'))))

    borntxt=mepdiv.xpath('.//span[@class="more_info"]/text()')
    if len(borntxt)>0:
        if unws(borntxt[-1]).startswith('Date of death:'):
            try:
                data[u'Death'] = datetime.strptime(unws(borntxt[-1]), u"Date of death: %d %B %Y")
            except ValueError:
                logger.warn('[!] failed to scrape date of death %s' % url)
                logger.warn(traceback.format_exc())
            tmp = borntxt[-2].split(',', 1)
        else:
            tmp = borntxt[-1].split(',', 1)
        if len(tmp)==2:
            (d, p) = tmp
        else:
            d,p = tmp[0], None
        try:
            data[u'Birth'] = { u'date': datetime.strptime(unws(d), u"Date of birth: %d %B %Y")}
        except ValueError:
            logger.warn(traceback.format_exc())
        finally:
            if p:
                if 'Birth' in data:
                    data[u'Birth'][u'place'] = unws(p)
                else:
                    data[u'Birth'] = unws(p)
    else:
        logger.warn('[!] no birth data %s' % url)

    # scrape stuff from right column
    addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8')
                       for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_rss"]')])
    addif(data,u'Homepage',[x.get('href')
                            for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_website"]')])
    addif(data,u'Twitter',[x.get('href')
                           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_twitt"]')])
    addif(data,u'Facebook',[x.get('href')
                           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_fb"]')])
    addif(data,u'Mail',[x.get('href')[7:].replace('[dot]','.').replace('[at]','@')[::-1]
                        for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_email"]')])
    # contact information
    for span in root.xpath('//div[@id="content_right"]//h3'):
        title=unws(''.join(span.xpath('.//text()')))
        if title == "Contacts":
            addif(data,u'Addresses',getAddress(span))

    # scrape main content
    for section in root.xpath('//div[@id="content_left"]/div[@class="boxcontent nobackground"]/h4'):
        key=unws(''.join(section.xpath('.//text()')))
        if key=="National parties":
            # constituencies
            key='Constituencies'
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, party = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if not key in data: data[key]=[]
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                cstart = party.rfind(' (')
                if party[cstart+2:-1] in SEIRTNUOC:
                    country = party[cstart+2:-1]
                    party = party[:cstart]
                else:
                    logger.warn('unknown country: %s' % party[cstart+2:-1])
                    country='unknown'
                #print etree.tostring(constlm, pretty_print=True)
                data[key].append({
                    u'party':     party,
                    u'country':   country,
                    u'start':     datetime.strptime(unws(start), u"%d.%m.%Y"),
                    u'end':       datetime.strptime(unws(end), u"%d.%m.%Y"),
                    })
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President', 'Observer', 'Quaestor', 'Substitute observer']:
            # memberships in various committees, delegations and EP mgt
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, org = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                item={u'role': key,
                      u'abbr': COMMITTEE_MAP.get(org),
                      u'Organization': org,
                      u'start':     datetime.strptime(unws(start), u"%d.%m.%Y"),
                      u'end':       datetime.strptime(unws(end), u"%d.%m.%Y"),
                      }
                for start, field in orgmaps:
                    if item['abbr'] in COMMITTEE_MAP or item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        elif key == u'Political groups':
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                interval, org = line.split(' : ',1)
                tmp = org.split(u' - ')
                if len(tmp)>1:
                    org = ' - '.join(tmp[:-1])
                    role = tmp[-1]
                elif org.endswith(' -'):
                    org=org[:-2]
                    role=''
                else:
                    logger.error('[!] political group line %s' % line)
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                if not u'Groups' in data: data[u'Groups']=[]
                data[u'Groups'].append(
                    {u'role':         role,
                     u'Organization': org,
                     u'country':      COUNTRIES.get(unws(constlm.get('class')).upper(), 'unknown country: %s' % unws(constlm.get('class'))),
                     u'groupid':      group_map[org],
                     u'start':        datetime.strptime(unws(start), u"%d.%m.%Y"),
                     u'end':          datetime.strptime(unws(end), u"%d.%m.%Y"),
                     })
        else:
            logger.error('[!] unknown field %s' % key)

    # sort all lists in descending order
    for fld in ['Constituencies', 'Groups', 'Committees', 'Delegations', 'Staff']:
        if not fld in data: continue
        data[fld]=sorted(data[fld],
                         key=lambda x: x.get('end',x['start']),
                         reverse=True)

    # get CV - page (is on separate http path :/)
    cvurl='http://www.europarl.europa.eu/meps/en/%s/_cv.html' % userid
    root = fetch(cvurl, ignore=[500])
    data[u'CV']={}
    for sec in root.xpath('//h3[@class="collapsible"]'):
        section=unws(''.join(sec.xpath('.//text()')))
        data[u'CV'][section]=[]
        for line in sec.xpath('./following-sibling::div[1]//li'):
            data[u'CV'][section].append(unws(''.join(line.xpath('.//text()'))))


    # get assistants also on a separate page :/
    assurl='http://www.europarl.europa.eu/meps/en/%s/_assistants.html' % userid
    root = fetch(assurl, ignore=[500])
    for h3 in root.xpath('//h3[@id="section"]'):
        title=unws(''.join(h3.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower().split()[0],
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
        elif title in ['Accredited assistants (grouping)', 'Local assistants (grouping)',
                       'Service providers', ' Trainees', 'Paying agents (grouping)', 'Paying agents']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower(),
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])

    return data
Example #19
0
def scrape(url, meps=None, **kwargs):
    prolog=True
    res=[]
    block=None
    reference=None
    date=None
    committee=[]
    text, PE=getraw(url)
    motion = False
    for line in text:
        #log(4,'line is: "%s"' % line)
        if prolog:
            line=unws(line)
            if not line: continue

            if amstart.match(line):
                if PE is None:
                    log(1, "document has no PE id: %s" % url)
                if reference==None:
                    log(1,"[!] couldn't find ref: %s" % (unws([x for x in text[:20] if unws(x)][2])))
                    # marking as scraped though
                    if not motion:
                        log(1, "couldn't find dossier reference in source pdf: %s" % url)
                        #raise ValueError("No dossier reference in amendment: %s" % url)
                        return
                    log(3, "couldn't find dossier reference in source pdf, but was marked as motion: %s" % url)
                    return
                if date==None or committee==[]:
                    log(1,"[!] couldn't find date or committee: %s" % url)
                    raise ValueError("No date or committee in amendment")
                block=[line]
                prolog=False
                continue

            if line == 'Draft motion for a resolution': 
                log(4,"document is a draft motion for resolution")
                motion = True

            m = re.search(pere, line)
            if m:
                if PE is None: PE = m.group(0)
                log(4,"found PE reference: %s" % PE)
                line = unws(line.replace(PE,''))
                log(4,'updated line is: "%s"' % line)

            if line in COMMITTEE_MAP:
                log(4,'found committee: "%s"' % line)
                committee.append(COMMITTEE_MAP[line])
                continue

            m = re.search(refre, line)
            if (committee and not reference and m):
                reference=m.group(1)
                log(4,'found reference: "%s"' % reference)
                if url == 'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-506.166%2b03%2bDOC%2bPDF%2bV0%2f%2fEN':
                    log(3, "adjusting reference to eudatap")
                    reference="2012/0011(COD)"
                continue

            if (not date):
                try:
                    date = parse(unws(line), dayfirst=True)
                    log(4,'found date: "%s"' % line)
                except ValueError:
                    pass
                except TypeError:
                    pass
            continue

        if amstart.match(line):
            # parse block
            am=parse_block(block, url, reference, date, committee, meps, PE)
            if am is not None:
                process(am, am['id'], db.amendment, 'ep_amendments', am['reference']+' '+am['id'], nodiff=True)
                res.append(am)
            block=[line]
            continue
        block.append(line)
    if block and any(block):
        am = parse_block(block, url, reference, date, committee, meps, PE)
        if am is not None:
            process(am, am['id'], db.amendment, 'ep_amendments', am['reference']+' '+am['id'], nodiff=True)
            res.append(am)
    log(3,"total amendments %d in %s" % (len(res),url))
    return res
Example #20
0
def parse_block(block, url, reference, date, committee, rapporteur, PE):
    am={u'src': url,
        u'peid': PE,
        u'reference': reference,
        u'date': date,
        u'committee': committee}

    #logger.info(block)
    # get title
    try:
        am[u'seq']=int(unws(block[0]).split()[1])
    except ValueError:
        am[u'seq']=unws(block[0]).split()[1]
    except IndexError:
        log(2,"wrong seq %s" % (block[0]))
        am[u'seq']=unws(block[0])
    del block[0]

    pefix = PE.split('v')[0] # we strip off the v0[0-9]-[0-9]{1,2} part of the PEID
    am['id']="%s-%s" % (pefix,am['seq'])

    strip(block)

    # find and strip justification
    i=len(block)-1
    while i>2 and not (unws(block[i])=="Justification" and block[i].startswith(' ' * 6)):
        i-=1
    if i>2:
        if i<len(block)-1 and (not unws(block[i+1]) or not block[i+1].startswith(' ') ):
            am['justification']='\n'.join(block[i+2:])
            del block[i:]
            strip(block)
        else:
            log(2, 'wrong justification in %s: "%s"' % (am['seq'], '\\n'.join(block[i:])))

    # get original language
    if 4<len(unws(block[-1]))<=6 and unws(block[-1]).startswith('Or.'):
        am['orig_lang']=unws(block[-1])[4:]
        del block[-1]
        strip(block)

    # find split column new/old heading
    i=len(block)-1
    while (i>2 and
           not ((block[i].endswith("     Amendment") or
                 block[i].endswith("     PARTICULARS") or
                 block[i].endswith("     Remedy") or
                 block[i].endswith("     Amended text") or
                 block[i].endswith("     Amendement") or
                 block[i].endswith("           Amendments by Parliament") or
                 block[i].endswith("           Proposal for rejection") or
                 block[i].endswith("           Proposal for a rejection") or
                 block[i].endswith("           Does not affect English version") or
                 block[i].endswith("           (Does not affect English version)") or
                 block[i].endswith("      Amendment by Parliament")) and
                len(block[i])>33) and
           not (unws(block[i])=='Text proposed by the Commission' or
                unws(block[i]) in types)):
        i-=1
    if i>2:
        #if block[i].endswith("               Proposal for rejection"):
        #    pass # location will be possibly '-'
        seq=False
        if unws(block[i]) in ["Amendment", "Amendment by Parliament"]:
            # sequential format, search for preceding original text
            j=i
            while (j>2 and not (unws(block[j]) in types or unws(block[j])=='Text proposed by the Commission')):
                j-=1
            if j>2: i=j
            seq=True; key='old'
        elif unws(block[i])=='Text proposed by the Commission' or block[i].strip() in types:
            seq=True; key='old'
        # throw headers
        del block[i]
        while i<len(block) and not unws(block[i]): del block[i]        # skip blank lines
        mid=max([len(x) for x in block])//2
        while i<len(block):
            if seq:
                if unws(block[i]) in ["Amendment", "Amendment by Parliament", "Text Amended"]:
                    key='new'
                    del block[i]
                    continue
                try: am[key].append(block[i])
                except KeyError: am[key]=[block[i]]
                del block[i]
                continue
            # only new, old is empty
            if block[i].startswith('         '):
                try: am['new'].append(unws(block[i]))
                except KeyError: am['new']=[unws(block[i])]
                del block[i]
                continue
            newstart = block[i].rstrip().rfind('  ')
            # only old, new is empty
            if newstart < 6:
                try: am['old'].append(unws(block[i]))
                except KeyError: am['old']=[unws(block[i])]
                del block[i]
                continue
            #mid=len(block[i])/2
            #mid=40
            lsep=block[i].rfind('  ', 0, mid)
            rsep=block[i].find('  ', mid)
            sep=None
            if abs(lsep-mid)<abs(rsep-mid):
                if abs(lsep-mid)<15:
                    sep=lsep
            else:
                if abs(rsep-mid)<15:
                    sep=rsep
            if sep:
                try: am['old'].append(unws(block[i][:sep]))
                except KeyError: am['old']=[unws(block[i][:sep])]
                try: am['new'].append(unws(block[i][sep:]))
                except KeyError: am['new']=[unws(block[i][sep:])]
            else:
                # no sane split found
                #logger.warn("no split: %s %s\n%s" % (datetime.now().isoformat(),
                #                                     (sep, mid, len(block[i]), newstart, block[i]),
                #                                     block[i][mid-1:mid+2]))
                # fallback to naive splitting
                try: am['old'].append(unws(block[i][:newstart]))
                except KeyError: am['old']=[unws(block[i][:newstart])]
                try: am['new'].append(unws(block[i][newstart:]))
                except KeyError: am['new']=[unws(block[i][newstart:])]
            del block[i]
        strip(block)
    else:
        if not 'Does not affect English version.' in block[i:]:
            log(2, "no table\n%s" % ('\n'.join(block[i:])))
            return None
            #am['content']=block[i:]
            #return am

    i=0
    # find end of authors
    while (i<len(block) and
           unws(block[i]) and
           not unws(block[i]).lower().startswith('compromise') and
           not istype(block[i]) and
           not unws(block[i]).split()[0] in locstarts): i+=1
    if i<len(block):
        if i>0:
            names=' '.join(block[:i])
            am['authors']=names
            #logger.info("names \n%s" % names)

            # convert to pt mep _ids
            for text in filter(None,splitNames(names)):
                mepid=db.getMep(text,date)
                if mepid:
                    try: am['meps'].append(mepid)
                    except KeyError: am['meps']=[mepid]
                else:
                    log(3, "fix %s" % text)
            del block[:i]
            strip(block)
        elif rapporteur:
            am['authors']=rapporteur
            if isinstance(rapporteur,list):
                for text in rapporteur:
                    mepid=db.getMep(text,date)
                    if mepid:
                        try: am['meps'].append(mepid)
                        except KeyError: am['meps']=[mepid]
                    else:
                        log(3, "fix %s" % text)
            else:
                for text in filter(None,splitNames(rapporteur)):
                    mepid=db.getMep(text,date)
                    if mepid:
                        try: am['meps'].append(mepid)
                        except KeyError: am['meps']=[mepid]
                    else:
                        log(3, "fix %s" % text)
        else:
            log(3, "no authors in Amendment %s %s" % (am['seq'], url))
    else:
        log(2, "no boundaries in Amendment %s %s\n%s" % (am['seq'], url,
                                                      '\n'.join(block)))
        am['rest']=block
        return am

    # handle compromise info
    i=0
    while (i<len(block) and
           unws(block[i]) and
           not istype(block[i]) and
           not unws(block[i]).split()[0] in locstarts): i+=1
    if i<len(block) and i>0:
        if [unws(x) for x in block[:i]]!=["Draft proposal for a recommendation"]:
            am['compromise']=block[:i]
        del block[:i]
        strip(block)

    i=0
    while (i<len(block) and unws(block[i])):
        if unws(block[i]).split()[0] in locstarts:
            try: am['location'].append((' '.join(block[:i]),unws(block[i])))
            except KeyError: am['location']=[(' '.join(block[:i]),unws(block[i]))]
            del block[:i+1]
            i=0
        else:
            i+=1
    if len(block)>0 and ((len(block)==1 or
                          not unws(block[1])) and
                         unws(block[0])!='1' and
                         'location' in am):
        am['location'][-1]=(am['location'][-1][0],"%s %s" % (am['location'][-1][1],block[0]))
        del block[0]
        strip(block)

    if block:
        if not ((len(block)==3 and
                unws(block[0])=='1' and
                not unws(block[1]) and
                block[2].startswith("  ")) or
                (len(block)==2 and
                unws(block[0])=='1' and
                block[1].startswith("  "))):
            # ignore obvious footnotes
            log(3, "rest in Amendment %s\n%s" % (am['seq'],'\n'.join(block)))
    return am
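# Hedged illustration (not part of the scraper): the two-column handling above looks for
# the double-space gap closest to the middle of the widest line and puts the left part
# into 'old' and the right part into 'new'. The helper and the sample line below are made
# up purely to show that heuristic in isolation.
def _split_two_columns(line, mid):
    lsep = line.rfind('  ', 0, mid)
    rsep = line.find('  ', mid)
    sep = lsep if abs(lsep - mid) < abs(rsep - mid) else rsep
    if sep < 0 or abs(sep - mid) >= 15:
        return None   # no sane split found
    return unws(line[:sep]), unws(line[sep:])

# _split_two_columns("The Commission shall adopt      The Commission may adopt", 30)
# -> ('The Commission shall adopt', 'The Commission may adopt')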
Example #21
0
def strip(block):
    while len(block) and not unws(block[0]):
        del block[0]
    while len(block) and not unws(block[-1]):
        del block[-1]
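# Hedged usage sketch of strip(): it trims leading and trailing whitespace-only lines
# in place, which the amendment parser relies on after each chunk it consumes.
#block = ['', '   ', 'Amendment 1', 'Recital 5', '']
#strip(block)
#block == ['Amendment 1', 'Recital 5']   # -> True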
Example #22
0
def getactors(node):
    res = {}
    ax = [None, []]
    for row in node.xpath('.//tr'):
        cells = row.xpath('./td/p')
        if not cells: continue

        # get role Rapporteur|Responsible|Rapporteur for the opinion|Opinions
        role = cells[0].xpath('text()')
        if role and unws(role[0]):
            #print(ax[1])
            if ax[0] and ax[1]:
                res[ax[0]] = sorted(ax[1], key=lambda x: x.get('name', ''))
            tmp = unws(role[0])[:-1]
            if tmp == "Rapporteur for the opinion":
                tmp = "Rapporteur"
            ax = [tmp, []]

        tmp = unws((cells[1].xpath('text()') or [''])[0])
        if ax[0] in ["Rapporteur", "Rapporteur for the opinion"] and tmp:
            name = ' '.join(tmp.split()[:-1])
            item = {
                u'group': tmp.split()[-1][1:-1],
                u'name': name,
                u'mepref': getMEPRef(name)
            }
            if len(cells) > 2:
                item[u'docs'] = getdoclist(cells[2])
            ax[1].append(item)
            continue
        if ax[0] in ["Opinions", "Responsible"] and tmp:
            tmp1 = tmp.split(u' –', 1)
            if len(tmp1) == 2:
                (comid, rest) = tmp1
            elif len(tmp1) == 1:
                if len(tmp1[0]) == 4 and tmp1[0].isupper():
                    (comid, rest) = (tmp1[0], '')
                elif len(tmp1[0]) > 4 and tmp1[0][4] in [
                        '-', u'–', u':', u'*'
                ] and tmp1[0][:4].isupper():
                    (comid, rest) = (tmp1[0][:4], tmp1[0][5:])
                else:
                    skip = False
                    for com in tmp.split(', '):
                        if com in COMMITTEE_MAP and len(com) == 4:
                            ax[1].append({u'comid': com})
                            skip = True
                    if skip:
                        continue
                    # nothing matched; fall through so the unknown committee is logged below
                    (comid, rest) = ('', '')
            else:
                log(2, "[!] unknown committee: %s" % tmp)
                raise ValueError("unknown committee: %s" % tmp)
            if not comid:
                log(2, "[!] unknown committee: %s" % tmp)
            item = {u'comid': comid}
            if rest == ' Decision: no opinion':
                item[u'response'] = u'Decision: no opinion'
            if not rest and len(comid) > 4:
                for com in comid.split(', '):
                    ax[1].append({u'comid': com})
                continue
            if len(cells) > 2:
                tmp = unws((cells[2].xpath('text()') or [None])[0])
                if tmp:
                    name = ' '.join(tmp.split()[:-1])
                    item.update({
                        u'group': tmp.split()[-1][1:-1],
                        u'name': name,
                        u'mepref': getMEPRef(name)
                    })
                    if len(cells) > 3:
                        item[u'docs'] = getdoclist(cells[3])
            ax[1].append(item)
    if ax[0] and ax[1]:
        #print(ax[0], ax[1])
        #res[ax[0]]=ax[1]
        res[ax[0]] = sorted(ax[1], key=lambda x: x.get('name', ''))
    return res
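# Hedged usage sketch: getactors() expects an lxml node containing the "key players"
# table of a procedure page. The URL pattern, the '//table' selector and the dossier
# reference are assumptions made only for this illustration.
#root = fetch('https://oeil.secure.europarl.europa.eu/oeil/popups/ficheprocedure.do?reference=2016/0279(COD)&l=en')
#for role, people in getactors(root.xpath('//table')[0]).items():
#    print(role, [p.get('name', p.get('comid')) for p in people])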
Example #23
0
def scrape(id, **kwargs):
    # we ignore the /meps/en/<id>/<name>/home path, since we can get all info also from other pages
    url = "http://www.europarl.europa.eu/meps/en/%s/name/cv" % id
    xml = fetch_raw(url)  # we have to patch up the returned html...
    xml = xml.replace("</br>", "<br/>")  # ...it contains some bad tags..
    root = fromstring(
        xml
    )  # ...which make the lxml soup parser drop some branches in the DOM
    sidebar_check(root, url)

    mep = {
        'UserID':
        id,
        'Name':
        mangleName(
            unws(' '.join(
                root.xpath('//span[@class="sln-member-name"]/text()'))), id),
        'Photo':
        "https://www.europarl.europa.eu/mepphoto/%s.jpg" % id,
        'meta': {
            'url': url
        },
        'Twitter': [
            unws(x.replace("http:// ", "")) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="Twitter"]/@href'
            )
        ],
        'Homepage': [
            unws(x.replace("http:// ", "")) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="Website"]/@href'
            )
        ],
        'Facebook': [
            unws(x.replace("http:// ", "")) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="Facebook"]/@href'
            )
        ],
        'Instagram': [
            unws(x.replace("http:// ", "")) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="Instagram"]/@href'
            )
        ],
        'Mail': [
            deobfus_mail(x) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="E-mail"]/@href'
            )
        ],
        'Addresses':
        parse_addr(root),
        'active':
        False,
    }

    mep = addchangednames(mep)

    birthdate = root.xpath('//time[@id="birthDate"]/text()')
    if len(birthdate) > 0:
        mep['Birth'] = {
            'date': datetime.strptime(unws(birthdate[0]), u"%d-%m-%Y")
        }
        place = root.xpath('//time[@id="birthDate"]/following-sibling::text()')
        if len(place) > 0:
            tmp = unws(' '.join(place))
            if tmp.startswith(", "): tmp = tmp[2:]
            mep['Birth']['place'] = tmp

    death = root.xpath('//time[@id="deathDate"]/text()')
    if death:
        mep['Death'] = datetime.strptime(unws(death[0]), u"%d-%m-%Y")

    body = root.xpath(
        '//span[@id="detailedcardmep"]/following-sibling::section')[0]

    if body.xpath('.//h1[text()="Curriculum vitae "]'):
        if not body.xpath('.//h3[@id="no_cv_available"]'):
            mep['CV'] = {
                'updated':
                datetime.strptime(
                    unws(
                        body.xpath(
                            './/p[@class="small"]/strong[contains(text(),"Updated: ")]/text()'
                        )[0]), u"Updated: %d/%m/%Y")
            }
            mep['CV'].update({
                unws(''.join(title.xpath(".//text()"))): [
                    unws(''.join(item.xpath(".//text()"))).replace(
                        "-...", "- ...")
                    for item in title.xpath("following-sibling::ul/li")
                ]
                for title in body.xpath('.//h4')
                #if not unws(''.join(title.xpath(".//text()"))).startswith("Original version : ")
            })

    # assistants
    url = "http://www.europarl.europa.eu/meps/en/%s/name/assistants" % id
    root = fetch(url)
    body = root.xpath(
        '//span[@id="detailedcardmep"]/following-sibling::section')[0]
    if unws(' '.join(body.xpath(".//h1/text()"))) == "Assistants":
        for h4 in body.xpath('.//h4'):
            title = unws(''.join(h4.xpath(".//text()")))
            assistants = [
                unws(''.join(item.xpath(".//text()")))
                for item in h4.xpath("../div//span")
            ]
            if title in ['Accredited assistants', 'Local assistants']:
                if not 'assistants' in mep: mep['assistants'] = {}
                title = title.lower().split()[0]
                if assistants: mep['assistants'][title] = assistants
            elif title in [
                    'Accredited assistants (grouping)',
                    'Local assistants (grouping)', 'Service providers',
                    'Trainees', 'Paying agents (grouping)', 'Paying agents',
                    'Assistants to the Vice-Presidency/to the Quaestorate'
            ]:
                if not 'assistants' in mep: mep['assistants'] = {}
                title = title.lower()
                if assistants: mep['assistants'][title] = assistants
            else:
                log(2,
                    'unknown title for assistants "{}" {}'.format(title, url))
                raise ValueError

    # declarations
    root = fetch("http://www.europarl.europa.eu/meps/en/%s/name/declarations" %
                 id)
    body = root.xpath(
        '//span[@id="detailedcardmep"]/following-sibling::section')[0]
    if unws(' '.join(body.xpath(".//h1/text()"))) == "Declarations":
        for title in body.xpath('.//h4'):
            key = unws(''.join(title.xpath('.//text()')))
            if key == 'Declaration of financial interests':
                key = 'Financial Declarations'
                mep[key] = []
                for pdf in title.xpath('./following-sibling::ul/li/a'):
                    url = pdf.xpath('./@href')[0]
                    try:
                        mep[key].append(findecl.scrape(url))
                    except:
                        log(1, "failed to extract findecl from %s" % url)
            elif key == 'Declarations of participation by Members in events organised by third parties':
                key = 'Declarations of Participation'
                mep[key] = []
                for pdf in title.xpath(
                        './following-sibling::ul/li/a'
                )[::
                  -1]:  # reversed order, otherwise newer ones get prepended and mess up the diff
                    url = pdf.xpath('./@href')[0]
                    name = unws(''.join(pdf.xpath('.//text()')))
                    mep[key].append({'title': name, 'url': url})
            elif key in [
                    'Declaration of good conduct',
                    'Voluntary confirmation on the use of the General Expenditure Allowance'
            ]:
                mep[key] = []
                for pdf in title.xpath(
                        './following-sibling::ul/li/a'
                )[::
                  -1]:  # reversed order, otherwise newer ones get prepended and mess up the diff
                    url = pdf.xpath('./@href')[0]
                    name = unws(''.join(pdf.xpath('.//text()')))
                    mep[key].append({'title': name, 'url': url})
            else:
                log(
                    2,
                    'unknown type of declaration: "%s" http://www.europarl.europa.eu/meps/en/%s/name/declarations'
                    % (key, id))
                key = None
                raise ValueError

    # history
    parse_history(id, root, mep)
    process(mep,
            id,
            db.mep,
            'ep_meps',
            mep['Name']['full'],
            nopreserve=(['Addresses'], ['assistants']),
            onchanged=onchanged)

    if __name__ == '__main__':
        return mep
    del mep
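# Hedged usage sketch: scrape() takes a numeric MEP id; the extra keyword arguments are
# presumably supplied by the job queue that schedules these scrapes (cf. add_job
# elsewhere). The id below is only a placeholder.
#if __name__ == '__main__':
#    scrape(12345)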
Example #24
0
def parse_history(id, root, mep):
    for term in root.xpath(
            '//div[@id="sectionsNavPositionInitial"]//div[@class="erpl_side-navigation"]/div/ul/li//span[text()="History of parliamentary service"]/../following-sibling::div//ul/li//a/span[@class="t-x"]/text()'
    ):
        if not term.endswith("parliamentary term"):
            log(
                2,
                'history menu item does not end as expected with "parliamentary term": %s http://www.europarl.europa.eu/meps/en/%s/name/declarations'
                % (term, id))
            raise ValueError
            #continue
        term = int(''.join(filter(str.isdigit, term.split()[0])))  # "9th parliamentary term" -> 9, also works for two-digit terms
        if (id, term) in {(124870, 9), (129141, 9)}:
            continue  # Jeppe Kofod and Frans Timmermans never really got started.
        root = fetch(
            "http://www.europarl.europa.eu/meps/en/%s/name/history/%s" %
            (id, term))
        body = root.xpath('//div[@id="status"]')[0]
        for title in body.xpath('.//h4'):
            key = unws(''.join(title.xpath('.//text()')))
            if key in [None, '']:
                log(
                    2,
                    "empty history section http://www.europarl.europa.eu/meps/en/%s/name/history/%s"
                    % (id, term))
                raise ValueError
                #continue
            #mep[key] = []
            for item in title.xpath('./following-sibling::ul/li'):
                interval = unws(''.join(item.xpath('./strong/text()')))
                post = item.xpath('./strong/following-sibling::text()')[0][3:]
                if key in ["National parties", "Constituencies"]:
                    key = 'Constituencies'
                    # parse date interval
                    try:
                        start, end = parse_hist_date(interval)
                    except:
                        log(
                            1,
                            "illegal date interval: %s http://www.europarl.europa.eu/meps/en/%s/name/history/%s"
                            % (interval, id, term))
                        raise ValueError
                        #continue
                    # parse party and country
                    cstart = post.rfind(' (')
                    if post[cstart + 2:-1] in SEIRTNUOC:
                        country = post[cstart + 2:-1]
                        party = post[:cstart]
                    else:
                        log(
                            2, '%s unknown country: %s' %
                            (id, post[cstart + 2:-1]))
                        raise ValueError
                        #party = 'unknown'
                        #country = 'unknown'
                    if not key in mep: mep[key] = []
                    mep[key].append({
                        u'party': party,
                        u'country': country,
                        u'start': start,
                        u'end': end,
                        'term': term
                    })
                    if end == datetime.strptime("31.12.9999", u"%d.%m.%Y"):
                        mep['active'] = True
                elif key in [
                        'Member', 'Substitute', 'Chair', 'Vice-Chair',
                        'Co-President', 'President', 'Vice-President',
                        'Observer', 'Quaestor', 'Substitute observer'
                ]:
                    # memberships in various committees, delegations and EP mgt
                    try:
                        start, end = parse_hist_date(interval)
                    except:
                        log(
                            2,
                            "illegal date interval: %s http://www.europarl.europa.eu/meps/en/%s/name/history/%s"
                            % (interval, id, term))
                        raise ValueError
                        #continue
                    item = {
                        u'role': key,
                        u'Organization': unws(post),
                        u'start': start,
                        u'end': end,
                        u'term': term,
                    }
                    for prefix, field in ORGMAPS:
                        if item['Organization'].startswith(prefix):
                            if field == 'Committees':
                                if item['Organization'] in COMMITTEE_MAP:
                                    item[u'abbr'] = COMMITTEE_MAP[
                                        item['Organization']]
                                else:
                                    log(
                                        5, "no abbr found for committee: %s" %
                                        item['Organization'])
                            if field == 'Delegations':
                                if item['Organization'] in DELEGATIONS:
                                    item[u'abbr'] = DELEGATIONS[
                                        item['Organization']]
                                else:
                                    log(
                                        5, "no abbr found for delegation: %s" %
                                        item['Organization'])
                            if not field in mep: mep[field] = []
                            mep[field].append(item)
                            break
                elif key == u'Political groups':
                    try:
                        start, end = parse_hist_date(interval)
                    except:
                        log(
                            1,
                            "illegal date interval: %s http://www.europarl.europa.eu/meps/en/%s/name/history/%s"
                            % (interval, id, term))
                        raise ValueError
                        #continue
                    tmp = post.split(u' - ')
                    if len(tmp) > 1:
                        org = ' - '.join(tmp[:-1])
                        role = tmp[-1]
                    elif post.endswith(' -'):
                        org = post[:-2]
                        role = ''
                    elif post in ['Non-attached Members', 'Non-attached']:
                        org = post
                        role = 'Member'
                    else:
                        log(
                            2,
                            '[!] political group line "%s", http://www.europarl.europa.eu/meps/en/%s/name/history/%s'
                            % (post, id, term))
                        raise ValueError
                        #continue
                    if not u'Groups' in mep: mep[u'Groups'] = []
                    if not org in GROUP_MAP:
                        log(5, "no groupid found for group: %s" % org)
                    mep[u'Groups'].append({
                        u'role': role,
                        u'Organization': org,
                        # u'country':      country, # this value is missing from the latest EP website
                        u'groupid': GROUP_MAP.get(org, org),
                        u'start': start,
                        u'end': end,
                    })
                else:
                    log(
                        2,
                        '[!] unknown field "%s" http://www.europarl.europa.eu/meps/en/%s/name/history/%s'
                        % (key, id, term))
                    raise ValueError

    # reorder historical lists in ascending order, so new entries are appended and don't mess up the diffs
    for k in ('Constituencies', 'Groups', 'Committees', 'Delegations',
              'Staff'):
        if not k in mep: continue
        mep[k] = [
            e
            for e in sorted(mep[k],
                            key=lambda x: (x['start'], x[
                                'end'], x.get('Organization', x.get('party'))))
        ]
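# Hedged sketch of the parse_hist_date() helper used above; the real implementation is
# not shown here. It assumes the history pages format intervals as "02-07-2019 / 15-07-2024"
# for closed ranges and "02-07-2019 ..." for ongoing ones, where the open end becomes the
# 31.12.9999 sentinel that parse_history() uses to flag a MEP as still active.
#def parse_hist_date(interval):
#    tmp = interval.split(' / ')
#    if len(tmp) == 2:
#        return (datetime.strptime(unws(tmp[0]), u"%d-%m-%Y"),
#                datetime.strptime(unws(tmp[1]), u"%d-%m-%Y"))
#    if len(tmp) == 1:
#        start = unws(tmp[0].split(' ...')[0])
#        return (datetime.strptime(start, u"%d-%m-%Y"),
#                datetime.strptime(u"31.12.9999", u"%d.%m.%Y"))
#    raise ValueError("cannot parse date interval: %s" % interval)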
Example #25
0
def scrape_epagents(table):
    heading=''.join(table.xpath('.//td[@class="players_committee"]')[0].xpath(".//text()")).strip()
    responsible=None
    if heading in [ "Committee responsible", "Former committee responsible"]:
        responsible=True
    elif heading in ["Committee for opinion", "Former committee for opinion"]:
        responsible=False
    else:
        logger.warn(u"[!] unknown committee heading %s" % heading)

    # handle shadows
    shadowelems=table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div/p//span[@class="players_rapporter_text"]/a')
    tips=[t.xpath('text()')[0]
          if len(t.xpath('text()'))>0
          else
              groupurlmap[t.xpath("a")[0].get('href')]
              if len(t.xpath("a"))>0
              else groupurlmap[t.xpath("img")[0].get('src')]
          for t in table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div//span[@class="tiptip"]')]
    shadows={}
    for shadow, group in izip_longest(shadowelems, tips):
        committee=shadow.xpath('./ancestor::td/preceding-sibling::td//acronym/text()')[0]
        if not committee in shadows: shadows[committee]=[]
        if group=='NI': group=u'NI'
        mep={u'name': unicode(shadow.xpath('text()')[0]),
             u'group': unicode(group)}
        tmp=getMEPRef(shadow.xpath('text()')[0])
        if tmp:
           mep[u'mepref']=tmp
        #else:
        #    raise IndexError
        shadows[committee].append(mep)
    # delete the unnecessary shadow elements - so the following regular lst2obj gets what it expects
    for todel in table.xpath('//a[@id="shadowRapporteurHeader"]/..'):
        parent=todel.xpath('..')[0]
        parent.remove(todel.xpath('following-sibling::div')[0])
        parent.remove(todel)

    # handle each row of agents
    agents=[]
    for agent in lst2obj(table,epagents,1):
        agent[u'responsible']=responsible
        agent[u'body']=u'EP'
        if agent.get('rapporteur'):
            meps=[]
            for mep in agent['rapporteur']:
                if unws(mep['name']).startswith("The committee decided not to give an opinion"):
                    # don't mutate agent['rapporteur'] while iterating over it;
                    # it is replaced by meps below anyway
                    agent[u'opinion']=None
                    continue
                tmp=getMEPRef(mep['name'])
                if tmp:
                    meps.append({u'mepref': tmp,
                                 u'group': mep['group'],
                                 u'name': mep['name']})
                else:
                    meps.append({u'group': mep['group'],
                                 u'name': mep['name']})
            agent[u'rapporteur']=meps

        abbr=agent['committee'][:4]
        if abbr=='BUDE': abbr='BUDG'
        if abbr not in COMMITTEE_MAP:
            logger.warn(u"[!] unknown committee abbrev %s" % abbr)
            agent[u'committee_full']=agent['committee']
            if len(agent['committee'])>4 and agent['committee'][4]==' ' and abbr.isalpha():
                agent[u'committee']=abbr
        else:
            agent[u'committee_full']=agent['committee'][5:]
            agent[u'committee']=abbr

        if agent.get(u'committee') in shadows.keys():
            agent[u'shadows']=shadows[agent['committee']]

        if not agent in agents: agents.append(agent)
    return agents
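# Hedged usage sketch: a small wrapper over scrape_epagents() that collects the
# abbreviations of all responsible committees from a "key players" table node.
# Purely illustrative, not used by the scraper itself.
def _responsible_committees(table):
    return sorted({a['committee'] for a in scrape_epagents(table)
                   if a.get('responsible')})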
Example #26
0
def unpaginate(text, url):
    lines = text.split('\n')
    # find end of 1st page
    eo1p = 0
    PE = None
    while eo1p<len(lines) and not lines[eo1p].startswith('\x0c'):
        eo1p+=1
    if eo1p == len(lines):
        log(1, "could not find end of 1st page in %s" % url)
        raise ValueError("eo1p not found: %s" % url)

    i = len(lines) - 1
    while i>=0:
        if not lines[i].startswith('\x0c'):
            i -= 1
            continue

        # we found a line starting with pagebreak
        lines[i]=lines[i][1:]
        i -= 1
        fstart = i

        # skip empty lines before pagebreak
        while i>=0 and unws(lines[i])=='':
            i-=1

        # we expect i>0 and lines[i] == 'EN' (or variations)
        if i<=0:
            log(1, "could not find non-empty line above pagebreak in %s" % url)
            raise ValueError("no EN marker found: %s" % url)

        tmp = unws(lines[i])
        if tmp not in ["EN", "EN EN", "EN United in diversity EN",
                       "EN Unity in diversity EN",
                       "EN Unie dans la diversité EN",
                       "EN In Vielfalt geeint EN",
                       "ENEN United in diversity EN",
                       "XM United in diversity XM",
                       "XT United in diversity EN",
                       "XM", "XM XM", "XT", "XT XT"]:
            if tmp in ["FR",'NL','HU']:
                log(2,'Document has non-english language marker: "%s" %s' % (tmp, url))
                return [], None
            if tmp=="Or. en":
                # no footer in this page
                continue
            if tmp in ['AM_Com_NonLegCompr', 'AM_Com_NonLegReport','AM_Com_NonLegOpinion']:
                # no footer on this page (and probably neither on the previous one which should be the first)
                continue
            # an exceptional document
            if (url=='http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-//EP//NONSGML+COMPARL+PE-532.324+01+DOC+PDF+V0//EN&language=EN' and
                tmp in ["Paragraph 8", "Pervenche Berès, Frédéric Daerden"]):
                continue
            if (url in  ['http://www.europarl.europa.eu/doceo/document/CJ25-AM-593898_EN.pdf',
                         'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-//EP//NONSGML+COMPARL+PE-593.898+01+DOC+PDF+V0//EN&language=EN'] and
                tmp=="(2016/2204(INI))"):
                continue
            if (url=='http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-//EP//NONSGML+COMPARL+PE-594.137+01+DOC+PDF+V0//EN&language=EN' and
                tmp=='(2016/2018(INI))'):
                continue
            if isfooter(tmp):
                if PE is None: # try to figure out PE id
                    m = pere.match(tmp)
                    if m: PE = m.group(0)
                log(3, 'no EN marker found, but footer: "%s"' % tmp)
                i+=1 # neutralize the decrement after this block
            else:
                log(1, 'could not find EN marker above pagebreak: %d %d "%s"' % (i, eo1p, tmp))
                raise ValueError('no EN marker found "%s" in %s' % (tmp,url))

        if lines[i].startswith('\x0c'): # we found a ^LEN^L
            # we found an empty page.
            while fstart > i:
                del lines[fstart]
                fstart -= 1
            lines[i]="\x0c"
            continue

        i -= 1

        # find the next non-empty line above the EN marker
        while i>0 and unws(lines[i])=='':
            i-=1
        if i<=0:
            log(1, "could not find non-empty line above EN marker: %s" % url)
            raise ValueError("no next line above EN marker found: %s" % url)

        if (not isfooter(lines[i])):
            tmp = unws(lines[i])
            if tmp=="Or. en":
                i+=1 # preserve this line - and cut off the rest
            elif tmp not in ['AM_Com_NonLegCompr', 'AM_Com_NonLegReport','AM_Com_NonLegOpinion']:
                log(1,'not a footer: "%s" line: %d in %s' % (repr(lines[i]),i,url))
                raise ValueError('not a footer: "%s" line: %d in %s' % (lines[i],i,url))
        elif PE is None: # try to figure out PE id
            m = pere.match(unws(lines[i]))
            if m: PE = m.group(0)

        if lines[i].startswith('\x0c'):
            # we found an empty page with only the footer
            lines[i]='\x0c'
            i+=1
        #else: # is a regular page
        #    i -= 1
        #    if unws(lines[i])!='':
        #        for j in range(-10,10):
        #            log(1, '"%s"' % (unws(lines[i+j])))
        #        log(1, 'line above footer is not an empty line: "%s"' % (unws(lines[i])))
        #        raise ValueError("no empty line above footer")

        # delete all lines between fstart and i
        while fstart >= i:
            del lines[fstart]
            fstart -= 1
    return lines, PE
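# Hedged usage sketch: unpaginate() expects the layout-preserving text of an amendments
# PDF, in which pdftotext marks page breaks with \x0c, and returns the lines with the
# per-page footers stripped plus the PE reference found in a footer, if any. How the text
# is produced upstream is an assumption here; the URL is one already cited above, and the
# local filename assumes the PDF has been fetched beforehand.
#import subprocess
#url = 'http://www.europarl.europa.eu/doceo/document/CJ25-AM-593898_EN.pdf'
#text = subprocess.run(['pdftotext', '-layout', 'CJ25-AM-593898_EN.pdf', '-'],
#                      capture_output=True, text=True).stdout
#lines, pe_id = unpaginate(text, url)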