Example #1
def extract_table(table, url, date=None):
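    """Parse one +/-/0 section of a vote-result HTML table.

    The header row carries the total and the vote type ("+", "-" or "0");
    the remaining rows list MEP names, which are resolved to MEP ids via
    db.getMep() where possible. Returns {'total', 'type', 'meps'}.
    """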
    trs = table.xpath('.//tr')
    header = trs[0]
    tds = header.xpath('.//td')
    if len(tds) < 2:
        log(
            1, "vote table has less than two columns in the header: %s %s" %
            (url, tostring(trs[0])))
        raise ValueError
    vote_type = junws(tds[1])
    if vote_type not in {"+", "-", "0"}:
        log(
            1, "vote header type is unexpected value %s in %s" %
            (repr(vote_type), url))
        raise ValueError
    res = {'total': int(junws(tds[0])), 'type': vote_type, 'meps': []}
    for tr in trs[1:]:
        tds = tr.xpath('.//td')
        if len(tds) < 2:
            log(
                1, "vote table has less than two columns in the body: %s %s" %
                (url, tostring(tr)))
            raise ValueError
        #grp = junws(tds[0]).split()
        for meps in tds[1].xpath(".//p"):
            meps = junws(meps)
            if not meps: continue
            for m in meps.split(','):
                m = unws(m)
                if not m: continue
                mepid = db.getMep(m, date=date)
                if not mepid:
                    log(2, "could not resolve MEP name: %s" % m)
                res['meps'].append(mepid or m)
    return res
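
A hypothetical usage sketch for Example #1 (the HTML, URL and MEP names below are made up, and the function still needs the project's db, log, junws and unws helpers to be importable):

from lxml import etree

snippet = """
<table>
  <tr><td>2</td><td>+</td></tr>
  <tr><td>PPE</td><td><p>Doe John, Roe Jane</p></td></tr>
</table>
"""
table = etree.fromstring(snippet)
# extract_table's optional date argument is only forwarded to db.getMep
result = extract_table(table, "http://example.org/pv")
# names db.getMep cannot resolve stay in result['meps'] as plain strings
print(result['type'], result['total'], result['meps'])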
Example #2
def scrape(url, **kwargs):
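    """Fetch a plenary roll-call vote XML document and turn every
    RollCallVote.Result element into a vote dict: timestamp, vote id,
    title and per-group for/against/abstention results with resolved
    MEP ids. Each vote is handed to process() for storage and the list
    of parsed votes is returned (None if the XML could not be fetched).
    """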
    log(3,"scraping %s" % (url))
    root = getXML(url)
    if root is None:
        log(1,"could not get votes for", url)
        return # angrily o/
    log(3, "processing plenary votes xml from %s" % url)
    # root is:
    #PV.RollCallVoteResults EP.Number="PE 533.923" EP.Reference="P7_PV(2014)04-17" Sitting.Date="2014-04-17" Sitting.Identifier="1598443"
    votes=[]
    for vote in root.xpath('//RollCallVote.Result'):
        # hrmpf, some EP seriously used the braindead Y-d-m format sometimes in vote timestamps :/
        time = vote.get('Date')
        if len(time.split()) == 2:
            ts = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
        else:
            ts = datetime.strptime(time, "%Y-%m-%d")
        tmp=vote.get('Identifier')
        if tmp:
            voteid = int(tmp)
        else:
            tmp = vote.get('Number')
            if not tmp:
                log(1, "blimey, could not deduce an id for the vote in %s" % url)
                raise ValueError("no id for vote in %s" % url)
            voteid = "%s-%s" % (ts,tmp)
        title = vote.xpath("RollCallVote.Description.Text")
        if len(title) != 1:
            log(2, "holy ambiguity Batman! This vote doesn't have one title, but %d: %d %s" % (len(title), voteid, url))
            title="!unknown!"
        else:
            title=junws(title[0])
        v={u"ts": ts,
           u"url": url,
           u"voteid": voteid,
           u"title": title,
           'votes':{}}
        v.update(votemeta(v['title'], v['ts']))
        if 'epref' not in v:
            ref = vote.xpath("RollCallVote.Description.Text/a/text()")
            if ref:
                v['epref']=unws(ref[0])
        for rtype, stype in [('Result.For','+'), ('Result.Against','-'), ('Result.Abstention','0')]:
            result = vote.xpath(rtype)
            if not result: continue
            if len(result)>1:
                log(2, "[pff] more than one %s entry in vote (id:%s) in %s" % (stype, v['voteid'], url))
            result = result[0]
            v['votes'][stype]={'total': int(result.get('Number')),
                               'groups': {}}
            for group in result.xpath('Result.PoliticalGroup.List'):
                g = str(group.get('Identifier'))
                if g not in v['votes'][stype]['groups']:
                    v['votes'][stype]['groups'][g]=[]
                for tag in ['Member.Name', 'PoliticalGroup.Member.Name']:
                    for mep in group.xpath(tag):
                        m = {}
                        name = junws(mep)
                        mepid = mep.get("PersId")
                        if mepid:
                            mepid = int(mepid)
                        else:
                            mepid = db.getMep(name, v['ts'], abbr=g)
                        if mepid:
                            m['mepid']= mepid
                            #if int(mep.get('MepId')) in ambiguous_meps:
                            #    oid = int(mep.get('MepId'))
                            #    ambiguous_meps.remove(oid)
                            #    log(2,'found mepid for previously ambigous obscure_id: "%s": %s' % (oid, mepid))
                        else:
                            mepid = lost_meps.get(mep.get('MepId'))
                            if mepid:
                                m['mepid']= mepid
                            else:
                                m['name']= name
                                m['obscure_id']=int(mep.get('MepId'))  # it's a totally useless and confusing id that is nowhere else used
                        v['votes'][stype]['groups'][g].append(m)
        # save
        process(v, v['voteid'], db.vote, 'ep_votes', v['title'])
        votes.append(v)
    return votes
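
A hypothetical driver for Example #2 (the URL is only illustrative, and scrape() still needs the project's getXML, votemeta, process and db helpers):

if __name__ == '__main__':
    url = "https://www.europarl.europa.eu/doceo/document/PV-9-2020-07-23-RCV_FR.xml"
    for v in scrape(url) or []:  # scrape() returns None when the XML cannot be fetched
        print(v['voteid'], v['ts'].isoformat(), v['title'])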
Example #3
def parse_block(block, url, reference, date, committee, rapporteur, PE):
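    """Parse one amendment block (a list of raw text lines) into a dict:
    sequence number and id, authors with resolved MEP ids, location
    references, the old/new text columns and, where present, the
    justification and original language."""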
    am={u'src': url,
        u'peid': PE,
        u'reference': reference,
        u'date': date,
        u'committee': committee}

    #logger.info(block)
    # get title
    try:
        am[u'seq']=int(unws(block[0]).split()[1])
    except ValueError:
        am[u'seq']=unws(block[0]).split()[1]
    except IndexError:
        log(2,"wrong seq %s" % (block[0]))
        am[u'seq']=unws(block[0])
    del block[0]

    pefix = PE.split('v')[0] # we strip off the v0[0-9]-[0-9]{1,2} part of the PEID
    am['id']="%s-%s" % (pefix,am['seq'])

    strip(block)

    # find and strip justification
    i=len(block)-1
    while i>2 and not (unws(block[i])=="Justification" and block[i].startswith(' ' * 6)):
        i-=1
    if i>2:
        if i<len(block)-1 and (not unws(block[i+1]) or not block[i+1].startswith(' ') ):
            am['justification']='\n'.join(block[i+2:])
            del block[i:]
            strip(block)
        else:
            log(2, 'wrong justification in %s: "%s"' % (am['seq'], '\\n'.join(block[i:])))

    # get original language
    if 4<len(unws(block[-1]))<=6 and unws(block[-1]).startswith('Or.'):
        am['orig_lang']=unws(block[-1])[4:]
        del block[-1]
        strip(block)

    # find split column new/old heading
    i=len(block)-1
    while (i>2 and
           not ((block[i].endswith("     Amendment") or
                 block[i].endswith("     PARTICULARS") or
                 block[i].endswith("     Remedy") or
                 block[i].endswith("     Amended text") or
                 block[i].endswith("     Amendement") or
                 block[i].endswith("           Amendments by Parliament") or
                 block[i].endswith("           Proposal for rejection") or
                 block[i].endswith("           Proposal for a rejection") or
                 block[i].endswith("           Does not affect English version") or
                 block[i].endswith("           (Does not affect English version)") or
                 block[i].endswith("      Amendment by Parliament")) and
                len(block[i])>33) and
           not (unws(block[i])=='Text proposed by the Commission' or
                unws(block[i]) in types)):
        i-=1
    if i>2:
        #if block[i].endswith("               Proposal for rejection"):
        #    pass # location will be possibly '-'
        seq=False
        if unws(block[i]) in ["Amendment", "Amendment by Parliament"]:
            # sequential format, search for preceding original text
            j=i
            while (j>2 and not (unws(block[j]) in types or unws(block[j])=='Text proposed by the Commission')):
                j-=1
            if j>2: i=j
            seq=True; key='old'
        elif unws(block[i])=='Text proposed by the Commission' or block[i].strip() in types:
            seq=True; key='old'
        # throw headers
        del block[i]
        while i<len(block) and not unws(block[i]): del block[i]        # skip blank lines
        mid=max([len(x) for x in block])//2
        while i<len(block):
            if seq:
                if unws(block[i]) in ["Amendment", "Amendment by Parliament", "Text Amended"]:
                    key='new'
                    del block[i]
                    continue
                try: am[key].append(block[i])
                except KeyError: am[key]=[block[i]]
                del block[i]
                continue
            # only new, old is empty
            if block[i].startswith('         '):
                try: am['new'].append(unws(block[i]))
                except KeyError: am['new']=[unws(block[i])]
                del block[i]
                continue
            newstart = block[i].rstrip().rfind('  ')
            # only old, new is empty
            if newstart < 6:
                try: am['old'].append(unws(block[i]))
                except KeyError: am['old']=[unws(block[i])]
                del block[i]
                continue
            #mid=len(block[i])/2
            #mid=40
            lsep=block[i].rfind('  ', 0, mid)
            rsep=block[i].find('  ', mid)
            sep=None
            if abs(lsep-mid)<abs(rsep-mid):
                if abs(lsep-mid)<15:
                    sep=lsep
            else:
                if abs(rsep-mid)<15:
                    sep=rsep
            if sep is not None:
                try: am['old'].append(unws(block[i][:sep]))
                except KeyError: am['old']=[unws(block[i][:sep])]
                try: am['new'].append(unws(block[i][sep:]))
                except KeyError: am['new']=[unws(block[i][sep:])]
            else:
                # no sane split found
                #logger.warn("no split: %s %s\n%s" % (datetime.now().isoformat(),
                #                                     (sep, mid, len(block[i]), newstart, block[i]),
                #                                     block[i][mid-1:mid+2]))
                # fallback to naive splitting
                try: am['old'].append(unws(block[i][:newstart]))
                except KeyError: am['old']=[unws(block[i][:newstart])]
                try: am['new'].append(unws(block[i][newstart:]))
                except KeyError: am['new']=[unws(block[i][newstart:])]
            del block[i]
        strip(block)
    else:
        if 'Does not affect English version.' not in block[i:]:
            log(2, "no table\n%s" % ('\n'.join(block[i:])))
            return None
            #am['content']=block[i:]
            #return am

    i=0
    # find end of authors
    while (i<len(block) and
           unws(block[i]) and
           not unws(block[i]).lower().startswith('compromise') and
           not istype(block[i]) and
           not unws(block[i]).split()[0] in locstarts): i+=1
    if i<len(block):
        if i>0:
            names=' '.join(block[:i])
            am['authors']=names
            #logger.info("names \n%s" % names)

            # convert to pt mep _ids
            for text in filter(None,splitNames(names)):
                mepid=db.getMep(text,date)
                if mepid:
                    try: am['meps'].append(mepid)
                    except KeyError: am['meps']=[mepid]
                else:
                    log(3, "fix %s" % text)
            del block[:i]
            strip(block)
        elif rapporteur:
            am['authors']=rapporteur
            if isinstance(rapporteur,list):
                for text in rapporteur:
                    mepid=db.getMep(text,date)
                    if mepid:
                        try: am['meps'].append(mepid)
                        except KeyError: am['meps']=[mepid]
                    else:
                        log(3, "fix %s" % text)
            else:
                for text in filter(None,splitNames(rapporteur)):
                    mepid=db.getMep(text,date)
                    if mepid:
                        try: am['meps'].append(mepid)
                        except KeyError: am['meps']=[mepid]
                    else:
                        log(3, "fix %s" % text)
        else:
            log(3, "no authors in Amendment %s %s" % (am['seq'], url))
    else:
        log(2, "no boundaries in Amendment %s %s\n%s" % (am['seq'], url,
                                                      '\n'.join(block)))
        am['rest']=block
        return am

    # handle compromise info
    i=0
    while (i<len(block) and
           unws(block[i]) and
           not istype(block[i]) and
           not unws(block[i]).split()[0] in locstarts): i+=1
    if i<len(block) and i>0:
        if [unws(x) for x in block[:i]]!=["Draft proposal for a recommendation"]:
            am['compromise']=block[:i]
        del block[:i]
        strip(block)

    i=0
    while (i<len(block) and unws(block[i])):
        if unws(block[i]).split()[0] in locstarts:
            try: am['location'].append((' '.join(block[:i]),unws(block[i])))
            except KeyError: am['location']=[(' '.join(block[:i]),unws(block[i]))]
            del block[:i+1]
            i=0
        else:
            i+=1
    if len(block)>0 and ((len(block)==1 or
                          not unws(block[1])) and
                         unws(block[0])!='1' and
                         'location' in am):
        am['location'][-1]=(am['location'][-1][0],"%s %s" % (am['location'][-1][1],block[0]))
        del block[0]
        strip(block)

    if block:
        if not ((len(block)==3 and
                unws(block[0])=='1' and
                not unws(block[1]) and
                block[2].startswith("  ")) or
                (len(block)==2 and
                unws(block[0])=='1' and
                block[1].startswith("  "))):
            # ignore obvious footnotes
            log(3, "rest in Amendment %s\n%s" % (am['seq'],'\n'.join(block)))
    return am
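
A schematic call of Example #3 (the amendment text and all metadata below are invented; parse_block() relies on the scraper module's own helpers such as unws, strip, istype, splitNames, types, locstarts and db, so this is not standalone):

from datetime import datetime

block = [
    "Amendment  1",
    "Jane Doe",
    "",
    "Proposal for a regulation",
    "Article 1 – paragraph 1",
    "",
    "       Text proposed by the Commission              Amendment",
    "",
    "   The old wording of the paragraph.        The new wording of the paragraph.",
    "",
    "Or. en",
]
am = parse_block(block,
                 url="http://example.org/AM-123.pdf",
                 reference="2012/0011(COD)",
                 date=datetime(2013, 1, 10),
                 committee=["LIBE"],
                 rapporteur="Jane Doe",
                 PE="PE123.456v01-00")
# am now carries 'seq', 'id', 'authors', the 'old'/'new' columns and 'orig_lang'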