示例#1
0
def save(data, stats):
    """Insert or update a MEP record, recording field-level changes.

    data  -- scraped MEP dict; must contain 'UserID', 'Name' and 'meta'.
    stats -- mutable [added, updated] counter list, or a falsy value.

    Returns stats when given, otherwise the (possibly updated) data dict.
    """
    res = Mep.get_by_id(data['UserID'])
    # bookkeeping keys never take part in the content diff
    skip = ('meta', 'changes', 'activities')
    if res is not None:
        # carry over a previously known gender if the fresh scrape lacks one
        if 'Gender' not in data and 'Gender' in res.data:
            data['Gender'] = res['Gender']
        d = diff({k: v for k, v in res.data.items() if k not in skip},
                 {k: v for k, v in data.items() if k not in skip})
        data['changes'] = res.data.get('changes', {})
    else:
        d = diff({}, {k: v for k, v in data.items() if k not in skip})
        data['changes'] = {}
    if d:
        now = datetime.utcnow().replace(microsecond=0)
        if not res:
            logger.info('adding %s' % (data['Name']['full']))
            data['meta']['created'] = now
            if stats: stats[0] += 1
            # note: data['changes'] is already {} from the branch above
        else:
            logger.info('updating %s' % (data['Name']['full']))
            # logger.warn is a deprecated alias of logger.warning
            logger.warning(jdump(d))
            data['meta']['updated'] = now
            if stats: stats[1] += 1
            data['id'] = res.id
            # note: data['changes'] already holds res.data's change history
        data['changes'][now.isoformat()] = d
        Mep.upsert(data)
    del res
    if stats:
        del data
        return stats
    else:
        return data
示例#2
0
def run(args):
    """CLI dispatcher for the MEP scraper.

    args[0] selects a mode:
      - "test":        yield scrapes of two known MEP ids
      - "mepid <id>":  yield a JSON dump of one scraped MEP
      - a meplists key: yield a (scrape, crawler) job description
    With no args, print the available options and return.
    """
    if len(args)<1:
        print("possible options: full|test|mepid <mepid>|"+'|'.join(meplists.keys()))
        return
    if args[0]=="test":
        yield scrape('28215')
        yield scrape('113959')

        #print jdump(scrape('108570')).encode('utf8')
        #print jdump(scrape('1934')).encode('utf8')
        #print jdump(scrape('96919')).encode('utf8')
        #import code; code.interact(local=locals());
        return
        # NOTE(review): everything below this return is intentionally dead;
        # kept as reference URLs for older manual tests.
        yield scrape("http://www.europarl.europa.eu/meps/en/1934/get.html")
        yield scrape("http://www.europarl.europa.eu/meps/en/28576/get.html")
        yield scrape("http://www.europarl.europa.eu/meps/en/1263/Elmar_BROK.html")
        yield scrape("http://www.europarl.europa.eu/meps/en/96739/Reinhard_B%C3%9CTIKOFER.html")
        yield scrape("http://www.europarl.europa.eu/meps/en/28269/Jerzy_BUZEK.html")
        yield scrape("http://www.europarl.europa.eu/meps/en/1186/Astrid_LULLING.html")

    # BUGFIX: was `and args[1]`, which raises IndexError when the id is missing
    elif args[0]=='mepid' and len(args)>1:
        yield jdump(scrape(int(args[1])))

    elif args[0] in meplists:
        #s=Multiplexer(scrape,save,threads=4)
        #def _crawler():
        #    return crawler(args[0])
        #s.run(_crawler)
        yield (scrape, crawler(args[0]))
        return
示例#3
0
def save(data, stats):
    """Insert or update a dossier record, recording field-level changes.

    data  -- scraped dossier dict (falsy data is skipped untouched).
    stats -- mutable [added, updated] counter list; always returned.
    """
    if not data: return stats
    res = Dossier.get_by_src(data['meta']['source'])
    # bookkeeping keys never take part in the content diff
    skip = ('meta', 'changes')
    if res is not None:
        # BUGFIX(review): was res.items(); every other access in this branch
        # goes through the stored payload res.data (cf. res.data.get below),
        # so diff against res.data.items() — matching the sibling MEP save().
        d = diff({k: v for k, v in res.data.items() if k not in skip},
                 {k: v for k, v in data.items() if k not in skip})
        data['changes'] = res.data.get('changes', {})
    else:
        d = diff({}, {k: v for k, v in data.items() if k not in skip})
        data['changes'] = {}
    #logger.warn(pprint.pformat(d))
    if d:
        # the scrape timestamp becomes the change key and is dropped from meta
        now = data['meta']['timestamp'].replace(microsecond=0).isoformat()
        del data['meta']['timestamp']
        if not res:
            logger.info(('adding %s - %s' % (data['procedure']['reference'],data['procedure']['title'])).encode('utf8'))
            data['meta']['created'] = now
            stats[0] += 1
        else:
            logger.info(('updating  %s - %s' % (data['procedure']['reference'],data['procedure']['title'])).encode('utf8'))
            data['meta']['updated'] = now
            stats[1] += 1
            logger.info(jdump(d))
        #if not NOMAIL:
        #    m=db.notifications.find({'dossiers': data['procedure']['reference']},['active_emails'])
        #    for g in m:
        #        if len(g['active_emails'])==0:
        #            continue
        #        msg = Message("[PT] %s %s" % (data['procedure']['reference'],data['procedure']['title']),
        #                      sender = "*****@*****.**",
        #                      bcc = g['active_emails'])
        #        #msg.html = htmldiff(data,d)
        #        msg.body = makemsg(data,d)
        #        mail.send(msg)
        #logger.info(htmldiff(data,d))
        #logger.info(makemsg(data,d))
        data['changes'][now] = d
        Dossier.upsert(data)
    return stats
示例#4
0
def process(obj,
            id,
            getter,
            table,
            name,
            nopreserve=None,
            nodiff=False,
            nostore=False,
            onchanged=None):
    """Persist obj into table, recording a diff against the stored version.

    obj        -- record to store; empty values (except explicit False) are dropped
    id         -- primary key, used for getter(id) lookups and log output
    getter     -- callable returning the previously stored record, or None
    table      -- table name handed to db.put
    name       -- human-readable label for log messages
    nopreserve -- top-level keys whose deletion is allowed; deletions of any
                  other single top-level key are reverted from the old record
    nodiff     -- store immediately as a fresh record, skipping the diff
    nostore    -- compute everything but skip the actual db.put
    onchanged  -- callback(obj, diff) fired after a change was processed

    Returns obj (None on the nodiff fast path).
    Raises ValueError when db.put fails or the diff does not round-trip.
    """
    if nopreserve is None: nopreserve = []
    # bookkeeping keys excluded from all diffs
    skip = ('meta', 'changes', '_id')
    # clear out empty values, but keep explicit False
    obj = {k: v for k, v in obj.items() if v or v == False}

    if nodiff:
        now = datetime.utcnow().replace(microsecond=0)
        if 'meta' not in obj: obj['meta'] = {}
        log(3, 'adding %s (%s)' % (name, id))
        obj['meta']['created'] = now
        obj['changes'] = {}
        if not nostore and not db.put(table, obj):
            log(1, "failed to store updated obj {}".format(id))
            raise ValueError
        if onchanged is not None:
            # BUGFIX: this used to pass `d`, which is never assigned in the
            # nodiff branch (guaranteed NameError); no diff exists here.
            onchanged(obj, None)
        return

    # generate diff against the previously stored version
    prev = getter(id)
    if prev is not None:
        d = diff({k: v for k, v in prev.items() if k not in skip},
                 {k: v for k, v in obj.items() if k not in skip})

        # preserve some top level items: a deletion of a single top-level key
        # is reverted unless the key is listed in nopreserve or the old value
        # was already empty ({} or [])
        d1 = []
        for c in d:
            if c['type'] != 'deleted' or len(c['path']) != 1 or c['path'][0] in nopreserve:
                d1.append(c)
                continue
            if c['type'] == 'deleted' and len(c['path']) == 1 and c['data'] in ({}, []):
                d1.append(c)
                continue
            log(3, "preserving deleted path {} for obj id: {}".format(c['path'], id))
            obj[c['path'][0]] = prev[c['path'][0]]
        d = d1
    else:
        d = diff({}, {k: v for k, v in obj.items() if k not in skip})

    if d:
        # attempt to recreate current version by applying d to prev
        o2 = patch(prev or {}, json.loads(jdump(d)))
        if not o2:
            log(1,
                "failed to recreate {} record by patching previous version with diff"
                .format(id))
            raise ValueError
        else:
            # diff the recreated record against the current one;
            # anything non-empty means d does not round-trip
            zero = diff({k: v for k, v in o2.items() if k not in skip},
                        {k: v for k, v in obj.items() if k not in skip})
            if zero != []:
                log(1,
                    "id:{} diff between current record and patched previous one is not empty\n{!r}"
                    .format(id, zero))
                raise ValueError(
                    "diff between new and patched old is not empty")

        now = datetime.utcnow().replace(microsecond=0)
        if 'meta' not in obj: obj['meta'] = {}
        # `or nodiff` removed: nodiff always returned early above
        if not prev:
            log(3, 'adding %s (%s)' % (name, id))
            obj['meta']['created'] = now
            # NOTE(review): for brand-new records the diff is NOT written into
            # obj['changes'] (original behavior); only updates keep history
            obj['changes'] = {}
        else:
            log(3, 'updating %s (%s)' % (name, id))
            log(4, "changes for %s\n%s" % (id, jdump(d)))
            obj['meta']['updated'] = now
            obj['changes'] = prev.get('changes', {})
            obj['changes'][now.isoformat()] = d
        if not nostore and not db.put(table, obj):
            log(1, "failed to store updated obj {}".format(id))
            raise ValueError
        if onchanged is not None:
            onchanged(obj, d)
    del prev
    if __name__ == '__main__':
        print(jdump(obj))
    return obj
示例#5
0
                            mepid = int(mepid)
                        else:
                            mepid = db.getMep(name, v['ts'], abbr=g)
                        if mepid:
                            m['mepid']= mepid
                            #if int(mep.get('MepId')) in ambiguous_meps:
                            #    oid = int(mep.get('MepId'))
                            #    ambiguous_meps.remove(oid)
                            #    log(2,'found mepid for previously ambigous obscure_id: "%s": %s' % (oid, mepid))
                        else:
                            mepid = lost_meps.get(mep.get('MepId'))
                            if mepid:
                                m['mepid']= mepid
                            else:
                                m['name']= name
                                m['obscure_id']=int(mep.get('MepId'))  # it's a totally useless and confusing id that is nowhere else used
                        v['votes'][stype]['groups'][g].append(m)
        # save
        process(v, v['voteid'], db.vote, 'ep_votes', v['title'])
        votes.append(v)
    return votes

from utils.process import publish_logs
def onfinished(daisy=True):
    # Post-run hook: publish the accumulated job logs.
    # `daisy` is accepted for hook-interface compatibility but unused here.
    publish_logs(get_all_jobs)

if __name__ == '__main__':
    import sys
    # Manual entry point: scrape a single page given by URL on the CLI
    # and dump the result as JSON.
    url = sys.argv[1]
    print(jdump(scrape(url)))
示例#6
0

def onfinished(daisy=True):
    """Post-run hook: publish the accumulated job logs."""
    # import deferred to avoid the utils.process dependency at module load
    from utils.process import publish_logs
    publish_logs(get_all_jobs)


if __name__ == '__main__':
    # Ad-hoc manual test entry point; the commented calls below are
    # historical test cases kept for reference.
    #print(jdump(scrape(1275)))
    #scrape(28390)
    #scrape(96779)
    #scrape(96674)
    #scrape(28469)
    #scrape(96843)
    #scrape(1393) # 1-3rd term
    #scrape(96992)
    #scrape(1275)
    # test written decl:
    #print(jdump(scrape(28266, [8], "some MEP")))
    # test written expl:
    #print(jdump(scrape(197682, [9], "some MEP")))
    # test plen spch
    #print(jdump(scrape(28266, [9], "some MEP")))
    # test report-shadow with double committee
    #print(jdump(scrape(28266, [7,8,9], "some MEP")))
    # major interpellations:
    print(jdump(scrape(131749, [7, 8, 9], "some MEP")))

    #import sys
    #print(jdump(scrape(int(sys.argv[1]), [9], "some MEP")))
示例#7
0
        logger.error('[wtf] did not reach final state %s' % state)
        return {}
    else:
        if (len(data['occupation'])>1 and
            data['occupation'][-1][0] in [u"No occupation held during the three years preceding the current mandate",
                                          u"Καμία επαγγελματική δραστηριότητα κατά τη διάρκεια των τριών ετών που προηγήθηκαν της τρέχουσας εντολής",
                                          u"Atividade Liberal como autor/outras atividades artísticas (remuneração inferior a 500 € na totalidade dos 3 anos anteriores)",
                                          u"Brak działalności zawodowej w okresie trzech lat poprzedzających obecną kadencję",
                                          u"Geen beroep uitgeoefend gedurende de drie jaar voorafgaand aan de huidige zittingsperiode",
                                          u"Nessuna attività svolta durante i tre anni precedenti l'attuale mandato",
                                          u"Keine Berufstätigkeit während des Dreijahreszeitraums vor der laufenden Wahlperiode",
                                          u"Aucune activité professionnelle au cours des trois années ayant précédé le présent mandat",
                                          u"Sin ocupación durante los tres años anteriores al actual mandato",
                                          u"Intet erhvervsarbejde i de tre år forud for det nuværende mandate",
                                          u"Nicio activitate profesională în ultimii trei ani dinaintea preluării mandatului actual",
                                          u"Har inte utövat någon yrkesmässig verksamhet under de tre år som föregick det nuvarande mandatet",
                                          u"Sem atividade profissional durante os três anos que precederam o atual mandato",
                                          u"Nepostojanje profesionalne djelatnosti tijekom tri godine prije aktualnog mandata",
                                          u"Ei ammatillista toimintaa kolmena nykyistä edustajantointa edeltävänä vuotena",
                                          u"A jelenlegi megbízatást megelőző három évben nem végzett foglalkozást.",
                                          u"Без професионална дейност по време на трите години, предшестващи текущия мандат",
                                          u"Během tří let před současným mandátem jsem nevykonával(a) žádnou profesní činnost.",
            ]):
            del data['occupation'][-1]
        return data

if __name__ == "__main__":
    # Manual test: scrape the declaration given on the CLI with debug
    # output enabled and dump the result as utf8-encoded JSON.
    DEBUG=True
    print(jdump(scrape(sys.argv[1])).encode('utf8'))
    #scrape(sys.argv[1])
示例#8
0
    from utils.process import publish_logs
    publish_logs(get_all_jobs)


if __name__ == '__main__':
    # Ad-hoc manual test entry point; the commented calls below are
    # historical test cases kept for reference.
    #print(jdump(scrape(1275)))
    #scrape(28390)
    #scrape(96779)
    #scrape(96674)
    #scrape(28469)
    #scrape(96843)
    #scrape(1393) # 1-3rd term
    #scrape(96992)
    #scrape(1275)
    # test written decl:
    #print(jdump(scrape(28266, [8], "some MEP")))
    # test written expl:
    #print(jdump(scrape(197682, [9], "some MEP")))
    # test plen spch
    #print(jdump(scrape(28266, [9], "some MEP")))
    # test report-shadow with double committee
    #print(jdump(scrape(28266, [7,8,9], "some MEP")))
    # major interpellations:
    #print(jdump(scrape(131749, [7,8,9], "some MEP")))
    #print(jdump(scrape(205452, [9], 'Chris MACMANUS')))
    #print(jdump(scrape(204400, [9], 'Adrián VÁZQUEZ LÁZARA', save=False)))
    print(jdump(scrape(197767, [9], 'Eugen JURZYCA', save=False)))

    #import sys
    #print(jdump(scrape(int(sys.argv[1]), [9], "some MEP")))
示例#9
0
                    pass
            continue

        if amstart.match(line):
            # parse block
            am=parse_block(block, url, reference, date, committee, meps, PE)
            if am is not None:
                process(am, am['id'], db.amendment, 'ep_amendments', am['reference']+' '+am['id'], nodiff=True)
                res.append(am)
            block=[line]
            continue
        block.append(line)
    if block and filter(None,block):
        am = parse_block(block, url, reference, date, committee, meps, PE)
        if am is not None:
            process(am, am['id'], db.amendment, 'ep_amendments', am['reference']+' '+am['id'], nodiff=True)
            res.append(am)
    log(3,"total amendments %d in %s" % (len(res),url))
    return res

from utils.process import publish_logs
def onfinished(daisy=True):
    # Post-run hook: publish the accumulated job logs.
    # `daisy` is accepted for hook-interface compatibility but unused here.
    publish_logs(get_all_jobs)

if __name__ == "__main__":
    from utils.utils import jdump
    # Manual test: scrape the amendments document at argv[1] for the MEP
    # name given in argv[2] and dump the result as JSON.
    # assumes `sys` is imported at the top of this module -- TODO confirm
    #print(jdump(scrape('https://www.europarl.europa.eu/doceo/document/INTA-AM-658734_EN.pdf', ['Enikő GYŐRI'])))
    #print(jdump(scrape("http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-//EP//NONSGML+COMPARL+PE-609.623+01+DOC+PDF+V0//EN&language=EN", "Krišjānis Kariņš")))
    #print(jdump(scrape(sys.argv[1],"ANDERSSON Max")))
    print(jdump(scrape(sys.argv[1],sys.argv[2])))
示例#10
0
        log(1, "sidebar has not 1 element: %s" % url)
        raise ValueError
    for li in sidebar[0].xpath('./li'):
        title = li.xpath('./a/span[@class="t-x"]/text()')
        if len(title) != 1:
            log(1, "title has not 1 element: %s" % url)
            raise ValueError
        title = unws(title[0])
        if title not in known_sidebar:
            log(2, '"%s" not in known_sidebar items, in %s' % (title, url))
        subtitles = li.xpath('.//div/ul/li/a/span[@class="t-x"]/text()')
        for s in subtitles:
            s = unws(s)
            if s not in known_sidebar[title]:
                log(
                    2, '"%s" -> "%s" not in known_sidebar items, in %s' %
                    (title, s, url))


if __name__ == '__main__':
    # Manual test: scrape the MEP whose numeric id is given on the CLI;
    # the commented calls are historical test ids kept for reference.
    #scrape(28390)
    #scrape(96779)
    #scrape(96674)
    #scrape(28469)
    #scrape(96843)
    #scrape(1393) # 1-3rd term
    #scrape(96992)
    #scrape(1275)
    print(jdump(scrape(int(sys.argv[1]))))
    #print(jdump({k: v for k,v in scrape(1428).items() if k not in ['changes']}))
示例#11
0
                          "\n %s" % (textdiff(diff) if diff else ''),
                          "%sdossier/%s" % (ROOT_URL, doc['epdoc']),
                      ))))


from utils.process import publish_logs


def onfinished(daisy=True):
    # Post-run hook: publish the accumulated job logs.
    # `daisy` is accepted for hook-interface compatibility but unused here.
    publish_logs(get_all_jobs)


if __name__ == "__main__":
    # Manual entry points:
    #   url <url> <committee>  -- scrape one agenda with the given committee
    #   url <url>              -- scrape with placeholder committee 'XXXX'
    #   test                   -- scrape a fixed known agenda
    if len(sys.argv) > 1:
        if sys.argv[1] == 'url' and len(sys.argv) == 4:
            print(jdump(scrape(sys.argv[2], sys.argv[3])))
            sys.exit(0)
        elif sys.argv[1] == "url":
            print('-' * 30)
            print(jdump(scrape(sys.argv[2], 'XXXX')))
            print('-' * 30)
            sys.exit(0)
        if sys.argv[1] == "test":
            #print(jdump([(u,d) for u,d in getComAgendas()]))
            print(
                jdump(
                    scrape(
                        'http://www.europarl.europa.eu/sides/getDoc.do?type=COMPARL&reference=LIBE-OJ-20120112-1&language=EN',
                        'LIBE')))
            #import code; code.interact(local=locals());
            sys.exit(0)
示例#12
0
                        name = junws(mep)
                        mepid = db.getMep(name, v['ts'], abbr=g)
                        if mepid:
                            m['mepid']= mepid
                            #if int(mep.get('MepId')) in ambiguous_meps:
                            #    oid = int(mep.get('MepId'))
                            #    ambiguous_meps.remove(oid)
                            #    log(2,'found mepid for previously ambigous obscure_id: "%s": %s' % (oid, mepid))
                        else:
                            mepid = lost_meps.get(mep.get('MepId'))
                            if mepid:
                                m['mepid']= mepid
                            else:
                                m['name']= name
                                m['obscure_id']=int(mep.get('MepId'))  # it's a totally useless and confusing id that is nowhere else used
                        v['votes'][stype]['groups'][g].append(m)
        # save
        process(v, v['voteid'], db.vote, 'ep_votes', v['title'])
        votes.append(v)
    return votes

from utils.process import publish_logs
def onfinished(daisy=True):
    # Post-run hook: publish the accumulated job logs.
    # `daisy` is accepted for hook-interface compatibility but unused here.
    publish_logs(get_all_jobs)

if __name__ == '__main__':
    import sys
    # Manual test: scrape votes for a parliamentary term (argv[1]) on a
    # given date (argv[2]) and dump the result as JSON.
    term = int(sys.argv[1])
    date = sys.argv[2]
    print(jdump(scrape(term, date)))
示例#13
0
                del (item['title'])
            if item.get('body') == 'EC' and len(d.get('commission', [])) == 1:
                item.update(d['commission'][0])
            if isinstance(item['date'], list):
                if not len(item['date']):
                    continue
                if len(set(item['date'])) == 1:
                    item['date'] = item['date'][0]
                else:
                    print("more than one date in: ", item)
            if not item.get("body") and item.get(
                    'type') != 'Final act published in Official Journal':
                log(
                    2, "merge_events: no body for {!r}".format(
                        {k: v
                         for k, v in item.items() if k != 'summary'}))
                #continue #print(item)
            activities.append(item)
    res = sorted(activities,
                 key=lambda x: x['date'][0]
                 if isinstance(x['date'], list) else x['date'],
                 reverse=True)
    return res


if __name__ == '__main__':
    # Manual test: merge the events of a known dossier and dump as JSON.
    from db import db
    d = db.dossier('2016/0279(COD)')
    from utils.utils import jdump
    print(jdump(merge_events(d)))