示例#1
0
def create_tasks(engine):
    """Synchronise the pyBossa tasks with the ``representative`` table.

    The app's task presenter is refreshed from the bundled HTML template
    first. Afterwards every representative row is visited: rows whose
    ``networking`` text is missing or shorter than three characters (after
    stripping) are skipped; every other row gets a signature and is either
    written over the existing task carrying that signature or submitted as
    a new task.

    :param engine: database handle passed through to the ``sl`` helpers.
    """
    log.info("Updating tasks on pyBossa...")
    app = setup()
    with flask_app.open_resource('resources/pbnetworks_template.html') as f:
        app.info['task_presenter'] = f.read()
    # The template file is closed before talking to the remote service.
    pbclient.update_app(app)

    tasks = pbclient.get_tasks(app.id, limit=30000)
    # Index the tasks already on the server by the signature stored in
    # their info payload, so a row can be matched to its task directly.
    existing = {t.data.get('info').get('signature'): t for t in tasks}

    for rep in sl.all(engine, sl.get_table(engine, 'representative')):
        networking = rep.get('networking')
        if networking is None or len(networking.strip()) < 3:
            continue

        # The signature is the SHA-1 of identification code + networking
        # text; characters outside ASCII are dropped before hashing.
        signature = rep.get('identification_code') + networking
        signature = sha1(signature.encode('ascii', 'ignore')).hexdigest()
        rep['signature'] = signature
        log.debug("Task: %s", rep['name'])

        # Both date fields are replaced by their isoformat() strings before
        # the row is handed to the pyBossa client.
        rep['last_update_date'] = rep['last_update_date'].isoformat()
        rep['registration_date'] = rep['registration_date'].isoformat()

        if signature in existing:
            task = existing.get(signature)
            task.data['info'] = rep
            pbclient.update_task(task)
        else:
            pbclient.create_task(app.id, rep)
示例#2
0
def load(engine):
    """Load the representatives that are not flagged as unclean.

    Every row of the ``representative`` table is logged; rows whose
    ``etl_clean`` value is exactly ``False`` are skipped, all others are
    passed to ``load_representative``.
    """
    representatives = sl.get_table(engine, 'representative')
    for record in sl.all(engine, representatives):
        log.info("Loading: %s", record.get('name'))
        # Identity check on purpose: only a literal False excludes a row.
        if record['etl_clean'] is not False:
            load_representative(engine, record)
        else:
            log.debug("Skipping!")
示例#3
0
def load(engine):
    """Load every row of the ``representative`` table.

    Each row is logged together with its position in the iteration and
    then passed to ``load_representative``. No ``etl_clean`` filtering is
    applied in this variant.
    """
    representatives = sl.all(engine, sl.get_table(engine, 'representative'))
    for position, record in enumerate(representatives):
        log.info("Loading(%s): %s", position, record.get('name'))
        load_representative(engine, record)
示例#4
0
def transform(engine):
    """Geo-code the representatives that have no longitude yet.

    For every row of the ``representative`` table without a truthy
    ``contact_lon``, the contact address fields are sent as a query to the
    service at ``URL``. When the service answers with a non-empty JSON
    list, the first entry's display name, longitude and latitude are
    written back to the row (matched on ``id``).

    Responses that are not valid JSON, or that are not a non-empty list,
    leave the row untouched.

    :param engine: database handle passed through to the ``sl`` helpers.
    """
    log.info("Geo-coding representatives...")
    table = sl.get_table(engine, 'representative')
    for row in sl.all(engine, table):
        # Already geo-coded on an earlier run.
        if row.get('contact_lon'):
            continue

        query = {
            'format': 'json',
            'limit': 1,
            'city': row.get('contact_town'),
            'street': row.get('contact_street'),
            'country': row.get('contact_country'),
            'postalcode': row.get('contact_post_code'),
        }
        response = requests.get(URL, params=query)
        try:
            results = response.json()
        except ValueError:
            # Raised when the body is not valid JSON. Anything else
            # (including KeyboardInterrupt) is allowed to propagate.
            log.warning("Could not decode geocoder response for: %s",
                        row.get('name'))
            continue

        # Only a non-empty list can be indexed for a first hit; other
        # payloads are skipped.
        if not isinstance(results, list) or not results:
            continue

        geo = results[0]
        log.info("%s @ %s", row.get('name'), geo.get('display_name'))
        out = {
            'id': row['id'],
            'contact_geoname': geo.get('display_name'),
            'contact_lon': geo.get('lon'),
            'contact_lat': geo.get('lat'),
        }
        sl.upsert(engine, table, out, ['id'])
示例#5
0
def dedup_fields(engine, field):
    """Tag representatives that share the same value in ``field``.

    For each distinct value of ``field`` in the ``representative`` table,
    all rows carrying that value are looked up. When there is more than
    one, every row of the group gets a ``name_suffix`` of the form
    ``(Duplicate N)``, where N is its 1-based position in the lookup
    result. Rows are matched on ``identification_code`` when writing.

    :param engine: database handle passed through to the ``sl`` helpers.
    :param field: name of the column whose values are compared.
    """
    table = sl.get_table(engine, 'representative')
    # Values whose group has already been handled; without this every
    # member of a group would look up and rewrite the whole group again.
    seen = set()
    for rep in sl.all(engine, table):
        value = rep[field]
        if value in seen:
            continue
        seen.add(value)

        others = list(sl.find(engine, table, **{field: value}))
        if len(others) > 1:
            log.info("Duplicates for: %s", rep['name'])
            for i, other in enumerate(others):
                text = "(Duplicate %s)" % (i+1)
                sl.upsert(engine, table,
                    {'name_suffix': text,
                     'identification_code': other['identification_code']},
                    ['identification_code'])
示例#6
0
文件: dedup.py 项目: stef/lobbyfacts
def dedup_fields(engine, field):
    """Tag later representatives that repeat an earlier ``field`` value.

    Rows of the ``representative`` table are visited in iteration order.
    Rows with an empty or whitespace-only value are ignored. The first row
    seen for a given value triggers a lookup of all rows with that value;
    when there is more than one, every row of the group except the current
    one gets a ``name_suffix`` of the form ``(Duplicate N)``, where N is
    its 1-based position in the lookup result. Rows are matched on
    ``identification_code`` when writing.

    A progress line is logged every 100 rows.

    :param engine: database handle passed through to the ``sl`` helpers.
    :param field: name of the column whose values are compared.
    """
    table = sl.get_table(engine, 'representative')
    seen = set()
    for n, rep in enumerate(sl.all(engine, table)):
        if n % 100 == 0:
            log.info("%s done", n)

        value = rep[field]
        if not value or not value.strip() or value in seen:
            continue
        # add(), not update(): update() would insert the individual
        # characters of the string, so the membership test above would
        # never recognise a value that was already handled.
        seen.add(value)

        others = list(sl.find(engine, table, **{field: value}))
        if len(others) > 1:
            log.info("Duplicates for: %s", rep['name'])
            for i, other in enumerate(others):
                # The row that triggered the lookup keeps its plain name.
                if other == rep:
                    continue
                text = "(Duplicate %s)" % (i+1)
                sl.upsert(engine, table,
                    {'name_suffix': text,
                     'identification_code': other['identification_code']},
                    ['identification_code'])
示例#7
0
def load(engine):
    """Load every row of the ``meeting`` table.

    Each row is logged together with its position in the iteration and
    then passed to ``load_meeting``.
    """
    meeting_table = sl.get_table(engine, 'meeting')
    position = 0
    for meeting in sl.all(engine, meeting_table):
        log.info("Loading(%s): %s", position, meeting.get('name'))
        load_meeting(engine, meeting)
        position += 1