def gen_data(start_year=None, end_year=None, mode='', **kwargs):
    """Generates historical or current data"""
    # default `mode` to an empty string so the startswith() checks below
    # don't fail when no mode is given
    end_year = int(end_year or dt.now().year) + 1
    start_year = start_year or end_year - 1
    years = range(start_year, end_year)

    appeals_mode = mode.startswith('A')
    cluster_mode = mode.startswith('C')
    emergency_mode = mode.startswith('E')

    base = kwargs['BASE_URL']
    suffix = kwargs['SUFFIX']

    for year in years:
        appeals = urlopen('%s/appeal/year/%s%s' % (base, year, suffix))
        emergencies = urlopen('%s/emergency/year/%s%s' % (base, year, suffix))

        if appeals_mode or cluster_mode:
            data_items = items(appeals, kwargs['DATA_LOCATION'])
            emergency_items = items(emergencies, kwargs['DATA_LOCATION'])
            emergency_lookup = {e['id']: e['title'] for e in emergency_items}
        else:
            data_items = items(emergencies, kwargs['DATA_LOCATION'])

        for item in data_items:
            if appeals_mode or cluster_mode:
                emergency_id = item['emergency_id']
                emergency_name = emergency_lookup.get(emergency_id, 'N/A')
                appeal_id = item['id']
                url = '%s/project/appeal/%s%s' % (base, appeal_id, suffix)
            else:
                emergency_id = item['id']
                url = '%s/funding%s?groupby=country&emergency=%s' % (
                    base, suffix, emergency_id)

                emergency_name = item['title']

            record = {
                'emergency_id': emergency_id,
                'emergency_name': emergency_name,
                'countries': _find_countries(item['country'], url),
                'year': item['year'],
            }

            if appeals_mode or cluster_mode:
                record.update({
                    'appeal_id': appeal_id,
                    'appeal_name': item['title'],
                    'funding_type': item['type']})

            if appeals_mode or emergency_mode:
                yield merge(record, _make_requirements(item))
            else:
                url = '%s/cluster/appeal/%s%s' % (base, appeal_id, suffix)
                r = requests.get(url)

                for cluster in r.json():
                    additional = _make_requirements(cluster)
                    additional['cluster'] = cluster['name']
                    yield merge(record, additional)

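# A usage sketch (not part of the original module): gen_data reads BASE_URL,
# SUFFIX and DATA_LOCATION from its keyword arguments, and the first letter of
# `mode` selects appeals ('A'), cluster ('C') or emergency ('E') records. All
# values below are hypothetical placeholders.
def _example_gen_data_usage():
    config = {
        'BASE_URL': 'http://example.com/fts',  # hypothetical API root
        'SUFFIX': '.json',                     # hypothetical response suffix
        'DATA_LOCATION': 'item',               # hypothetical item path
    }

    # iterate over one year of emergency-level records
    for record in gen_data(end_year=2014, mode='E', **config):
        print(record)
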
def init(start, end):
    """Initializes db with historical data"""
    with app.app_context():
        extra = {'models': models, 'end': end, 'start': start}
        kwargs = merge([app.config, extra])
        job = partial(swutils.populate, utils.gen_data, db.engine, **kwargs)
        utils.run_or_schedule(job, False, utils.exception_handler)

def run():
    """Populates all tables in db with most recent data"""
    with app.app_context():
        args = (config.RECIPIENT, app.config.get('LOGFILE'), __title__)
        exception_handler = swutils.ExceptionHandler(*args).handler
        kwargs = merge([app.config, {'models': models}])
        job = partial(swutils.populate, utils.gen_data, db.engine, **kwargs)
        swutils.run_or_schedule(job, app.config['SW'], exception_handler)

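# For context, a self-contained sketch (not from the original code) of the
# partial/job pattern used above: partial freezes the data generator, engine
# and config keywords so the scheduler can later call the job with no
# arguments of its own. All names and values here are hypothetical stand-ins.
from functools import partial


def _example_job_pattern():
    def _populate(gen_data, engine, **kwargs):
        return 'populate %s via %s (limit %s)' % (
            engine, gen_data, kwargs.get('ROW_LIMIT'))

    job = partial(_populate, 'gen_data', 'sqlite:///:memory:', ROW_LIMIT=100)
    return job()  # -> 'populate sqlite:///:memory: via gen_data (limit 100)'
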
def run():
    """Populates all tables in db with most recent data"""
    with app.app_context():
        args = (config.RECIPIENT, app.config.get('LOGFILE'), __title__)
        exception_handler = swutils.ExceptionHandler(*args).handler
        extra = {'mixin': BaseMixin, 'get_name': lambda x: 'ym%s' % x}
        kwargs = merge([app.config, extra])
        job = partial(swutils.populate, utils.gen_data, db.engine, **kwargs)
        swutils.run_or_schedule(job, app.config['SW'], exception_handler)

def run():
    """Populates all tables in db with most recent data"""
    with app.app_context():
        args = (config.RECIPIENT, app.config.get('LOGFILE'), __title__)
        exception_handler = swutils.ExceptionHandler(*args).handler
        extra = {
            'mixin': BaseMixin,
            'get_name': partial(slugify, separator='_'),
            'fetch': utils.fetch}

        kwargs = merge([app.config, extra])
        job = partial(swutils.populate, db.engine, **kwargs)
        job()

def normalize(records, **kwargs):
    """Pivots year-prefixed fields (keys like 'y2014') into separate
    {'year': ..., 'value': ...} records merged onto the remaining base fields.
    """
    first = records.next()
    reconstituted = it.chain([first], records)
    filterfunc = lambda x: x[0].startswith('y')
    base = dict(it.ifilterfalse(filterfunc, first.items()))

    for record in reconstituted:
        values = it.ifilter(filterfunc, record.items())

        for addon in ({'year': v[0][1:], 'value': v[1]} for v in values):
            if not ft.is_null(addon['value'], blanks_as_nulls=True):
                yield pr.merge([base, addon])

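# A self-contained sketch (not from the original module) of the wide-to-long
# pivot that normalize performs, using plain dict operations instead of the
# pr/ft helpers. The sample record and its values are hypothetical.
def _example_normalize_shape():
    record = {'country': 'Haiti', 'y2014': 10, 'y2015': 20}
    base = {k: v for k, v in record.items() if not k.startswith('y')}
    pivoted = [
        dict(base, year=k[1:], value=v)
        for k, v in record.items() if k.startswith('y')]

    # -> [{'country': 'Haiti', 'year': '2014', 'value': 10},
    #     {'country': 'Haiti', 'year': '2015', 'value': 20}]
    return pivoted
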
def run():
    """Populates all tables in db with most recent data"""
    with app.app_context():
        args = (config.RECIPIENT, app.config.get('LOGFILE'), __title__)
        exception_handler = swutils.ExceptionHandler(*args).handler
        extra = {
            'fetch': utils.fetch,
            'get_name': utils.get_name,
            'normalize': utils.normalize,
            'filterer': utils.filterer,
            'parse': utils.parse}

        kwargs = merge([app.config, extra])
        job = partial(swutils.populate, db.engine, models, **kwargs)
        swutils.run_or_schedule(job, app.config['SW'], exception_handler)

def update(pid, **kwargs):
    """Updates a package (aka dataset)"""
    kw = ft.Objectify(kwargs, type='dataset')
    verbose = not kw.quiet
    ckan_kwargs = {k: v for k, v in kwargs.items() if k in api.CKAN_KEYS}
    ckan = CKAN(**ckan_kwargs)
    licenses = it.imap(itemgetter('id'), ckan.license_list())
    groups = ckan.group_list()
    raw_tags = filter(None, kw.tags.split(',')) if kw.tags else []
    tags = [{'state': 'active', 'name': t} for t in raw_tags]

    if kw.start:
        start = parse(str(kw.start)).strftime('%m/%d/%Y')
    else:
        date = None

    if kw.start and kw.end:
        date = '%s-%s' % (start, parse(str(kw.end)).strftime('%m/%d/%Y'))
    elif kw.start:
        date = start

    if kw.location and kw.location in set(groups):
        group_list = [{'name': kw.location}]
    elif kw.location:
        sys.exit('group name: %s not found!' % kw.location)
    else:
        group_list = []

    if kw.license_id and kw.license_id not in set(licenses):
        sys.exit('license id: %s not found!' % kw.license_id)

    package_kwargs = {
        'title': kw.title,
        'name': kw.name,
        'license_id': kw.license_id,
        'dataset_source': kw.source,
        'notes': kw.description or kw.title,
        'type': kw.type,
        'tags': tags,
        'groups': group_list,
        'dataset_date': date,
        'caveats': kw.caveats,
        'methodology': methods.get(kw.methodology, 'Other'),
        'methodology_other': methods.get(kw.methodology) or kw.methodology,
    }

    try:
        old_package = ckan.package_show(id=pid)
    except api.ValidationError as e:
        exit(e)

    if any(package_kwargs.values()):
        # combine keys by returning the last non-empty result
        pred = lambda key: True
        last = lambda pair: filter(None, pair)[-1] if any(pair) else None
        records = [old_package, package_kwargs]
        new_kwargs = pr.merge(records, pred=pred, op=last)

        if verbose:
            print('Submitting your package request.')
            pprint(new_kwargs)
            print('\n')

        package = ckan.package_update(**new_kwargs)
    else:
        package = old_package

    if kw.private:
        org = package['organization']
        ckan.package_privatize(org_id=org['id'], datasets=[package['id']])

    print(package['id'])
    print('\n')

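# A self-contained sketch (not the pr.merge implementation) of the
# "last non-empty value wins" strategy used above: for each key, the value
# from package_kwargs overrides the old package only when it is truthy.
# The sample packages are hypothetical.
def _example_last_non_empty_merge():
    old_package = {'title': 'Old title', 'caveats': 'Use with care'}
    package_kwargs = {'title': 'New title', 'caveats': None}

    merged = {}

    for key in set(old_package) | set(package_kwargs):
        pair = (old_package.get(key), package_kwargs.get(key))
        non_empty = [v for v in pair if v]
        merged[key] = non_empty[-1] if non_empty else None

    # -> {'title': 'New title', 'caveats': 'Use with care'}
    return merged
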
def populate(gen_data, engine, models=None, get_name=None, **kwargs):
    """Populates a SQLAlchemy db with data. Supports both declarative
    SQLAlchemy and Flask-SQLAlchemy.

    Note: Either `TABLES` or `KEY` must be defined.

    Args:
        gen_data (func): A function used to generate the data to be inserted
            into the db. It will receive keywords comprised of combining
            `kwargs` with a table defined in `TABLES`.

        engine (obj): A SQLAlchemy engine.
        models (module): A models module of SQLAlchemy table classes
            (default: None).

        get_name (func): A function used to generate the table name if
            `TABLES` is unset. It will receive the name of each group obtained
            by grouping the data generated from `gen_data` (default: None).

        kwargs (dict): Keyword arguments passed to `gen_data`.

    Kwargs:
        mixin (class): Base table that dynamically created tables inherit
            from. Required if `TABLES` is unset.

        TABLES (list[dicts]): The table options. Required if `KEY` is unset.
        KEY (str): The field used to group data generated from `gen_data`.
            Required if `TABLES` is unset.

        ROW_LIMIT (int): The max total number of rows to process.
        CHUNK_SIZE (int): The max number of rows to process at one time.
        DEBUG (bool): Run in debug mode.
        TESTING (bool): Run in test mode.

    Examples:
        >>> # Test dynamic tables
        >>> from sqlalchemy import create_engine
        >>> class BaseMixin(object):
        ...     id = Column(Integer, primary_key=True)
        ...     value = Column(Integer)
        ...
        >>> meta = MetaData()
        >>> kwargs = {'KEY': 'kind', 'ROW_LIMIT': 4, 'mixin': BaseMixin}
        >>> f = lambda x: {'kind': 'odd' if x % 2 else 'even', 'value': x}
        >>> gen_data = lambda **x: map(f, range(15))
        >>> engine = create_engine('sqlite:///:memory:')
        >>> populate(gen_data, engine, **kwargs)
        >>> session = sessionmaker(engine)()
        >>> meta.reflect(engine)
        >>> tables = meta.sorted_tables
        >>> dict(session.query(tables[0]).all()) == {1: 0, 2: 2, 3: 4, 4: 6}
        True
        >>> dict(session.query(tables[1]).all()) == {1: 1, 2: 3, 3: 5, 4: 7}
        True
        >>> meta.drop_all(engine)
        >>>
        >>> # Test tables without specifying the `rid`
        >>> Base = declarative_base()
        >>> class Single(Base):
        ...     __tablename__ = 'single'
        ...     id = Column(Integer, primary_key=True)
        ...     rid = Column(Integer)
        ...     value = Column(Integer)
        ...
        >>> class Triple(Base):
        ...     __tablename__ = 'triple'
        ...     id = Column(Integer, primary_key=True)
        ...     rid = Column(Integer)
        ...     value = Column(Integer)
        ...
        >>> options = [
        ...     {'mul': 1, 'name': 'single'}, {'mul': 3, 'name': 'triple'}]
        >>> kwargs = {'TABLES': options, 'ROW_LIMIT': 4}
        >>> def gen_data(**x):
        ...     return ({'value': n * x['mul'], 'rid': n} for n in it.count())
        ...
        >>> Base.metadata.create_all(engine)
        >>> populate(gen_data, engine, **kwargs)
        >>> Base.metadata.reflect(engine)
        >>> tables = Base.metadata.sorted_tables
        >>> session.query(tables[0]).all()
        [(1, 0, 0), (2, 1, 1), (3, 2, 2), (4, 3, 3)]
        >>> session.query(tables[1]).all()
        [(1, 0, 0), (2, 1, 3), (3, 2, 6), (4, 3, 9)]
        >>>
        >>> # Test tables with a specified `rid`
        >>> populate(gen_data, engine, rid='rid', **kwargs)
        >>> Base.metadata.reflect(engine)
        >>> tables = Base.metadata.sorted_tables
        >>> session.query(tables[0]).all()
        [(1, 0, 0), (2, 1, 1), (3, 2, 2), (4, 3, 3)]
        >>> session.query(tables[1]).all()
        [(1, 0, 0), (2, 1, 3), (3, 2, 6), (4, 3, 9)]

    Returns:
        str: The message
    """
    log_level = logging.DEBUG if kwargs.get('DEBUG') else logging.INFO
    logger.setLevel(log_level)
    console_handler = logging.StreamHandler()
    logger.addHandler(console_handler)

    test = kwargs.get('TESTING')
    row_limit = kwargs.get('ROW_LIMIT')
    tables = kwargs.get('TABLES')
    chunk_size = min(row_limit or 'inf', kwargs.get('CHUNK_SIZE', row_limit))
    engine.session = sessionmaker(engine)()
    dynamic = not tables

    if test:
        meta.create_all(engine)

    if dynamic:
        data = gen_data(**kwargs)
        tables = get_tables(data, kwargs['KEY'])
        result_func = partial(get_dynamic_res, engine, get_name, **kwargs)
    elif models:
        result_func = partial(res_from_models, models, **kwargs)
    else:
        result_func = partial(res_from_meta, engine, **kwargs)

    for t in tables:
        count = 0
        data = data if dynamic else gen_data(**pr.merge([kwargs, t]))
        result = result_func(t, data=data)
        table, rid, data = result['table'], result['rid'], result['data']
        table.name = table.__table__.name
        table.query = engine.session.query(table)
        del_count = delete_records(table, rid, engine)

        if del_count:
            logger.debug(get_message(del_count, table.name))

        for records in ft.chunk(data, chunk_size):
            del_count, in_count = execute(records, engine, table, rid)
            count += in_count

            if del_count:
                logger.debug(get_message(del_count, table.name))

            logger.debug(get_message(in_count, table.name, False))

            if test:
                pprint(records)

            if row_limit and count >= row_limit:
                break

        logger.debug('Success! %s' % get_message(count, table.name, False))

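# For context, a self-contained sketch (not the ft.chunk implementation) of
# how the chunked insert loop above behaves: data is consumed CHUNK_SIZE rows
# at a time, and the loop stops once ROW_LIMIT rows have been inserted. The
# function name and print stand in for execute()'s insert counting.
def _example_chunked_insert(data, chunk_size=2, row_limit=5):
    count = 0
    data = iter(data)

    while True:
        records = list(it.islice(data, chunk_size))

        if not records:
            break

        count += len(records)
        print('inserted %s records (%s total)' % (len(records), count))

        if row_limit and count >= row_limit:
            break

    return count
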
def populate():
    """Populates db with most recent data"""
    with app.app_context():
        extra = {'mixin': models.BaseMixin, 'get_name': lambda x: 'ym%s' % x}
        kwargs = merge([app.config, extra])
        swutils.populate(utils.gen_data, db.engine, **kwargs)

def test_merge(self):
    expected = {'a': 1, 'b': 10, 'c': 11}
    result = pr.merge([{'a': 1, 'b': 2}, {'b': 10, 'c': 11}])
    nt.assert_equal(expected, result)

    # setup
    records = [{'a': 1, 'b': 2, 'c': 3}, {'b': 4, 'c': 5, 'd': 6}]

    # Combine all keys
    expected = {u'a': 1, u'c': 8, u'b': 6, u'd': 6}
    result = pr.merge(records, pred=bool, op=sum)
    nt.assert_equal(expected, result)

    first = lambda pair: next(filter(partial(is_not, None), pair))
    kwargs = {'pred': bool, 'op': first, 'default': None}
    expected = {u'a': 1, u'b': 2, u'c': 3, u'd': 6}
    result = pr.merge(records, **kwargs)
    nt.assert_equal(expected, result)

    # This will only reliably give the expected result for 2 records
    kwargs = {'pred': bool, 'op': stats.mean, 'default': None}
    expected = {u'a': 1, u'b': 3.0, u'c': 4.0, u'd': 6.0}
    result = pr.merge(records, **kwargs)
    nt.assert_equal(expected, result)

    # Only combine key 'b'
    expected = {u'a': 1, u'b': 6, u'c': 5, u'd': 6}
    result = pr.merge(records, pred='b', op=sum)
    nt.assert_equal(expected, result)

    # Only combine keys that have the same value of 'b'
    expected = {u'a': 1, u'b': 6, u'c': 5, u'd': 6}
    result = pr.merge(records, pred=itemgetter('b'), op=sum)
    nt.assert_equal(expected, result)

    # This will reliably work for any number of records
    counted = defaultdict(int)
    records = [
        {'a': 1, 'b': 4, 'c': 0},
        {'a': 2, 'b': 5, 'c': 2},
        {'a': 3, 'b': 6, 'd': 7}]

    for r in records:
        for k in r.keys():
            counted[k] += 1

    expected = {u'a': 3, u'b': 3, u'c': 2, u'd': 1}
    nt.assert_equal(expected, counted)

    summed = pr.merge(records, pred=bool, op=sum)
    expected = {u'a': 6, u'b': 15, u'c': 2, u'd': 7}
    nt.assert_equal(expected, summed)

    kwargs = {'pred': bool, 'op': ft.fpartial(truediv)}
    expected = {u'a': 2.0, u'b': 5.0, u'c': 1.0, u'd': 7.0}
    result = pr.merge([summed, counted], **kwargs)
    nt.assert_equal(expected, result)

    # This should also reliably work for any number of records
    op = ft.fpartial(ft.sum_and_count)
    kwargs = {'pred': bool, 'op': op, 'default': None}
    merged = pr.merge(records, **kwargs)
    result = {x: truediv(*y) for x, y in merged.items()}
    nt.assert_equal(expected, result)