import pytest


def test_default():
    from kvfile import KVFile
    kv = KVFile()
    kv.set('aaaa', 5)
    assert kv.get('aaaa') == 5
    # A default suppresses the KeyError for a missing key...
    assert kv.get('bbbb', default=6) == 6
    # ...but without one, a missing key raises KeyError.
    with pytest.raises(KeyError):
        kv.get('bbbb')
from kvfile import KVFile
from sqlalchemy import create_engine
from sqlalchemy.exc import OperationalError, ProgrammingError


def get_all_existing_ids(connection_string, db_table, key_fields, db_status_fields):
    # Select the key and status fields for every existing row and index the
    # status fields by the calculated key, so callers can check which
    # records are already present in the table.
    db_fields = key_fields + db_status_fields
    stmt = ' '.join(['select', ','.join(db_fields), 'from', db_table])
    engine = create_engine(connection_string)
    ret = KVFile()
    try:
        rows = engine.execute(stmt)
        for row in rows:
            rec = dict(zip(db_fields, row))
            existing_id = dict(
                (k, v) for k, v in rec.items() if k in db_status_fields)
            key = calc_key(rec, key_fields)
            ret.set(key, existing_id)
    except (ProgrammingError, OperationalError):
        print('WARNING: Failed to fetch existing keys')
    return ret
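# `calc_key` is used above but not defined in this module; a minimal sketch,
# assuming a record's key is the dash-joined string values of its key fields
# (the same convention the incremental resource loader below uses for its
# KVFile keys). The helper's exact behavior is an assumption:
def calc_key(rec, key_fields):
    # Build a stable lookup key from the record's key-field values.
    return '-'.join(str(rec[k]) for k in key_fields)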
import datetime
import decimal


def test_sanity():
    from kvfile import KVFile
    kv = KVFile()
    # Round-trip a mix of value types, including nested containers.
    data = dict(
        s='value',
        i=123,
        d=datetime.datetime.fromtimestamp(12325),
        n=decimal.Decimal('1234.56'),
        ss=set(range(10)),
        o=dict(d=decimal.Decimal('1234.58'),
               n=datetime.datetime.fromtimestamp(12325))
    )
    for k, v in data.items():
        kv.set(k, v)
    for k, v in data.items():
        assert kv.get(k) == v
    assert sorted(kv.keys()) == sorted(data.keys())
    assert sorted(kv.items()) == sorted(data.items())
import os
import json
import logging
from collections import defaultdict

from dataflows import Flow, load, update_resource, dump_to_path
from kvfile import KVFile
from sqlalchemy import create_engine

import config  # local settings module providing db_settings


def flow(parameters, *_):
    stats = defaultdict(int)
    kv = KVFile()
    last_id = None
    # Stage 1: if a previous dump exists, reload its rows into the KVFile so
    # the DB query below can resume from the last seen id.
    load_from = parameters.get("load_from", parameters.get('dump_to_path'))
    if load_from and os.path.exists(os.path.join(load_from, "datapackage.json")):
        logging.info("Loading from last load_from_db package: "
                     + os.path.join(load_from, "datapackage.json"))
        row = None
        for resource in Flow(
                load(os.path.join(load_from, "datapackage.json"),
                     limit_rows=parameters.get("limit_rows"),
                     resources="db_data")).datastream().res_iter:
            for row in resource:
                stats['loaded from package'] += 1
                last_id = row['__id']
                # Zero-pad the id so the KVFile's key ordering matches the
                # numeric id ordering.
                kv.set("{:0>12}".format(last_id), row)
                if last_id % 10000 == 0:
                    logging.info("Loaded id: %s" % last_id)
        all_data_keys = set(row.keys()) if row else set()
    else:
        all_data_keys = set()
    logging.info('num rows loaded from package: %s' % stats['loaded from package'])
    # Stage 2: stream the remaining rows from the reports table.
    engine = create_engine(
        "postgresql://{username}:{password}@{host}:5432/reports"
        "?sslmode=verify-ca&sslrootcert={sslrootcert}&sslcert={sslcert}&sslkey={sslkey}"
        .format(**config.db_settings))
    engine.update_execution_options(stream_results=True)
    if parameters.get("where"):
        logging.info("Loading from DB, with where clause: " + parameters["where"])
        where = " where " + parameters["where"]
    elif last_id:
        logging.info("Loading from DB, starting at id %s" % last_id)
        where = " where id > %s" % last_id
    else:
        logging.info("Loading all records from DB")
        where = ""
    for id, created, data in engine.execute(
            "select id, created, data from reports%s order by id" % where):
        if parameters.get("filter_db_row_callback"):
            id, created, data = parameters["filter_db_row_callback"](id, created, data)
        if not data or not isinstance(data, dict):
            stats['invalid data'] += 1
            continue
        stats['loaded from db'] += 1
        last_id = id
        row = {
            "__id": id,
            "__created": created,
        }
        for k, v in data.items():
            all_data_keys.add(k)
            row[k] = v
        kv.set("{:0>12}".format(id), row)
        if id % 100000 == 0:
            logging.info("Loaded id: %s" % id)
        if parameters.get("limit_rows") and stats['loaded from db'] > parameters["limit_rows"]:
            break
    logging.info('DB rows with invalid data: %s' % stats['invalid data'])
    logging.info("last_id = %s" % last_id)
    logging.info('num rows loaded from db: %s' % stats['loaded from db'])

    def _yield_from_kv():
        # Emit the merged package + DB rows in key (id) order, serializing
        # each data field to JSON so the output schema can be all strings.
        for _, row in kv.items():
            yield {
                "__id": row["__id"],
                "__created": row["__created"],
                **{k: json.dumps(row.get(k))
                   for k in all_data_keys if k not in ["__id", "__created"]}
            }

    flow_args = [
        _yield_from_kv(),
        update_resource(
            -1, name="db_data", path="db_data.csv",
            schema={
                "fields": [
                    {"name": "__id", "type": "integer"},
                    {"name": "__created", "type": "datetime"},
                    *[{"name": k, "type": "string"}
                      for k in all_data_keys if k not in ["__id", "__created"]]
                ]
            },
            **{"dpp:streaming": True}),
    ]
    if parameters.get("dump_to_path"):
        flow_args += [dump_to_path(parameters['dump_to_path'])]
    return Flow(*flow_args)
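# A minimal, hypothetical invocation of the flow above (the parameter values
# are illustrative assumptions, and a valid `config.db_settings` is required
# for the DB connection to succeed). Subsequent runs reuse the dump at
# `dump_to_path` to resume from the last loaded id:
if __name__ == '__main__':
    flow({"dump_to_path": "data/db_dump", "limit_rows": 1000}).process()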
def _get_resource(self, last_update_resource=None):
    last_kvfile, last_update, key_fields, incremental_field = None, None, None, None
    if last_update_resource is not None:
        # Incremental mode: index the previous run's rows by key, keeping the
        # newest row per key and tracking the overall latest incremental value.
        last_kvfile = KVFile()
        key_fields = self._parameters.get('incremental-field-key',
                                          [self._primary_key_field_name])
        incremental_field = self._parameters['incremental-field']
        for row in last_update_resource:
            key = '-'.join([str(row[k]) for k in key_fields])
            try:
                last_row = last_kvfile.get(key)
            except KeyError:
                last_row = None
            if not last_row or last_row[incremental_field] < row[incremental_field]:
                last_kvfile.set(key, dict(row))
            if not last_update or last_update < row[incremental_field]:
                last_update = row[incremental_field]
        if last_update:
            logging.info('last_update={}'.format(last_update))
    resources_yielded = 0
    with utils.temp_loglevel():
        logging.info("Loading dataservice resource from service {} method {}".format(
            self._parameters["service-name"], self._parameters["method-name"]))
        # with process_metrics('dataservice_collection_row',
        #                      {'service_name': self._parameters['service-name'],
        #                       'method_name': self._parameters['method-name']}) as send_process_metrics:
        if last_update:
            if self._parameters.get('incremental-field-type') == 'integer':
                last_update_str = last_update
            else:
                # Re-fetch from one day before the last update to avoid
                # missing rows at the date boundary.
                last_update_str = (last_update
                                   - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
            since_last_update = (self._parameters['incremental-field'],
                                 last_update_str,
                                 self._parameters.get('incremental-field-type', 'datetime'))
        else:
            since_last_update = None
        for dataservice_object in self.dataservice_class.get_all(
                since_last_update=since_last_update):
            row = self._filter_dataservice_object(dataservice_object)
            limit_items = os.environ.get("OVERRIDE_DATASERVICE_COLLECTION_LIMIT_ITEMS", "")
            if limit_items and int(limit_items) < resources_yielded:
                return
            # Coerce integer-typed schema fields from their raw string values.
            for k in row:
                for field in self._schema["fields"]:
                    if field["name"] == k:
                        if field["type"] == "integer" and row[k] is not None:
                            row[k] = int(row[k])
            if last_update:
                # Incremental mode: merge into the KVFile; rows are yielded
                # together at the end so updates override the previous run.
                key = '-'.join([str(row[k]) for k in key_fields])
                last_kvfile.set(key, dict(row))
            else:
                resources_yielded += 1
                yield row
            # send_process_metrics()
            if resources_yielded > 0 and resources_yielded % 10000 == 0:
                logging.info("Loaded {} dataservice objects".format(resources_yielded))
        if last_update:
            for key, row in last_kvfile.items():
                resources_yielded += 1
                yield row
                if resources_yielded % 10000 == 0:
                    logging.info("Loaded {} dataservice objects".format(resources_yielded))
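# `utils.temp_loglevel` is used above but not defined here; a minimal sketch,
# assuming it is a context manager that temporarily sets the root logger's
# level and restores it afterwards (the name and exact behavior are
# assumptions about that helper):
import contextlib
import logging


@contextlib.contextmanager
def temp_loglevel(level=logging.INFO):
    root = logging.getLogger()
    old_level = root.level
    root.setLevel(level)  # adjust verbosity for the wrapped block
    try:
        yield
    finally:
        root.setLevel(old_level)  # always restore the previous level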