def test_insert():
    """Bulk inserts work across default, large, tiny, and zero batch sizes."""
    from kvfile import KVFile

    kv = KVFile()

    # Default batch size.
    kv.insert((str(n), ':{}'.format(n)) for n in range(50000))
    assert len(list(kv.keys())) == 50000
    assert len(list(kv.items())) == 50000
    assert kv.get('49999') == ':49999'

    # Explicit large batch.
    kv.insert(((str(n), ':{}'.format(n)) for n in range(50000, 100000)),
              batch_size=40000)
    assert len(list(kv.items())) == 100000

    # Degenerate batch sizes: one row per batch, and zero (edge case).
    kv.insert(((str(n), ':{}'.format(n)) for n in range(100000, 100002)),
              batch_size=1)
    kv.insert(((str(n), ':{}'.format(n)) for n in range(100002, 100005)),
              batch_size=0)
    assert len(list(kv.items())) == 100005
def _sorter(rows, key_calc, reverse, batch_size):
    """Yield rows sorted by key_calc(row); a zero-padded hex row-number
    suffix keeps the sort stable for equal keys."""
    store = KVFile()
    numbered = (
        (key_calc(row) + "{:08x}".format(idx), row)
        for idx, row in enumerate(rows)
    )
    store.insert(numbered, batch_size=batch_size)
    for _, row in store.items(reverse=reverse):
        yield row
def _sorter(rows, key_calc, reverse, batch_size):
    """Yield rows sorted by key_calc(row), stable via a row-number suffix."""
    db = KVFile()
    # Inline generator expression replaces the nested helper function:
    # each key is the caller's key plus a fixed-width hex row counter so
    # ties preserve input order.
    db.insert(
        ((key_calc(row) + '{:08x}'.format(num), row)
         for num, row in enumerate(rows)),
        batch_size=batch_size)
    yield from (value for _, value in db.items(reverse=reverse))
def test_filename():
    """Data written under an explicit filename is readable after reopening."""
    from kvfile import KVFile, db_kind

    filename = 'bla.filename.' + db_kind + '.db'

    writer = KVFile(filename=filename)
    writer.insert((str(n), ':{}'.format(n)) for n in range(50000))
    del writer  # drop the handle so the file can be reopened

    reader = KVFile(filename=filename)
    assert len(list(reader.keys())) == 50000
    assert len(list(reader.items())) == 50000
    assert reader.get('49999') == ':49999'
def test_insert_generator():
    """insert_generator stores each pair and re-yields it unchanged."""
    from kvfile import KVFile

    kv = KVFile()
    data = [(str(n), ':{}'.format(n)) for n in range(50)]

    seen = [pair for pair in kv.insert_generator(data)]
    assert data == seen

    assert len(list(kv.keys())) == 50
    assert len(list(kv.items())) == 50
    assert kv.get('49') == ':49'
def test_sanity():
    """Round-trip assorted value types and check sorted key/item iteration."""
    from kvfile import KVFile

    kv = KVFile()
    data = dict(
        s='value',
        i=123,
        d=datetime.datetime.fromtimestamp(12325),
        n=decimal.Decimal('1234.56'),
        ss=set(range(10)),
        o=dict(d=decimal.Decimal('1234.58'),
               n=datetime.datetime.fromtimestamp(12325)),
    )

    for key, value in data.items():
        kv.set(key, value)
    for key, value in data.items():
        assert kv.get(key) == value

    # Iteration is ordered by key, both directions.
    assert list(kv.keys()) == sorted(data)
    assert list(kv.items()) == sorted(data.items())
    assert list(kv.keys(reverse=True)) == sorted(data, reverse=True)
    assert list(kv.items(reverse=True)) == sorted(data.items(), reverse=True)
def _get_resource(self, last_update_resource=None):
    """Yield rows fetched from the dataservice, optionally merged with a
    previous snapshot for incremental loading.

    When ``last_update_resource`` is given, its rows are indexed in a KVFile
    by the key fields from the 'incremental-field-key' parameter (falling
    back to the primary key), keeping only the newest row per key according
    to 'incremental-field'; the dataservice is then queried only for records
    changed since the latest seen value, the fresh rows are merged into the
    KVFile, and the merged contents are yielded.  Without it, every fetched
    row is yielded directly.

    NOTE(review): this is a generator method; SOURCE shows it relies on
    self._parameters, self._schema, self._primary_key_field_name,
    self._filter_dataservice_object and self.dataservice_class — assumed to
    be set up by the enclosing class.
    """
    last_kvfile, last_update, key_fields, incremental_field = None, None, None, None
    if last_update_resource is not None:
        # Build a key -> newest-row index of the previous snapshot.
        last_kvfile = KVFile()
        key_fields = self._parameters.get('incremental-field-key', [self._primary_key_field_name])
        incremental_field = self._parameters['incremental-field']
        for row in last_update_resource:
            key = '-'.join([str(row[k]) for k in key_fields])
            try:
                last_row = last_kvfile.get(key)
            except KeyError:
                last_row = None
            # Keep only the row with the greatest incremental-field value per key.
            if not last_row or last_row[incremental_field] < row[
                    incremental_field]:
                last_kvfile.set(key, dict(row))
            # Track the overall maximum as the incremental watermark.
            if not last_update or last_update < row[incremental_field]:
                last_update = row[incremental_field]
        if last_update:
            logging.info('last_update={}'.format(last_update))
    resources_yielded = 0
    with utils.temp_loglevel():
        logging.info(
            "Loading dataservice resource from service {} method {}".
            format(self._parameters["service-name"],
                   self._parameters["method-name"]))
        # with process_metrics('dataservice_collection_row',
        #                      {'service_name': self._parameters['service-name'],
        #                       'method_name': self._parameters['method-name']}) as send_process_metrics:
        if last_update:
            if self._parameters.get('incremental-field-type') == 'integer':
                last_update_str = last_update
            else:
                # Rewind one day to avoid missing same-day updates
                # (date-granularity filter) — presumably; confirm with the
                # dataservice query semantics.
                last_update_str = (
                    last_update - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
            since_last_update = (self._parameters['incremental-field'],
                                 last_update_str,
                                 self._parameters.get(
                                     'incremental-field-type', 'datetime'))
        else:
            since_last_update = None
        for dataservice_object in self.dataservice_class.get_all(
                since_last_update=since_last_update):
            row = self._filter_dataservice_object(dataservice_object)
            # Debug/dev escape hatch: stop early once the env-var limit is hit.
            if os.environ.get(
                    "OVERRIDE_DATASERVICE_COLLECTION_LIMIT_ITEMS", ""):
                if int(
                        os.environ.get(
                            "OVERRIDE_DATASERVICE_COLLECTION_LIMIT_ITEMS",
                            "")) < resources_yielded:
                    return
            # Coerce integer-typed schema fields (service may return strings).
            for k in row:
                for field in self._schema["fields"]:
                    if field["name"] == k:
                        if field["type"] == "integer" and row[
                                k] is not None:
                            row[k] = int(row[k])
            if last_update:
                # Incremental mode: merge into the snapshot; yielded later.
                key = '-'.join([str(row[k]) for k in key_fields])
                last_kvfile.set(key, dict(row))
            else:
                # Full mode: yield directly.
                resources_yielded += 1
                yield row
                # send_process_metrics()
                if resources_yielded > 0 and resources_yielded % 10000 == 0:
                    logging.info("Loaded {} dataservice objects".format(
                        resources_yielded))
        if last_update:
            # Incremental mode: yield the merged snapshot (old + fresh rows).
            for key, row in last_kvfile.items():
                resources_yielded += 1
                yield row
                if resources_yielded % 10000 == 0:
                    logging.info("Loaded {} dataservice objects".format(
                        resources_yielded))