def setUp(self):
    self.client = mock.Mock()
    self.client.batch_write_item.return_value = {'UnprocessedItems': {}}
    self.table_name = 'tablename'
    self.flush_amount = 2
    self.batch_writer = BatchWriter(self.table_name, self.client,
                                    self.flush_amount)
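# The tests in this suite compare against an assert_batch_write_calls_are()
# helper that is not shown here. A plausible sketch, assuming the mocked
# client from setUp (reconstructed, not copied from the original suite):
def assert_batch_write_calls_are(self, expected_batch_writes):
    self.assertEqual(self.client.batch_write_item.call_count,
                     len(expected_batch_writes))
    # batch_write_item is always invoked with keyword arguments, so the
    # kwargs of each recorded call are the RequestItems payloads.
    batch_write_calls = [
        call[1] for call in self.client.batch_write_item.call_args_list
    ]
    self.assertEqual(batch_write_calls, expected_batch_writes)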
def massive_insert(self):
    sizes = [10, 100, 1000, 10000, 100000, 1000000]
    for size in sizes:
        self.create_table_for_massive_test(size)
        print("time;nb of writers;inserted items")
        table = self.dynamodb.Table('test_earth_input_big_table_%s' % size)
        t0 = time.time()
        # Use the writer as a context manager so the final partial batch
        # is flushed before the elapsed time is measured.
        with BatchWriter(table_name='test_earth_input_big_table_%s' % size,
                         client=table.meta.client,
                         flush_amount=25) as batch:
            for i in range(size):
                item = {'serie_name': 'MYSERIE', 'date': str(i), 'value': i}
                batch.put_item(Item=item)
        total_time = time.time() - t0
        print("%s;%s;%s" % (total_time, self.writers_number, size))
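# create_table_for_massive_test() is referenced above but not shown. A
# plausible sketch, assuming an on-demand table keyed by serie_name
# (partition key) and date (sort key) to match the items the benchmark
# writes; the schema is a guess, not taken from the original code:
def create_table_for_massive_test(self, size):
    table_name = 'test_earth_input_big_table_%s' % size
    table = self.dynamodb.create_table(
        TableName=table_name,
        KeySchema=[
            {'AttributeName': 'serie_name', 'KeyType': 'HASH'},
            {'AttributeName': 'date', 'KeyType': 'RANGE'},
        ],
        AttributeDefinitions=[
            {'AttributeName': 'serie_name', 'AttributeType': 'S'},
            {'AttributeName': 'date', 'AttributeType': 'S'},
        ],
        BillingMode='PAY_PER_REQUEST',
    )
    # Block until the table is ready before the benchmark starts writing.
    table.wait_until_exists()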
def batch_writer(self, overwrite_by_pkeys=None):
    """Create a batch writer object.

    https://boto3.readthedocs.io/en/latest/reference/services/dynamodb.html#DynamoDB.Table.batch_writer

    :type overwrite_by_pkeys: list(string)
    :param overwrite_by_pkeys: De-duplicate request items in the buffer
        when a new request item matches an existing one on the specified
        primary keys, e.g. ``["partition_key1", "sort_key2", "sort_key3"]``
    """
    encrypted_client = EncryptedClient(
        client=self._table.meta.client,
        materials_provider=self._materials_provider,
        attribute_actions=self._attribute_actions,
        auto_refresh_table_indexes=self._auto_refresh_table_indexes,
        expect_standard_dictionaries=True,
    )
    return BatchWriter(
        table_name=self._table.name,
        client=encrypted_client,
        overwrite_by_pkeys=overwrite_by_pkeys,
    )
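# A minimal usage sketch for the batch writer returned above;
# encrypted_table, pkey and skey are placeholder names, not taken from
# the original code:
with encrypted_table.batch_writer(overwrite_by_pkeys=['pkey', 'skey']) as batch:
    # Items are encrypted by the wrapped client before they are written.
    batch.put_item(Item={'pkey': 'foo1', 'skey': 'bar1', 'other': 'value'})
    batch.delete_item(Key={'pkey': 'foo1', 'skey': 'bar2'})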
def test_auto_dedup_for_dup_requests(self):
    with BatchWriter(self.table_name, self.client, flush_amount=5,
                     overwrite_by_pkeys=["pkey", "skey"]) as b:
        # dup 1
        b.put_item(Item={'pkey': 'foo1', 'skey': 'bar1', 'other': 'other1'})
        b.put_item(Item={'pkey': 'foo1', 'skey': 'bar1', 'other': 'other2'})
        # dup 2
        b.delete_item(Key={'pkey': 'foo1', 'skey': 'bar2'})
        b.put_item(Item={'pkey': 'foo1', 'skey': 'bar2', 'other': 'other3'})
        # dup 3
        b.put_item(Item={'pkey': 'foo2', 'skey': 'bar2', 'other': 'other3'})
        b.delete_item(Key={'pkey': 'foo2', 'skey': 'bar2'})
        # dup 4
        b.delete_item(Key={'pkey': 'foo2', 'skey': 'bar3'})
        b.delete_item(Key={'pkey': 'foo2', 'skey': 'bar3'})
        # 5
        b.delete_item(Key={'pkey': 'foo3', 'skey': 'bar3'})
        # 2nd batch
        b.put_item(Item={'pkey': 'foo1', 'skey': 'bar1', 'other': 'other1'})
        b.put_item(Item={'pkey': 'foo1', 'skey': 'bar1', 'other': 'other2'})

    first_batch = {
        'RequestItems': {
            self.table_name: [
                {'PutRequest': {'Item': {
                    'pkey': 'foo1', 'skey': 'bar1', 'other': 'other2'}}},
                {'PutRequest': {'Item': {
                    'pkey': 'foo1', 'skey': 'bar2', 'other': 'other3'}}},
                {'DeleteRequest': {'Key': {'pkey': 'foo2', 'skey': 'bar2'}}},
                {'DeleteRequest': {'Key': {'pkey': 'foo2', 'skey': 'bar3'}}},
                {'DeleteRequest': {'Key': {'pkey': 'foo3', 'skey': 'bar3'}}},
            ]
        }
    }
    second_batch = {
        'RequestItems': {
            self.table_name: [
                {'PutRequest': {'Item': {
                    'pkey': 'foo1', 'skey': 'bar1', 'other': 'other2'}}},
            ]
        }
    }
    self.assert_batch_write_calls_are([first_batch, second_batch])
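# The de-duplication exercised above is driven by overwrite_by_pkeys. A
# rough sketch of that logic, consistent with the expected batches (an
# assumption, not necessarily the exact library source): before a new
# request is buffered, any buffered request with the same primary-key
# values is dropped, so the most recent request for a given key wins.
def _extract_pkey_values(self, request):
    # Pull the primary-key values out of either kind of request.
    if request.get('PutRequest'):
        return [request['PutRequest']['Item'][key]
                for key in self._overwrite_by_pkeys]
    if request.get('DeleteRequest'):
        return [request['DeleteRequest']['Key'][key]
                for key in self._overwrite_by_pkeys]
    return None

def _remove_dup_pkeys_request_if_any(self, request):
    # Drop any buffered request targeting the same primary key as the
    # incoming request before the incoming request is appended.
    pkey_values_new = self._extract_pkey_values(request)
    self._items_buffer = [
        item for item in self._items_buffer
        if self._extract_pkey_values(item) != pkey_values_new
    ]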
def test_repeated_flushing_on_exit(self):
    # We're going to simulate the service returning unprocessed items
    # across multiple batch_write_item calls.
    self.client.batch_write_item.side_effect = [
        {
            'UnprocessedItems': {
                self.table_name: [
                    {'PutRequest': {'Item': {'Hash': 'foo2'}}},
                    {'PutRequest': {'Item': {'Hash': 'foo3'}}},
                ],
            },
        },
        {
            'UnprocessedItems': {
                self.table_name: [
                    {'PutRequest': {'Item': {'Hash': 'foo3'}}},
                ],
            },
        },
        {'UnprocessedItems': {}},
    ]
    with BatchWriter(self.table_name, self.client, flush_amount=4) as b:
        b.put_item(Item={'Hash': 'foo1'})
        b.put_item(Item={'Hash': 'foo2'})
        b.put_item(Item={'Hash': 'foo3'})
    # So when we exit, we expect three calls.
    # First we try the normal batch write with 3 items:
    first_batch = {
        'RequestItems': {
            self.table_name: [
                {'PutRequest': {'Item': {'Hash': 'foo1'}}},
                {'PutRequest': {'Item': {'Hash': 'foo2'}}},
                {'PutRequest': {'Item': {'Hash': 'foo3'}}},
            ]
        }
    }
    # Then we see two unprocessed items so we send another batch.
    second_batch = {
        'RequestItems': {
            self.table_name: [
                {'PutRequest': {'Item': {'Hash': 'foo2'}}},
                {'PutRequest': {'Item': {'Hash': 'foo3'}}},
            ]
        }
    }
    # And then we still see one more unprocessed item so
    # we need to send another batch.
    third_batch = {
        'RequestItems': {
            self.table_name: [
                {'PutRequest': {'Item': {'Hash': 'foo3'}}},
            ]
        }
    }
    self.assert_batch_write_calls_are(
        [first_batch, second_batch, third_batch])
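# Why exiting the context manager produces three calls: a simplified
# sketch (an assumption, not necessarily the exact library source) of
# the exit behaviour is to keep flushing until nothing is left in the
# buffer, including items the service reported as unprocessed.
def __exit__(self, exc_type, exc_value, tb):
    while self._items_buffer:
        self._flush()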
def test_never_send_more_than_max_batch_size(self):
    # Suppose the server reports every item in the first batch as
    # unprocessed, and then one item in the second batch.
    self.client.batch_write_item.side_effect = [
        {
            'UnprocessedItems': {
                self.table_name: [
                    {'PutRequest': {'Item': {'Hash': 'foo1'}}},
                    {'PutRequest': {'Item': {'Hash': 'foo2'}}},
                ],
            },
        },
        {
            'UnprocessedItems': {
                self.table_name: [
                    {'PutRequest': {'Item': {'Hash': 'foo2'}}},
                ],
            },
        },
        {'UnprocessedItems': {}},
    ]
    with BatchWriter(self.table_name, self.client, flush_amount=2) as b:
        b.put_item(Item={'Hash': 'foo1'})
        b.put_item(Item={'Hash': 'foo2'})
        b.put_item(Item={'Hash': 'foo3'})
    # Note how we never send more than flush_amount=2 items per call.
    first_batch = {
        'RequestItems': {
            self.table_name: [
                {'PutRequest': {'Item': {'Hash': 'foo1'}}},
                {'PutRequest': {'Item': {'Hash': 'foo2'}}},
            ]
        }
    }
    # Even though the buffer now holds three items (the two unprocessed
    # items plus foo3), we still send only two at a time, in order.
    second_batch = {
        'RequestItems': {
            self.table_name: [
                {'PutRequest': {'Item': {'Hash': 'foo1'}}},
                {'PutRequest': {'Item': {'Hash': 'foo2'}}},
            ]
        }
    }
    # The final batch contains the item still waiting in the buffer (foo3)
    # followed by the item the second call left unprocessed (foo2).
    third_batch = {
        'RequestItems': {
            self.table_name: [
                {'PutRequest': {'Item': {'Hash': 'foo3'}}},
                {'PutRequest': {'Item': {'Hash': 'foo2'}}},
            ]
        }
    }
    self.assert_batch_write_calls_are(
        [first_batch, second_batch, third_batch])
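# A simplified sketch of the flush logic the two tests above rely on (an
# assumption, not necessarily the exact library source): each flush sends
# at most flush_amount items and re-queues whatever the service reports
# as unprocessed, which is why the third batch above contains the
# still-buffered 'foo3' followed by the unprocessed 'foo2'.
def _flush(self):
    # Send no more than flush_amount items per batch_write_item call.
    items_to_send = self._items_buffer[:self._flush_amount]
    self._items_buffer = self._items_buffer[self._flush_amount:]
    response = self._client.batch_write_item(
        RequestItems={self._table_name: items_to_send})
    unprocessed_items = response['UnprocessedItems']
    if unprocessed_items and unprocessed_items[self._table_name]:
        # Unprocessed items go back on the end of the buffer so they are
        # retried on the next flush.
        self._items_buffer.extend(unprocessed_items[self._table_name])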