def partition_mapobjects(self, mapobject_type_name, n):
    '''Splits mapobjects into partitions of size `n`.

    Parameters
    ----------
    mapobject_type_name: str
        name of the selected
        :class:`MapobjectType <tmlib.models.mapobject.MapobjectType>`
    n: int
        number of mapobjects per partition

    Returns
    -------
    List[List[int]]
        mapobject IDs

    Note
    ----
    Mapobjects are ordered by ID.
    '''
    with tm.utils.ExperimentSession(self.experiment_id) as session:
        mapobject_type = session.query(tm.MapobjectType.id).\
            filter_by(name=mapobject_type_name).\
            one()
        mapobjects = session.query(tm.Mapobject.id).\
            filter_by(mapobject_type_id=mapobject_type.id).\
            order_by(tm.Mapobject.id).\
            all()
        return create_partitions([m.id for m in mapobjects], n)
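# A minimal usage sketch for `partition_mapobjects`, assuming an object
# `tool` that exposes the method and an existing mapobject type named
# "Cells"; both the type name and the partition size of 1000 are
# illustrative, not part of the API.
def _example_partition_usage(tool):
    for i, mapobject_ids in enumerate(tool.partition_mapobjects('Cells', 1000)):
        # Each partition holds at most 1000 IDs, ordered by mapobject ID,
        # and could be processed as an independent batch job.
        logger.debug('partition #%d: %d mapobjects', i, len(mapobject_ids))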
import math
from itertools import chain
from threading import Thread


def parallelize_query(func, args):
    '''Parallelizes a database query. This can be useful for targeting
    different shards of a distributed table. The number of parallel
    connections depends on the value of
    :attr:`POOL_SIZE <tmlib.models.utils.POOL_SIZE>`.

    Parameters
    ----------
    func: function
        a function that establishes a database connection and executes a
        given SQL query; the function must return a list
    args: Union[list, generator]
        arguments that should be passed to the function

    Returns
    -------
    list
        aggregated output of `func` return values

    Warning
    -------
    Don't use this function for distributed processing on the cluster,
    since this would establish too many simultaneous database connections.
    '''
    logger.debug('execute query in %d parallel threads', POOL_SIZE)
    # Materialize `args`, since it may be a generator and we need len().
    args = list(args)
    # Use ceiling division so that at most POOL_SIZE batches (and thus
    # threads) are created; clamp to 1 to avoid a batch size of zero when
    # there are fewer arguments than POOL_SIZE.
    n = max(1, int(math.ceil(len(args) / float(POOL_SIZE))))
    arg_batches = create_partitions(args, n)
    output = [None] * len(arg_batches)

    def wrapper(func, args, index):
        output[index] = func(args)

    threads = []
    for i, batch in enumerate(arg_batches):
        logger.debug('start thread #%d', i)
        # TODO: use queue or generator?
        t = Thread(target=wrapper, args=(func, batch, i))
        # TODO: use Event
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    return list(chain(*output))
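# A minimal usage sketch for `parallelize_query`, assuming the worker
# function opens its own connection (connections must not be shared across
# threads) and returns a list, so the per-thread results can be
# concatenated. sqlite3 stands in for the real database purely for
# illustration.
def _example_parallelize_usage(values):
    import sqlite3

    def query_batch(batch):
        # Each call runs in its own thread, so create the connection here.
        conn = sqlite3.connect(':memory:')
        try:
            return [
                conn.execute('SELECT ? + 1', (v,)).fetchone()[0]
                for v in batch
            ]
        finally:
            conn.close()

    return parallelize_query(query_batch, values)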
def _delete_cascade(cls, connection, mapobject_ids):
    logger.debug('delete mapobjects')
    # NOTE: Using ANY with an ARRAY is more performant than using IN.
    # TODO: Ideally we would like to join with mapobject_types.
    # However, at the moment there seems to be no way to DELETE entries
    # from a distributed table with a complex WHERE clause.
    # If the number of objects is too large this will lead to issues.
    # Therefore, we delete rows in batches.
    mapobject_id_partitions = create_partitions(mapobject_ids, 100000)
    # This will DELETE all records of referenced tables as well.
    # FIXME: How to cast to correct BigInteger type in $$ escaped query?
    sql = '''
        DELETE FROM mapobjects
        WHERE id = ANY(%(mapobject_ids)s)
    '''
    for mids in mapobject_id_partitions:
        connection.execute(
            _compile_distributed_query(sql),
            {'mapobject_ids': mids}
        )
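# Sketch of the batched-DELETE pattern used above, shown with a plain
# psycopg2 connection instead of the tmlib wrappers; the DSN and table
# contents are illustrative. psycopg2 adapts a Python list parameter to a
# PostgreSQL ARRAY, which is what `id = ANY(...)` expects.
def _example_batched_delete(ids):
    import psycopg2

    conn = psycopg2.connect('dbname=tissuemaps')  # hypothetical DSN
    try:
        with conn.cursor() as cursor:
            # Deleting in fixed-size batches bounds the size of each
            # statement, avoiding issues with very large ID arrays.
            for batch in create_partitions(ids, 100000):
                cursor.execute(
                    'DELETE FROM mapobjects WHERE id = ANY(%(ids)s)',
                    {'ids': batch}
                )
        conn.commit()
    finally:
        conn.close()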
def _create_batches(li, n):
    '''Splits list `li` into batches of size `n`.'''
    return utils.create_partitions(li, n)
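# For reference, a minimal sketch of what `utils.create_partitions` is
# assumed to do: split a sequence into consecutive chunks of at most `n`
# elements (the last chunk may be shorter).
def _create_partitions_sketch(li, n):
    return [li[i:i + n] for i in range(0, len(li), n)]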