def _create_data_stream(self, data): """ Creates the data stream in case of object processing """ extra_get_args = {} if 'url' in data: url = data['url'] logger.info('Getting dataset from {}'.format(url.path)) if url.data_byte_range is not None: range_str = 'bytes={}-{}'.format(*url.data_byte_range) extra_get_args['Range'] = range_str logger.info('Chunk: {} - Range: {}'.format(url.part, extra_get_args['Range'])) resp = requests.get(url.path, headers=extra_get_args, stream=True) url.data_stream = resp.raw if 'obj' in data: obj = data['obj'] obj.storage_backend storage_handler = Storage(self.pywren_config, obj.storage_backend).get_storage_handler() logger.info('Getting dataset from {}://{}/{}'.format(obj.storage_backend, obj.bucket, obj.key)) if obj.data_byte_range is not None: extra_get_args['Range'] = 'bytes={}-{}'.format(*obj.data_byte_range) logger.info('Chunk: {} - Range: {}'.format(obj.part, extra_get_args['Range'])) sb = storage_handler.get_object(obj.bucket, obj.key, stream=True, extra_get_args=extra_get_args) obj.data_stream = WrappedStreamingBodyPartition(sb, obj.chunk_size, obj.data_byte_range) else: obj.data_stream = storage_handler.get_object(obj.bucket, obj.key, stream=True)
def _fill_optional_args(self, function, data): """ Fills in those reserved, optional parameters that might be write to the function signature """ func_sig = inspect.signature(function) if 'ibm_cos' in func_sig.parameters: if 'ibm_cos' in self.pywren_config: if self.internal_storage.backend == 'ibm_cos': ibm_boto3_client = self.internal_storage.get_client() else: ibm_boto3_client = Storage(self.pywren_config, 'ibm_cos').get_client() data['ibm_cos'] = ibm_boto3_client else: raise Exception('Cannot create the ibm_cos client: missing configuration') if 'storage' in func_sig.parameters: data['storage'] = self.internal_storage.get_client() if 'rabbitmq' in func_sig.parameters: if 'rabbitmq' in self.pywren_config: rabbit_amqp_url = self.pywren_config['rabbitmq'].get('amqp_url') params = pika.URLParameters(rabbit_amqp_url) connection = pika.BlockingConnection(params) data['rabbitmq'] = connection else: raise Exception('Cannot create the rabbitmq client: missing configuration') if 'id' in func_sig.parameters: data['id'] = int(self.call_id)
def _load_object(self, data): """ Loads the object in /tmp in case of object processing """ extra_get_args = {} if 'url' in data: url = data['url'] logger.info('Getting dataset from {}'.format(url.path)) if url.data_byte_range is not None: range_str = 'bytes={}-{}'.format(*url.data_byte_range) extra_get_args['Range'] = range_str logger.info('Chunk: {} - Range: {}'.format( url.part, extra_get_args['Range'])) resp = requests.get(url.path, headers=extra_get_args, stream=True) url.data_stream = resp.raw if 'obj' in data: obj = data['obj'] logger.info('Getting dataset from {}://{}/{}'.format( obj.backend, obj.bucket, obj.key)) if obj.backend == self.internal_storage.backend: storage = self.internal_storage.storage else: storage = Storage(pywren_config=self.pywren_config, storage_backend=obj.backend) if obj.data_byte_range is not None: extra_get_args['Range'] = 'bytes={}-{}'.format( *obj.data_byte_range) logger.info('Chunk: {} - Range: {}'.format( obj.part, extra_get_args['Range'])) sb = storage.get_object(obj.bucket, obj.key, stream=True, extra_get_args=extra_get_args) wsb = WrappedStreamingBodyPartition(sb, obj.chunk_size, obj.data_byte_range) obj.data_stream = wsb else: sb = storage.get_object(obj.bucket, obj.key, stream=True, extra_get_args=extra_get_args) obj.data_stream = sb
def _fill_optional_args(self, function, data): """ Fills in those reserved, optional parameters that might be write to the function signature """ func_sig = inspect.signature(function) if 'ibm_cos' in func_sig.parameters: if 'ibm_cos' in self.pywren_config: try: ibm_boto3_client = Storage(self.storage_config, 'ibm_cos').get_client() data['ibm_cos'] = ibm_boto3_client except Exception as e: logger.error('Cannot create the ibm_cos connection: {}', str(e)) data['ibm_cos'] = None else: logger.error( 'Cannot create the ibm_cos connection: Configuration not provided' ) data['ibm_cos'] = None if 'internal_storage' in func_sig.parameters: data['internal_storage'] = self.internal_storage if 'rabbitmq' in func_sig.parameters: if 'rabbitmq' in self.pywren_config: try: rabbit_amqp_url = self.pywren_config['rabbitmq'].get( 'amqp_url') params = pika.URLParameters(rabbit_amqp_url) connection = pika.BlockingConnection(params) data['rabbitmq'] = connection except Exception as e: logger.error('Cannot create the rabbitmq connection: {}', str(e)) data['rabbitmq'] = None else: logger.error( 'Cannot create the rabbitmq connection: Configuration not provided' ) data['rabbitmq'] = None if 'id' in func_sig.parameters: data['id'] = int(self.call_id)
def create_partitions(pywren_config, map_iterdata, chunk_size, chunk_number):
    """
    Creates the partitions of the objects referenced in the input data.

    :param pywren_config: configuration used to build the storage handler.
    :param map_iterdata: iterable of dicts, each with an 'obj' URL
                         (storage-backend path) or an 'url' (HTTP source).
    :param chunk_size: target partition size, forwarded to the splitters.
    :param chunk_number: target number of partitions, forwarded to the splitters.
    :return: tuple (partitions, parts_per_object).
    :raises Exception: on multiple storage backends or intermingled input types.
    :raises ValueError: when no bucket, key or url is provided.
    """
    logger.debug('Starting partitioner')
    parts_per_object = None

    sbs = set()
    buckets = set()
    prefixes = set()
    obj_names = set()
    urls = set()

    logger.debug("Parsing input data")
    for elem in map_iterdata:
        if 'url' in elem:
            urls.add(elem['url'])
        elif 'obj' in elem:
            sb, bucket, prefix, obj_name = utils.split_object_url(elem['obj'])
            if obj_name:
                obj_names.add((bucket, prefix))
            elif prefix:
                prefixes.add((bucket, prefix))
            else:
                buckets.add(bucket)
            sbs.add(sb)

    if len(sbs) > 1:
        raise Exception(
            'Currently we only support to process one storage backend at a time. '
            'Current storage backends: {}'.format(sbs))

    # Fixed: the original tested [prefixes, obj_names, urls, buckets].count(True),
    # which is always 0 because a set never compares equal to True, so mixed
    # input types were silently accepted. Count the non-empty groups instead.
    # (Also fixed the 'bucktes' typo in the error message.)
    if sum(1 for group in (prefixes, obj_names, urls, buckets) if group) > 1:
        raise Exception(
            'You must provide as an input data a list of buckets, '
            'a list of buckets with object prefix, a list of keys '
            'or a list of urls. Intermingled types are not allowed.')

    if not urls:
        # process objects from an object store. No url
        sb = sbs.pop()
        storage_handler = Storage(pywren_config, sb).get_storage_handler()
        objects = {}
        if obj_names:
            for bucket, prefix in obj_names:
                logger.debug("Listing objects in '{}://{}'".format(
                    sb, '/'.join([bucket, prefix])))
                if bucket not in objects:
                    objects[bucket] = []
                objects[bucket].extend(storage_handler.list_objects(bucket, prefix))
        elif prefixes:
            for bucket, prefix in prefixes:
                logger.debug("Listing objects in '{}://{}'".format(
                    sb, '/'.join([bucket, prefix])))
                if bucket not in objects:
                    objects[bucket] = []
                objects[bucket].extend(storage_handler.list_objects(bucket, prefix))
        elif buckets:
            for bucket in buckets:
                logger.debug("Listing objects in '{}://{}'".format(sb, bucket))
                objects[bucket] = storage_handler.list_objects(bucket)

        # Map bucket -> {key: size} for the splitters
        keys_dict = {}
        for bucket in objects:
            keys_dict[bucket] = {}
            for obj in objects[bucket]:
                keys_dict[bucket][obj['Key']] = obj['Size']

    if buckets or prefixes:
        partitions, parts_per_object = _split_objects_from_buckets(
            map_iterdata, keys_dict, chunk_size, chunk_number)
    elif obj_names:
        partitions, parts_per_object = _split_objects_from_keys(
            map_iterdata, keys_dict, chunk_size, chunk_number)
    elif urls:
        partitions, parts_per_object = _split_objects_from_urls(
            map_iterdata, chunk_size, chunk_number)
    else:
        raise ValueError('You did not provide any bucket or object key/url')

    return partitions, parts_per_object