Example #1
def clean_functions(functions_data):
    file_location = functions_data['file_location']
    data = functions_data['data']

    executor_id = data['fn_to_clean']
    logger.info(f'Going to clean functions from {executor_id}')
    storage_config = data['storage_config']
    storage = Storage(storage_config=storage_config)
    prefix = '/'.join([JOBS_PREFIX, executor_id]) + '/'
    key_list = storage.list_keys(storage.bucket, prefix)
    storage.delete_objects(storage.bucket, key_list)

    if os.path.exists(file_location):
        os.remove(file_location)
    logger.info('Finished')
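A rough usage sketch for the cleaner above, assuming default_config and extract_storage_config are importable from lithops.config (as the test runners below suggest); the executor id and temp-file handling are illustrative only:

import os
import pickle
import tempfile

from lithops.config import default_config, extract_storage_config  # assumed import path

# Build the same kind of storage_config the executor embeds in its cleanup payload.
storage_config = extract_storage_config(default_config())

file_location = os.path.join(tempfile.gettempdir(), 'lithops-cleaner-demo.pkl')
functions_data = {
    'file_location': file_location,      # local temp file, removed once cleanup finishes
    'data': {
        'fn_to_clean': 'a1b2c3-0',       # hypothetical executor id
        'storage_config': storage_config,
    },
}
# The executor normally pickles the payload to file_location for the cleaner daemon.
with open(file_location, 'wb') as f:
    pickle.dump(functions_data['data'], f)

clean_functions(functions_data)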
Example #2
    def _fill_optional_args(self, function, data):
        """
        Fills in those reserved, optional parameters that might be write to the function signature
        """
        func_sig = inspect.signature(function)

        if 'ibm_cos' in func_sig.parameters:
            if 'ibm_cos' in self.lithops_config:
                if self.internal_storage.backend == 'ibm_cos':
                    ibm_boto3_client = self.internal_storage.get_client()
                else:
                    ibm_boto3_client = Storage(
                        lithops_config=self.lithops_config,
                        storage_backend='ibm_cos').get_client()
                data['ibm_cos'] = ibm_boto3_client
            else:
                raise Exception(
                    'Cannot create the ibm_cos client: missing configuration')

        if 'storage' in func_sig.parameters:
            data['storage'] = self.internal_storage.storage

        if 'rabbitmq' in func_sig.parameters:
            if 'rabbitmq' in self.lithops_config:
                rabbit_amqp_url = self.lithops_config['rabbitmq'].get(
                    'amqp_url')
                params = pika.URLParameters(rabbit_amqp_url)
                connection = pika.BlockingConnection(params)
                data['rabbitmq'] = connection
            else:
                raise Exception(
                    'Cannot create the rabbitmq client: missing configuration')

        if 'id' in func_sig.parameters:
            data['id'] = int(self.call_id)
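The method above only injects values for parameters that the user function actually declares. A minimal sketch of a map function opting into two of the reserved names, storage and id; the bucket and object keys are placeholders:

import lithops


def count_bytes(key, storage, id):
    # 'storage' and 'id' are reserved parameter names: _fill_optional_args passes in a
    # ready-to-use Storage client and the integer call id, respectively.
    body = storage.get_object('my-bucket', key)            # placeholder bucket
    return {'call': id, 'key': key, 'size': len(body)}


fexec = lithops.FunctionExecutor()
fexec.map(count_bytes, ['data/a.txt', 'data/b.txt'])        # placeholder object keys
print(fexec.get_result())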
Example #3
def run_tests(test_to_run, config=None, mode=None, backend=None, storage=None):
    global CONFIG, STORAGE_CONFIG, STORAGE

    mode = mode or get_mode(backend, config)
    config_ow = {'lithops': {'mode': mode}}
    if storage:
        config_ow['lithops']['storage'] = storage
    if backend:
        config_ow[mode] = {'backend': backend}
    CONFIG = default_config(config, config_ow)

    STORAGE_CONFIG = extract_storage_config(CONFIG)
    STORAGE = Storage(storage_config=STORAGE_CONFIG)

    suite = unittest.TestSuite()
    if test_to_run == 'all':
        suite.addTest(unittest.makeSuite(TestLithops))
    else:
        try:
            suite.addTest(TestLithops(test_to_run))
        except ValueError:
            print("unknown test, use: --help")
            sys.exit()

    runner = unittest.TextTestRunner()
    runner.run(suite)
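A possible invocation of this runner, assuming a localhost setup; the argument values are illustrative:

# Run the whole suite against the localhost backend and storage.
run_tests('all', mode='localhost', backend='localhost', storage='localhost')

# Or run a single test case by name; unknown names exit with a hint to use --help.
run_tests('test_call_async', backend='localhost', storage='localhost')  # hypothetical test name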
Example #4
def run_tests(tests, config=None, group=None, backend=None, storage=None, fail_fast=False,
              keep_datasets=False):
    global CONFIG, STORAGE_CONFIG, STORAGE

    config_ow = {'lithops': {}}
    if storage:
        config_ow['lithops']['storage'] = storage
    if backend:
        config_ow['lithops']['backend'] = backend

    CONFIG = default_config(config, config_ow)
    STORAGE_CONFIG = extract_storage_config(CONFIG)
    STORAGE = Storage(storage_config=STORAGE_CONFIG)
    init_test_variables()

    suite = unittest.TestSuite()
    config_suite(suite, tests, group)
    words_in_data_set = upload_data_sets()  # uploads datasets and returns word count
    main_util.init_config(CONFIG, STORAGE, STORAGE_CONFIG, words_in_data_set, TEST_FILES_URLS)

    runner = unittest.TextTestRunner(verbosity=2, failfast=fail_fast)
    tests_results = runner.run(suite)

    # removes previously uploaded datasets from storage.
    if not keep_datasets:
        clean_tests(STORAGE, STORAGE_CONFIG, PREFIX)

    if not tests_results.wasSuccessful():  # fail the GitHub workflow action so the merge is rejected
        sys.tracebacklimit = 0  # avoid displaying redundant traceback info
        raise Exception("--------Test procedure failed. Merge rejected--------")
Example #5
    def _load_object(self, data):
        """
        Loads the object in /tmp in case of object processing
        """
        extra_get_args = {}

        if 'url' in data:
            url = data['url']
            logger.info('Getting dataset from {}'.format(url.path))
            if url.data_byte_range is not None:
                range_str = 'bytes={}-{}'.format(*url.data_byte_range)
                extra_get_args['Range'] = range_str
                logger.info('Chunk: {} - Range: {}'.format(
                    url.part, extra_get_args['Range']))
            resp = requests.get(url.path, headers=extra_get_args, stream=True)
            url.data_stream = resp.raw

        if 'obj' in data:
            obj = data['obj']
            logger.info('Getting dataset from {}://{}/{}'.format(
                obj.backend, obj.bucket, obj.key))

            if obj.backend == self.internal_storage.backend:
                storage = self.internal_storage.storage
            else:
                storage = Storage(lithops_config=self.lithops_config,
                                  storage_backend=obj.backend)

            if obj.data_byte_range is not None:
                extra_get_args['Range'] = 'bytes={}-{}'.format(
                    *obj.data_byte_range)
                logger.info('Chunk: {} - Range: {}'.format(
                    obj.part, extra_get_args['Range']))
                sb = storage.get_object(obj.bucket,
                                        obj.key,
                                        stream=True,
                                        extra_get_args=extra_get_args)
                wsb = WrappedStreamingBodyPartition(sb, obj.chunk_size,
                                                    obj.data_byte_range)
                obj.data_stream = wsb
            else:
                sb = storage.get_object(obj.bucket,
                                        obj.key,
                                        stream=True,
                                        extra_get_args=extra_get_args)
                obj.data_stream = sb
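Once _load_object has run, the selected chunk is exposed to the user function through obj.data_stream. A hedged sketch of a map function consuming it, in the style of Lithops data processing; the bucket name in the comment is a placeholder:

def line_count(obj):
    # obj.data_stream wraps either the HTTP response or the storage object,
    # already limited to this partition's byte range.
    data = obj.data_stream.read()
    return len(data.splitlines())

# e.g. fexec.map(line_count, ['cos://my-bucket/text-data/'], obj_chunk_size=16 * 1024 ** 2)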
Example #6
def clean_cloudobjects(cloudobjects_data):
    file_location = cloudobjects_data['file_location']
    data = cloudobjects_data['data']

    logger.info('Going to clean cloudobjects')
    cos_to_clean = data['cos_to_clean']
    storage_config = data['storage_config']
    storage = Storage(storage_config=storage_config)

    for co in cos_to_clean:
        if co.backend == storage.backend:
            logger.info('Cleaning {}://{}/{}'.format(co.backend, co.bucket,
                                                      co.key))
            storage.delete_object(co.bucket, co.key)

    if os.path.exists(file_location):
        os.remove(file_location)
    logger.info('Finished')
Example #7
    def __init__(self, ds_config, db_config, use_db_cache=True, use_ds_cache=True, hybrid_impl='auto'):

        self.config = default_config()
        self.ds_config = ds_config
        self.db_config = db_config
        self.use_db_cache = use_db_cache
        self.use_ds_cache = use_ds_cache
        if hybrid_impl == 'auto':
            self.hybrid_impl = (
                self.config['lithops']['mode'] == 'localhost'
                or self.config['lithops']['mode'] == 'serverless' and 'ibm_vpc' in self.config
            )
            if self.hybrid_impl:
                logger.info('Using the Hybrid implementation')
            else:
                logger.info('Using the pure Serverless implementation')
        else:
            self.hybrid_impl = hybrid_impl

        lithops_bucket = self.config['lithops']['storage_bucket']
        self.ds_bucket = self.config.get('storage', {}).get('ds_bucket', lithops_bucket)

        self.lithops_executor = lithops.FunctionExecutor(config=self.config, runtime_memory=2048)
        if self.hybrid_impl:
            if self.config['lithops']['mode'] == 'localhost':
                self.lithops_vm_executor = self.lithops_executor
            else:
                self.lithops_vm_executor = lithops.StandaloneExecutor(config=self.config)

        self.storage = Storage(config=self.config)

        cache_namespace = 'vm' if self.hybrid_impl else 'function'
        self.cacher = PipelineCacher(
            self.storage, lithops_bucket, cache_namespace, self.ds_config["name"], self.db_config["name"]
        )
        if not self.use_db_cache or not self.use_ds_cache:
            self.cacher.clean(database=not self.use_db_cache, dataset=not self.use_ds_cache)

        stats_path_cache_key = ':ds/:db/stats_path.cache'
        if self.cacher.exists(stats_path_cache_key):
            self.stats_path = self.cacher.load(stats_path_cache_key)
            PipelineStats.path = self.stats_path
            logger.info(f'Using cached {self.stats_path} for statistics')
        else:
            PipelineStats.init()
            self.stats_path = PipelineStats.path
            self.cacher.save(self.stats_path, stats_path_cache_key)
            logger.info(f'Initialised {self.stats_path} for statistics')

        self.ds_segm_size_mb = 128
        self.image_gen_config = {
            "q": 99,
            "do_preprocessing": False,
            "nlevels": 30,
            "ppm": 3.0
        }
Example #8
    def __init__(
        self,
        imzml_file: str,
        ibd_file: str,
        moldb_files: Union[List[int], List[str]],
        ds_config: DSConfig,
        sm_config: Optional[Dict] = None,
        use_cache=True,
        out_dir: Optional[str] = None,
        executor: Optional[Executor] = None,
    ):
        sm_config = sm_config or SMConfig.get_conf()
        self.storage = Storage(config=sm_config['lithops'])
        sm_storage = sm_config['lithops']['sm_storage']

        self.imzml_cobj = _upload_if_needed(imzml_file,
                                            self.storage,
                                            sm_storage,
                                            'imzml',
                                            use_db_mutex=False)
        self.ibd_cobj = _upload_if_needed(ibd_file,
                                          self.storage,
                                          sm_storage,
                                          'imzml',
                                          use_db_mutex=False)
        if isinstance(moldb_files[0], int):
            self.moldb_defs = _upload_moldbs_from_db(moldb_files, self.storage,
                                                     sm_storage)
        else:
            self.moldb_defs = _upload_moldbs_from_files(
                moldb_files, self.storage, sm_storage)
        self.ds_config = ds_config
        self.out_dir = Path(out_dir) if out_dir else Path('./result_pngs')

        if use_cache:
            cache_key: Optional[str] = jsonhash({
                'imzml': imzml_file,
                'ibd': ibd_file,
                'dbs': moldb_files,
                'ds': ds_config
            })
        else:
            cache_key = None

        self.pipe = Pipeline(
            self.imzml_cobj,
            self.ibd_cobj,
            self.moldb_defs,
            self.ds_config,
            executor=executor,
            cache_key=cache_key,
            use_db_cache=use_cache,
            use_db_mutex=False,
            lithops_config=sm_config['lithops'],
        )
Example #9
def batch_copy(src_cobjs: List[CloudObject], dest_bucket: str, dest_prefix: str, *,
               storage: Storage):
    # If Lithops' storage supported a Copy Object operation, this could be easily optimized.
    # Not sure if it's worth the effort yet.
    result_cobjs = []
    for i, data in enumerate(
            iter_cobjects_with_prefetch(storage, src_cobjs)):
        dest_key = f'{dest_prefix}/{i:06}'
        result_cobjs.append(
            storage.put_cloudobject(data, dest_bucket, dest_key))
    return result_cobjs
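A hedged usage sketch for batch_copy with the default Lithops Storage client: the destination bucket and prefix are placeholders, and iter_cobjects_with_prefetch is assumed to come from the same helper module as batch_copy:

from lithops import Storage

storage = Storage()
# A few throwaway cloud objects to copy (placeholder payloads).
src_cobjs = [storage.put_cloudobject(f'payload-{i}'.encode()) for i in range(3)]

copies = batch_copy(src_cobjs, 'my-bucket', 'cache/copies', storage=storage)
print(f'Copied {len(copies)} objects under cache/copies/')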
Example #10
def find_shared_objects(calls):
    # find and annotate repeated arguments
    record = {}
    for i, call in enumerate(calls):
        for j, arg in enumerate(call[1]):
            if id(arg) in record:
                record[id(arg)].append((i, j))
            else:
                record[id(arg)] = [arg, (i, j)]

        for k, v in call[2].items():
            if id(v) in record:
                record[id(v)].append((i, k))
            else:
                record[id(v)] = [v, (i, k)]

    # If we found multiple occurrences of one object, store it once
    # in object storage and pass a CloudObject reference as the value
    calls = [list(item) for item in calls]

    storage = Storage()
    thread_pool = ThreadPoolExecutor(max_workers=len(record))

    def put_arg_obj(positions):
        obj = positions.pop(0)
        if len(positions) > 1 and consider_sharing(obj):
            logger.debug('Proxying {}'.format(type(obj)))
            obj_bin = pickle.dumps(obj)
            cloud_object = storage.put_cloudobject(obj_bin)

            for pos in positions:
                call_n, idx_or_key = pos
                call = calls[call_n]

                if isinstance(idx_or_key, str):
                    call[2][idx_or_key] = cloud_object
                else:
                    args_as_list = list(call[1])
                    args_as_list[idx_or_key] = cloud_object
                    call[1] = tuple(args_as_list)

                try:
                    call[3].append(idx_or_key)
                except IndexError:
                    call.append([idx_or_key])

    fut = []
    for positions in record.values():
        f = thread_pool.submit(put_arg_obj, positions)
        fut.append(f)
    [f.result() for f in fut]

    return [tuple(item) for item in calls]
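consider_sharing is referenced above but not shown; a purely hypothetical size-based heuristic could look like this (the real check may differ):

import sys

SHARE_THRESHOLD = 1024 * 1024  # 1 MiB, hypothetical cut-off


def consider_sharing(obj):
    # Only proxy arguments large enough for the extra storage round trip to pay off.
    if isinstance(obj, (str, bytes, bytearray)):
        return len(obj) >= SHARE_THRESHOLD
    return sys.getsizeof(obj) >= SHARE_THRESHOLD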
Example #11
    def get_arg_obj(idx_or_key):
        if isinstance(idx_or_key, str):
            obj_id = kwargs[idx_or_key]
        else:
            obj_id = args_as_list[idx_or_key]

        if obj_id in cache:
            logger.debug('Get {} (arg {}) from cache'.format(
                obj_id, idx_or_key))
            obj = cache[obj_id]
        else:
            logger.debug('Get {} (arg {}) from storage'.format(
                obj_id, idx_or_key))
            storage = Storage()
            obj_bin = storage.get_cloudobject(obj_id)
            obj = pickle.loads(obj_bin)
            cache[obj_id] = obj

        if isinstance(idx_or_key, str):
            kwargs[idx_or_key] = obj
        else:
            args_as_list[idx_or_key] = obj
Example #12
    def __init__(
        self,
        executor: Executor,
        ds: Dataset,
        perf: Profiler,
        sm_config: Optional[Dict] = None,
        use_cache=False,
        store_images=True,
    ):
        """
        Args
        ========

        use_cache: For development - cache the results after each pipeline step so that it's easier
                   to quickly re-run specific steps.
        """
        sm_config = sm_config or SMConfig.get_conf()
        self.sm_storage = sm_config['lithops']['sm_storage']
        self.storage = Storage(sm_config['lithops'])
        self.s3_client = get_s3_client()
        self.ds = ds
        self.perf = perf
        self.store_images = store_images
        self.db = DB()
        self.es = ESExporter(self.db, sm_config)
        self.imzml_cobj, self.ibd_cobj = _upload_imzmls_from_prefix_if_needed(
            self.ds.input_path, self.storage, self.sm_storage, self.s3_client)
        self.moldb_defs = _upload_moldbs_from_db(
            self.ds.config['database_ids'], self.storage, self.sm_storage)

        if use_cache:
            cache_key: Optional[str] = jsonhash({
                'input_path': ds.input_path,
                'ds': ds.config
            })
        else:
            cache_key = None

        self.pipe = Pipeline(
            self.imzml_cobj,
            self.ibd_cobj,
            self.moldb_defs,
            self.ds.config,
            cache_key=cache_key,
            executor=executor,
        )

        self.results_dfs = None
        self.png_cobjs = None
        self.db_formula_image_ids = None
Example #13
    def _fill_optional_args(self, function, data):
        """
        Fills in those reserved, optional parameters that might be write to the function signature
        """
        func_sig = inspect.signature(function)

        if len(data) == 1 and 'future' in data:
            # Function chaining feature
            out = [
                data.pop('future').result(
                    internal_storage=self.internal_storage)
            ]
            data.update(verify_args(function, out, None)[0])

        if 'ibm_cos' in func_sig.parameters:
            if 'ibm_cos' in self.lithops_config:
                if self.internal_storage.backend == 'ibm_cos':
                    ibm_boto3_client = self.internal_storage.get_client()
                else:
                    ibm_boto3_client = Storage(config=self.lithops_config,
                                               backend='ibm_cos').get_client()
                data['ibm_cos'] = ibm_boto3_client
            else:
                raise Exception(
                    'Cannot create the ibm_cos client: missing configuration')

        if 'storage' in func_sig.parameters:
            data['storage'] = self.internal_storage.storage

        if 'rabbitmq' in func_sig.parameters:
            if 'rabbitmq' in self.lithops_config:
                rabbit_amqp_url = self.lithops_config['rabbitmq'].get(
                    'amqp_url')
                params = pika.URLParameters(rabbit_amqp_url)
                connection = pika.BlockingConnection(params)
                data['rabbitmq'] = connection
            else:
                raise Exception(
                    'Cannot create the rabbitmq client: missing configuration')

        if 'id' in func_sig.parameters:
            data['id'] = int(self.job.call_id)
Example #14
def clean_executor_jobs(executor_id, executor_data):

    storage = None
    prefix = '/'.join([JOBS_PREFIX, executor_id])

    for file_data in executor_data:
        file_location = file_data['file_location']
        data = file_data['data']

        storage_config = data['storage_config']
        clean_cloudobjects = data['clean_cloudobjects']
        if not storage:
            storage = Storage(storage_config=storage_config)

        logger.info(f'Cleaning jobs {", ".join(data["jobs_to_clean"])}')

        objects = storage.list_keys(storage.bucket, prefix)

        objects_to_delete = [
            key for key in objects if '-'.join(
                key.split('/')[1].split('-')[0:3]) in data['jobs_to_clean']
        ]

        while objects_to_delete:
            storage.delete_objects(storage.bucket, objects_to_delete)
            time.sleep(5)
            objects = storage.list_keys(storage.bucket, prefix)
            objects_to_delete = [
                key for key in objects if '-'.join(
                    key.split('/')[1].split('-')[0:3]) in data['jobs_to_clean']
            ]

        if clean_cloudobjects:
            for job_key in data['jobs_to_clean']:
                prefix = '/'.join([TEMP_PREFIX, job_key])
                clean_bucket(storage, storage.bucket, prefix)

        if os.path.exists(file_location):
            os.remove(file_location)
        logger.info('Finished')
Example #15
    def clean_file(file_name):
        file_location = os.path.join(CLEANER_DIR, file_name)

        if file_location in [CLEANER_LOG_FILE, CLEANER_PID_FILE]:
            return

        with open(file_location, 'rb') as pk:
            data = pickle.load(pk)

        if 'jobs_to_clean' in data:
            jobs_to_clean = data['jobs_to_clean']
            storage_config = data['storage_config']
            clean_cloudobjects = data['clean_cloudobjects']
            storage = Storage(storage_config=storage_config)

            for job_key in jobs_to_clean:
                logger.info('Going to clean: {}'.format(job_key))

                prefix = '/'.join([JOBS_PREFIX, job_key])
                clean_bucket(storage, storage.bucket, prefix)

                if clean_cloudobjects:
                    prefix = '/'.join([TEMP_PREFIX, job_key])
                    clean_bucket(storage, storage.bucket, prefix)

        if 'cos_to_clean' in data:
            logger.info('Going to clean cloudobjects')
            cos_to_clean = data['cos_to_clean']
            storage_config = data['storage_config']
            storage = Storage(storage_config=storage_config)

            for co in cos_to_clean:
                if co.backend == storage.backend:
                    logger.info('Cleaning {}://{}/{}'.format(
                        co.backend, co.bucket, co.key))
                    storage.delete_object(co.bucket, co.key)

        if os.path.exists(file_location):
            os.remove(file_location)
Example #16
def load_cobj(storage: Storage, cobj):
    try:
        return deserialize(storage.get_cloudobject(cobj))
    except Exception:
        logger.error(f'Failed to deserialize {cobj}')
        raise
Example #17
def _split_objects_from_object_storage(map_func_args_list, chunk_size,
                                       chunk_number, internal_storage, config):
    """
    Create partitions from a list of buckets or object keys
    """
    if chunk_number:
        logger.debug('Chunk number set to {}'.format(chunk_number))
    elif chunk_size:
        logger.debug('Chunk size set to {}'.format(chunk_size))
    else:
        logger.debug('Chunk size and chunk number not set')

    sbs = set()
    buckets = set()
    prefixes = set()
    obj_names = set()

    for elem in map_func_args_list:
        if type(elem['obj']) == CloudObject:
            elem['obj'] = '{}://{}/{}'.format(elem['obj'].backend,
                                              elem['obj'].bucket,
                                              elem['obj'].key)
        sb, bucket, prefix, obj_name = utils.split_object_url(elem['obj'])
        if sb is None:
            sb = internal_storage.backend
            elem['obj'] = '{}://{}'.format(sb, elem['obj'])
        if obj_name:
            obj_names.add((bucket, prefix))
        elif prefix:
            prefixes.add((bucket, prefix))
        else:
            buckets.add(bucket)
        sbs.add(sb)

    if len(sbs) > 1:
        raise Exception(
            'Processing objects from multiple storage backends is not supported. '
            'Current storage backends: {}'.format(sbs))
    sb = sbs.pop()
    if sb == internal_storage.backend:
        storage = internal_storage.storage
    else:
        storage = Storage(config=config, backend=sb)

    objects = {}

    if obj_names:
        for bucket, prefix in obj_names:
            logger.debug("Listing objects in '{}://{}'".format(
                sb, '/'.join([bucket, prefix])))
            if bucket not in objects:
                objects[bucket] = []
            prefix = prefix + '/' if prefix else prefix
            objects[bucket].extend(storage.list_objects(bucket, prefix))
        logger.debug("Total objects found: {}".format(len(objects[bucket])))

    elif prefixes:
        for bucket, prefix in prefixes:
            logger.debug("Listing objects in '{}://{}'".format(
                sb, '/'.join([bucket, prefix])))
            if bucket not in objects:
                objects[bucket] = []
            prefix = prefix + '/' if prefix else prefix
            objects[bucket].extend(storage.list_objects(bucket, prefix))
        logger.debug("Total objects found: {}".format(len(objects[bucket])))

    elif buckets:
        for bucket in buckets:
            logger.debug("Listing objects in '{}://{}'".format(sb, bucket))
            objects[bucket] = storage.list_objects(bucket)
        logger.debug("Total objects found: {}".format(len(objects[bucket])))

    if all([len(objects[bucket]) == 0 for bucket in objects]):
        raise Exception(
            f'No objects found in bucket: {", ".join(objects.keys())}')

    keys_dict = {}
    for bucket in objects:
        keys_dict[bucket] = {}
        for obj in objects[bucket]:
            keys_dict[bucket][obj['Key']] = obj['Size']

    partitions = []
    parts_per_object = []

    def create_partition(bucket, key, entry):

        if key.endswith('/'):
            logger.debug(
                f'Discarding object "{key}" as it is a prefix folder (0.0B)')
            return

        obj_size = keys_dict[bucket][key]

        if chunk_number:
            chunk_rest = obj_size % chunk_number
            obj_chunk_size = (obj_size // chunk_number) + \
                round((chunk_rest / chunk_number) + 0.5)
        elif chunk_size:
            obj_chunk_size = chunk_size
        else:
            obj_chunk_size = obj_size

        size = total_partitions = 0

        ci = obj_size
        cz = obj_chunk_size
        parts = ci // cz + (ci % cz > 0)
        logger.debug('Creating {} partitions from object {} ({})'.format(
            parts, key, sizeof_fmt(obj_size)))

        while size < obj_size:
            brange = (size, size + obj_chunk_size + CHUNK_THRESHOLD)
            brange = None if obj_size == obj_chunk_size else brange

            partition = entry.copy()
            partition['obj'] = CloudObject(sb, bucket, key)
            partition['obj'].data_byte_range = brange
            partition['obj'].chunk_size = obj_chunk_size
            partition['obj'].part = total_partitions
            partitions.append(partition)

            total_partitions += 1
            size += obj_chunk_size

        parts_per_object.append(total_partitions)

    for entry in map_func_args_list:
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])

        if obj_name:
            # each entry is an object key
            key = '/'.join([prefix, obj_name]) if prefix else obj_name
            create_partition(bucket, key, entry)

        else:
            # each entry is a bucket
            for key in keys_dict[bucket]:
                create_partition(bucket, key, entry)

    return partitions, parts_per_object
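To make the chunking arithmetic in create_partition concrete, a quick stand-alone check of how obj_chunk_size and the partition count come out for one object size (numbers chosen arbitrarily):

obj_size = 100 * 1024 ** 2    # a 100 MiB object
chunk_number = 3

# Same formula as in create_partition: a ceiling-style split into chunk_number parts.
chunk_rest = obj_size % chunk_number
obj_chunk_size = (obj_size // chunk_number) + round((chunk_rest / chunk_number) + 0.5)

parts = obj_size // obj_chunk_size + (obj_size % obj_chunk_size > 0)
print(obj_chunk_size, parts)  # 34952534 bytes per chunk -> 3 partitions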
Example #18
    def _load_object(self, data):
        """
        Loads the object in /tmp in case of object processing
        """
        extra_get_args = {}

        obj = data['obj']

        if hasattr(obj, 'bucket') and not hasattr(obj, 'path'):
            logger.info('Getting dataset from {}://{}/{}'.format(
                obj.backend, obj.bucket, obj.key))

            if obj.backend == self.internal_storage.backend:
                storage = self.internal_storage.storage
            else:
                storage = Storage(config=self.lithops_config,
                                  backend=obj.backend)

            if obj.data_byte_range is not None:
                extra_get_args['Range'] = 'bytes={}-{}'.format(
                    *obj.data_byte_range)
                logger.info('Chunk: {} - Range: {}'.format(
                    obj.part, extra_get_args['Range']))
                sb = storage.get_object(obj.bucket,
                                        obj.key,
                                        stream=True,
                                        extra_get_args=extra_get_args)
                wsb = WrappedStreamingBodyPartition(sb, obj.chunk_size,
                                                    obj.data_byte_range)
                obj.data_stream = wsb
            else:
                sb = storage.get_object(obj.bucket,
                                        obj.key,
                                        stream=True,
                                        extra_get_args=extra_get_args)
                obj.data_stream = sb

        elif hasattr(obj, 'url'):
            logger.info('Getting dataset from {}'.format(obj.url))
            if obj.data_byte_range is not None:
                range_str = 'bytes={}-{}'.format(*obj.data_byte_range)
                extra_get_args['Range'] = range_str
                logger.info('Chunk: {} - Range: {}'.format(
                    obj.part, extra_get_args['Range']))
            resp = requests.get(obj.url, headers=extra_get_args, stream=True)
            obj.data_stream = resp.raw

        elif hasattr(obj, 'path'):
            logger.info('Getting dataset from {}'.format(obj.path))
            with open(obj.path, "rb") as f:
                if obj.data_byte_range is not None:
                    extra_get_args['Range'] = 'bytes={}-{}'.format(
                        *obj.data_byte_range)
                    logger.info('Chunk: {} - Range: {}'.format(
                        obj.part, extra_get_args['Range']))
                    first_byte, last_byte = obj.data_byte_range
                    f.seek(first_byte)
                    buffer = io.BytesIO(f.read(last_byte - first_byte + 1))
                    sb = WrappedStreamingBodyPartition(buffer, obj.chunk_size,
                                                       obj.data_byte_range)
                else:
                    sb = io.BytesIO(f.read())
            obj.data_stream = sb
Example #19
def create_partitions(config, internal_storage, map_iterdata, chunk_size, chunk_number):
    """
    Method that returns the function that will create the partitions of the objects in the Cloud
    """
    logger.debug('Starting partitioner')

    parts_per_object = None

    sbs = set()
    buckets = set()
    prefixes = set()
    obj_names = set()
    urls = set()

    logger.debug("Parsing input data")
    for elem in map_iterdata:
        if 'url' in elem:
            urls.add(elem['url'])
        elif 'obj' in elem:
            if type(elem['obj']) == CloudObject:
                elem['obj'] = '{}://{}/{}'.format(elem['obj'].backend,
                                                  elem['obj'].bucket,
                                                  elem['obj'].key)
            sb, bucket, prefix, obj_name = utils.split_object_url(elem['obj'])
            if sb is None:
                sb = internal_storage.backend
                elem['obj'] = '{}://{}'.format(sb, elem['obj'])
            if obj_name:
                obj_names.add((bucket, prefix))
            elif prefix:
                prefixes.add((bucket, prefix))
            else:
                buckets.add(bucket)
            sbs.add(sb)

    if len(sbs) > 1:
        raise Exception('Processing objects from multiple storage backends at a time is not supported. '
                        'Current storage backends: {}'.format(sbs))

    if sum(bool(src) for src in [prefixes, obj_names, urls, buckets]) > 1:
        raise Exception('You must provide as input data a list of buckets, '
                        'a list of buckets with an object prefix, a list of keys '
                        'or a list of urls. Intermingled types are not allowed.')

    if not urls:
        # process objects from an object store. No url
        sb = sbs.pop()
        if sb == internal_storage.backend:
            storage = internal_storage.storage
        else:
            storage = Storage(config=config, backend=sb)
        objects = {}
        if obj_names:
            for bucket, prefix in obj_names:
                logger.debug("Listing objects in '{}://{}/'"
                             .format(sb, '/'.join([bucket, prefix])))
                if bucket not in objects:
                    objects[bucket] = []
                prefix = prefix + '/' if prefix else prefix
                objects[bucket].extend(storage.list_objects(bucket, prefix))
        elif prefixes:
            for bucket, prefix in prefixes:
                logger.debug("Listing objects in '{}://{}/'"
                             .format(sb, '/'.join([bucket, prefix])))
                if bucket not in objects:
                    objects[bucket] = []
                prefix = prefix + '/' if prefix else prefix
                objects[bucket].extend(storage.list_objects(bucket, prefix))
        elif buckets:
            for bucket in buckets:
                logger.debug("Listing objects in '{}://{}'".format(sb, bucket))
                objects[bucket] = storage.list_objects(bucket)

        keys_dict = {}
        for bucket in objects:
            keys_dict[bucket] = {}
            for obj in objects[bucket]:
                keys_dict[bucket][obj['Key']] = obj['Size']

    if buckets or prefixes:
        partitions, parts_per_object = _split_objects_from_buckets(map_iterdata, keys_dict, chunk_size, chunk_number)

    elif obj_names:
        partitions, parts_per_object = _split_objects_from_keys(map_iterdata, keys_dict, chunk_size, chunk_number)

    elif urls:
        partitions, parts_per_object = _split_objects_from_urls(map_iterdata, chunk_size, chunk_number)

    else:
        raise ValueError('You did not provide any bucket or object key/url')

    return partitions, parts_per_object
Example #20
def save_cobj(storage: Storage, obj: TItem, bucket: str = None, key: str = None) -> CObj[TItem]:
    return storage.put_cloudobject(serialize(obj), bucket, key)
Example #21
def delete_objects_by_prefix(storage: Storage, bucket: str, prefix: str):
    keys = storage.list_keys(bucket, prefix)
    storage.delete_objects(bucket, keys)
    logger.info(f'Removed {len(keys)} objects from {storage.backend}://{bucket}/{prefix}')
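Examples #16, #20 and #21 combine naturally into a small round trip. The serialize/deserialize helpers they rely on are not shown here; assuming they are pickle-style and that the default Lithops bucket is used, a sketch might look like this (the key and prefix are placeholders):

from lithops import Storage

storage = Storage()

# Store a Python object under an explicit key, read it back, then clean up by prefix.
cobj = save_cobj(storage, {'ions': [1, 2, 3]}, key='cache/demo/ions')
restored = load_cobj(storage, cobj)
assert restored == {'ions': [1, 2, 3]}

delete_objects_by_prefix(storage, storage.bucket, 'cache/demo/')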