예제 #1
0
def update_size_if_different(sf):
    """Synchronise sf.size with the actual on-disk size of the staged file.

    Stats the staged path of *sf*, warns if the recorded size differs
    from the real size by more than the allowed relative percentage and,
    whenever they differ at all, updates sf.size in place (committing is
    the caller's responsibility).
    """
    real_size = os.stat(sf.path_staged()).st_size
    # warn when the relative difference exceeds this many percent:
    size_diff_allowed_percent = 1.0
    size_diff = sf.size - real_size
    if real_size > 0:
        size_ratio = sf.size / float(real_size)
        # symmetric relative difference in percent. The original code
        # compared the raw ratio against 1, which warned on *any*
        # over-reported size and never on an under-reported one:
        diff_percent = abs(size_ratio - 1) * 100
    else:
        # avoid ZeroDivisionError on an empty staged file:
        size_ratio = float('inf') if sf.size else 1.0
        diff_percent = float('inf') if sf.size else 0.0
    if diff_percent > size_diff_allowed_percent:
        logger.warning('reported file size %d and actual file size %d '
                       'differs with more than %.1f%%' % \
                       (sf.size, real_size, diff_percent))
    if sf.size != real_size:
        logger.info('updating db file size %d -> %d, (diff = %d bytes, '
                    'ratio = %f)' % \
                    (sf.size, real_size, size_diff, size_ratio))
        sf.size = real_size
예제 #2
0
def stage_file(self, file_name, target_dir, path_out):
    try:
        end_target = os.path.join(target_dir, file_name)
        tmp_target = os.path.join(target_dir, uuid.uuid4().get_hex())
        logger.info('staging %s' % end_target)
        logger.debug('tmp_target: %s' % tmp_target)
        mars_request = create_mars_request(verb='RETRIEVE',
                                           file_name=file_name,
                                           target=tmp_target)
        logger.debug('mars_request: %s' % mars_request)

        with open(path_out, 'w') as f:
            for rc,fd,l in util.exec_proc([ 'mars' ], logger, stdin=str(mars_request)):
                if fd is not None and l is not None:
                    if fd == 1:
                        logger.debug('fd=%s, l=%s' % (fd, l.strip() if l else l))
                    else:
                        logger.warning('fd=%s, l=%s' % (fd, l.strip() if l else l))
                    f.write(l)
                    f.flush()
            f.flush()
            os.fdatasync(f.fileno())
            f.close()
        if rc != 0:
            logger.error('failed to stage %s, rc = %d' % (file_name, rc))
            logger.error('MARS request was: %s' % mars_request)
            logger.debug('removing temp file %s' % tmp_target)
            util.unlink(tmp_target) # FIXME: use try...finally
            raise TaskFailure('mars returned %d' % rc)

        logger.debug('moving temp file %s -> %s' % (tmp_target, end_target))
        os.rename(tmp_target, end_target)
        logger.info('%s is staged online' % end_target)
        logger.debug('=> invoking scheduler')
        tasks.scheduler.join_staging_task.delay(stage_file.request.id)
    except Exception, e:
        logger.error('stage_file(file_name=%s, target_dir=%s, path_out=%s) '
                     'unexpectedly failed (task id %s): %s, retrying in '
                     '60 s...' %
                     (file_name, target_dir, path_out, stage_file.request.id,
                      str(e)))
        raise self.retry(exc=e, countdown=60)
예제 #3
0
def handle_staging_result(staging_task, sf):
    """Join a finished staging task for stagable file *sf* and clean up on failure.

    Blocks on staging_task.get() inside allow_join_result(). On success
    nothing further is done. On failure the task is removed from the db,
    deregistered from sf, and every dispatched request containing sf is
    failed.
    """
    with allow_join_result():
        logger.debug('getting staging result')
        try:
            staging_task.get() # returns None, but more importantly propagates any exception in the task
        except Exception, e:
            logger.warning('%s failed: %s' % (staging_task, e))
            # consider resubmitting the task a limited amount since it
            # should only fail in rare cases (if so then do this by
            # raising a RetryException or similar here)
            logger.debug('deregistering %s from %s' % (sf.staging_task, sf))
            # NOTE(review): presumably staging_task is the same object as
            # sf.staging_task (the log line reads the attribute while
            # delete() gets the parameter) — confirm against callers.
            # Order matters here: delete the task row, clear the
            # reference on sf, then commit before failing the requests.
            tasks.session.delete(staging_task)
            sf.staging_task = None
            assert sf.staging_task is None, sf
            logger.debug('db commit')
            tasks.session.commit()
            for r in dispatched_requests(sf):
                logger.info('request %s failed' % r.uuid)
                fail_request(r, 'Staging of %s failed: %s' % (sf, str(e)))
            return # we deliberately only handle exceptions from the staging task, any other exception will propagate
예제 #4
0
def join_sizing_tasks(files):
    """Block until every size estimation task for *files* has completed.

    For each stagable file, waits on its registered sizing task and
    stores the estimated size (bytes) on sf.size. Raises TaskFailure if
    a file has no registered sizing task (a previous estimation failed
    and was deregistered), and re-raises any exception raised by the
    task itself.
    """
    with allow_join_result():
        for sf in files:
            try:
                # caller only passes files whose size is still unknown:
                assert sf.size is None, sf
                logger.debug('<= awaiting size estimation result for %s, '
                             'task is %s' % (sf, sf.sizing_task))
                if sf.sizing_task is None:
                    # this means it failed before and the task got
                    # unregistered from sf
                    raise TaskFailure('size estimation has failed - a new task '
                                      'will be created on the next request '
                                      'containing this file')
                size = sf.sizing_task.get()
                logger.debug('size estimated to %d bytes' % size)
                assert size >= 0, size
                sf.size = size
            except Exception, e:
                logger.warning('size estimation task %s failed: %s' % \
                               (sf.sizing_task, e))
                # NOTE(review): `raise e` loses the original traceback in
                # python 2; a bare `raise` would preserve it.
                raise e
            finally:  # NOTE(review): finally body is truncated in this view
예제 #5
0
def schedule_tasks(self):
    try:
        dispatchable_requests = get_dispatchable_requests()
        logger.info('running scheduler - %d dispatchable request(s): %s' % \
                    ( dispatchable_requests.count(),
                      ', '.join(r.uuid for r in dispatchable_requests)))
        reserved_files = get_reserved_files()
        online_files = get_online_files()
        reserved_space = sum(sf.size for sf in reserved_files)
        used_space = sum(sf.size for sf in online_files)
        available_space = config.STAGE_SPACE - used_space
        purgable_files = get_purgable_files()
        purgable_amount = sum(sf.size for sf in purgable_files)

        assert len(reserved_files.all()) == len(set(reserved_files)), \
            reserved_files.all()
        assert used_space >= 0, used_space
        assert reserved_space >= 0, (reserved_files, reserved_space)
        logger.debug('reserved files: %s' % ', '.join(sf.name for sf in reserved_files))
        logger.debug('online files: %s' % ', '.join(sf.name for sf in online_files))
        logger.debug('purgable amount: %d bytes from %s' % \
                     (purgable_amount, ', '.join(sf.name for sf in purgable_files)))
        logger.info('total staging space: %d bytes, used: %d bytes, reserved: %d '
                    'bytes. Max %d bytes available for new requests (%d bytes '
                    'purgable)' % \
                    (config.STAGE_SPACE, used_space, reserved_space,
                     config.STAGE_SPACE - reserved_space, purgable_amount))
        dispatch_tasks = True
        num_tasks_dispatched = 0
        num_tasks_failed = 0
        num_tasks_deferred = 0
        consumed_space = 0

        for rs in dispatchable_requests:
            try:
                files_offline_not_being_staged = get_files_offline_not_being_staged(rs)
                logger.info('scheduling %s, available space %d bytes, offline files: %s' % \
                            (rs.uuid, available_space,
                             ', '.join(sf.name for sf in files_offline_not_being_staged)))
                logger.debug('offline files: %s' % \
                             files_offline_not_being_staged.all())
                logger.info('waiting for all size estimation tasks...')
                join_sizing_tasks(files_offline_not_being_staged.\
                                      filter(StagableFile.size == None))
            except Exception, e:
                # Note: all requests will keep on failing until this task has been resubmitted
                logger.warning('sizing estimation of %s failed: %s, request -> '
                               'failed' % (rs.uuid, e))
                fail_request(rs, 'sizing estimation of %s failed: %s' % (rs.uuid, e))
                num_tasks_failed += 1
                continue

            assert all(f.size is not None for f in files_offline_not_being_staged),\
                files_offline_not_being_staged
            offline_size = sum(f.size for f in files_offline_not_being_staged)
            total_size = sum(f.size for f in rs.files)
            logger.info('total request size: %d bytes, %d offline bytes' % \
                        (total_size, offline_size))
            # if staging fails any later request will we must make sure
            # either to fail any later requests for that file

            # fastforward all dispatchable zero cost requests and any
            # requests that is impossible to fulfill:
            #    created -> failed,
            #    created -> dispatched,
            #    created -> finished:
            if total_size > config.STAGE_SPACE:
                logger.info('fast forwarding %s -> failed - since there is no way '
                             'it can be fulfilled (needs %d of %d bytes '
                             'available)' % \
                             (rs.uuid, total_size, config.STAGE_SPACE))
                fail_request(rs, '%s can not be fulfilled (needs %d of %d bytes '
                             'available)' % (rs.uuid, total_size, config.STAGE_SPACE))
                num_tasks_failed += 1
            elif not files_offline_not_being_staged.first():
                if get_files_offline_being_staged(rs).first():
                    logger.info('fast forwarding %s -> dispatched - since it '
                                 'fully overlaps with staging tasks in '
                                 'progress' % rs.uuid)
                    rs.state = 'dispatched'
                    num_tasks_dispatched += 1
                else:
                    # this should not fail unless db is being concurrently
                    # updated:
                    assert all(x.state == 'online' for x in rs.files), rs
                    logger.info('fast forwarding %s -> finished - since all '
                                'files are online' % rs.uuid)
                    finish_request(rs)
                    num_tasks_dispatched += 1
            elif dispatch_tasks:
                if offline_size > available_space:
                    logger.info('%s requires more space than what is available '
                                '(%d > %d), initiating cache purging' % \
                                ( rs.uuid, offline_size, available_space ))
                    required_amount = offline_size - available_space
                    bytes_freed, requests_to_delete = purge_files(required_amount,
                                                                  purgable_files)
                    available_space += bytes_freed
                    logger.info('freed up %d/%d bytes (%d%%) - available space %d '
                                'bytes' % \
                                ( bytes_freed, required_amount,
                                  int(round(bytes_freed / float(offline_size) * 100, 3)),
                                  available_space ))
                    if requests_to_delete:
                        assert all(x.is_deletable for x in requests_to_delete), requests_to_delete
                        logger.info('deleting all deletable requests from db '
                                    'belonging to files that have been '
                                    'purged: %s' % \
                                    ', '.join(x.uuid for x in requests_to_delete))
                        tasks.session.query(DownloadRequest).\
                            filter(DownloadRequest.uuid.in_(x.uuid for x in requests_to_delete)).\
                            delete(synchronize_session='fetch')
                    if offline_size > available_space:
                        logger.info('%d bytes needed but only %d bytes available -'
                                    ' not considering any further requests for task'
                                    ' dispatching until the next request has '
                                    'finished/failed' % \
                                    (offline_size, available_space))
                        dispatch_tasks = False
                        # schedule_tasks will be triggered upon next
                        # request failure or deletion after finish
                        num_tasks_deferred += 1

                # optimisation possibility: we can also dispatch as many
                # tasks as there currently is room for even if the request
                # can not be fully staged yet (but that would also
                # complicate how we compute reserved/available_space)
                if dispatch_tasks:
                    logger.info('staging offline files for request %s: %s' % \
                                (rs.uuid,
                                 ', '.join(sf.name for sf in files_offline_not_being_staged)))
                    for sf in files_offline_not_being_staged:
                        path_out = util.create_tempfile()
                        # we'd like to chain the join_staging_tasks here
                        # rather than calling it from the stage_file task,
                        # but then we cannot register the async result
                        # like this...
                        async_result = tasks.worker.stage_file.delay(sf.name,
                                                                     sf.path,
                                                                     path_out)
                        # note the critical window if worker dies here the
                        # task may have started but will not be registered
                        # with f (the same goes for submitting sizing
                        # tasks). We choose risking more than one
                        # identical staging task running (which is
                        # harmless and has a low probability, though
                        # wastes some resources) rather than risking
                        # having the task registered with the file but not
                        # actually running (which is harder to handle)
                        #
                        # (possibly create a celery task group consisting
                        # of staging -> checksumming):
                        sf.staging_task = Task(async_result.id, path_out)
                        # IMPORTANT: commit immediately since stage_file
                        # calls join_staging_tasks which looks up the task
                        # in the db:
                        tasks.session.commit()
                        logger.info('=> staging task for %s is %s' % \
                                    (sf.name, sf.staging_task.uuid))
                    consumed_space += offline_size
                    available_space -= offline_size
                    rs.state = 'dispatched'
                    num_tasks_dispatched += 1
            else:
                logger.info('request %s is not eliglible for dispatching since it '
                            'can not be fast forwarded and at least one higher '
                            'priority request is still waiting to be dispatched' % \
                            rs.uuid)
                num_tasks_deferred += 1

        logger.info('scheduling iteration completed, %d tasks dispatched, %d '
                    'tasks failed, %d tasks deferred. Consumed space %d bytes, '
                    'total space available %d bytes' % \
                    (num_tasks_dispatched, num_tasks_failed, num_tasks_deferred,
                     consumed_space, available_space))
        tasks.session.commit()