def purge_files(min_size, purgable_files):
    """Free at least `min_size` bytes by taking purgable staged files offline.

    We try to keep the cache as full as possible and defer deleting any
    files until we can fulfil an allocation request completely - since no
    request will start opportunistically anyway.  Also only delete enough
    to ensure `min_size` bytes.

    :param min_size: number of bytes that must be freed for the pending
        allocation; nothing is deleted unless the purgable files can
        cover it completely.
    :param purgable_files: iterable of staged-file objects (each with
        `size`, `state`, `requests`, `path_staged()` and optionally a
        `staging_task`).
    :returns: tuple `(bytes_freed, requests_to_delete)` where
        `requests_to_delete` is the set of requests that referenced the
        purged files.
    """
    bytes_freed = 0
    bytes_freeable = sum(sf.size for sf in purgable_files)
    requests_to_delete = set()
    # all-or-nothing: only start deleting if the request can be satisfied
    if min_size <= bytes_freeable:
        for sf in purgable_files:
            if bytes_freed >= min_size:
                break
            logger.debug('deleting %s' % sf)
            # util.unlink raises OSError on any error except if not existing
            if sf.staging_task:
                # also drop the staging task's log/output file, if any
                util.unlink(sf.staging_task.path_out)
            is_deleted = util.unlink(sf.path_staged())
            if not is_deleted:
                # logger.warn is a deprecated alias of logger.warning
                logger.warning('tried purging staged file %s - but it is already '
                               'gone' % sf)
            sf.state = 'offline'
            bytes_freed += sf.size
            # update() accepts any iterable; no need to wrap in set()
            requests_to_delete.update(sf.requests)
    return bytes_freed, requests_to_delete
def stage_file(self, file_name, target_dir, path_out): try: end_target = os.path.join(target_dir, file_name) tmp_target = os.path.join(target_dir, uuid.uuid4().get_hex()) logger.info('staging %s' % end_target) logger.debug('tmp_target: %s' % tmp_target) mars_request = create_mars_request(verb='RETRIEVE', file_name=file_name, target=tmp_target) logger.debug('mars_request: %s' % mars_request) with open(path_out, 'w') as f: for rc,fd,l in util.exec_proc([ 'mars' ], logger, stdin=str(mars_request)): if fd is not None and l is not None: if fd == 1: logger.debug('fd=%s, l=%s' % (fd, l.strip() if l else l)) else: logger.warning('fd=%s, l=%s' % (fd, l.strip() if l else l)) f.write(l) f.flush() f.flush() os.fdatasync(f.fileno()) f.close() if rc != 0: logger.error('failed to stage %s, rc = %d' % (file_name, rc)) logger.error('MARS request was: %s' % mars_request) logger.debug('removing temp file %s' % tmp_target) util.unlink(tmp_target) # FIXME: use try...finally raise TaskFailure('mars returned %d' % rc) logger.debug('moving temp file %s -> %s' % (tmp_target, end_target)) os.rename(tmp_target, end_target) logger.info('%s is staged online' % end_target) logger.debug('=> invoking scheduler') tasks.scheduler.join_staging_task.delay(stage_file.request.id) except Exception, e: logger.error('stage_file(file_name=%s, target_dir=%s, path_out=%s) ' 'unexpectedly failed (task id %s): %s, retrying in ' '60 s...' % (file_name, target_dir, path_out, stage_file.request.id, str(e))) raise self.retry(exc=e, countdown=60)
sf.staging_task = None assert sf.staging_task is None, sf logger.debug('db commit') tasks.session.commit() for r in dispatched_requests(sf): logger.info('request %s failed' % r.uuid) fail_request(r, 'Staging of %s failed: %s' % (sf, str(e))) return # we deliberately only handle exceptions from the staging task, any other exception will propagate logger.debug('updating file size if necessary') update_size_if_different(sf) logger.info('%s is online' % sf.name) sf.state = 'online' logger.debug('deregistering %s from %s' % (sf.staging_task, sf)) tasks.session.delete(sf.staging_task) util.unlink(sf.staging_task.path_out) sf.staging_task = None assert sf.staging_task is None, sf logger.debug('db commit') tasks.session.commit() # verify that we really need this - but it's important that the query below includes r for r in finishable_requests(sf): logger.info('request %s finished' % r.uuid) finish_request(r) @tasks.cel.task(bind=True, acks_late=True, ignore_results=True) def join_staging_task(self, task_id): try: staging_task = get_task(task_id) logger.debug('staging task for %s is: %s' % (task_id, staging_task)) # BUG: FIXME: why is this a list and not a file, we do use uselist=False