def _patch_file(self, download=True):
    """
    Round-trip patch test: download a dataset from FlatIron, modify it locally,
    patch it back, and verify that Alyx and (optionally) a re-download reflect the change.

    :param download: if True, re-download the patched file and verify its md5
    """
    # fixed test dataset on the test database
    dataset_id = '04abb580-e14b-4716-9ff2-f7b95740b99f'
    dataset = self.one.alyx.rest('datasets', 'read', id=dataset_id)
    # download
    local_file_path = self.one.load(dataset['session'],
                                    dataset_types=dataset['dataset_type'],
                                    download_only=True, clobber=True)[0]
    # change it: bitwise-invert the array so the content (and md5) is guaranteed to differ
    np.save(local_file_path, ~np.load(local_file_path))
    new_check_sum = hashfile.md5(local_file_path)
    # try once with dry (should be a no-op on the server), then for real
    self.patcher.patch_dataset(local_file_path, dset_id=dataset['url'][-36:], dry=True)
    self.patcher.patch_dataset(local_file_path, dset_id=dataset['url'][-36:], dry=False)
    # the dataset hash should have been updated on Alyx
    # (compared as UUIDs to be insensitive to case/formatting of the hex digest)
    dataset = self.one.alyx.rest('datasets', 'read', id=dataset_id)
    self.assertEqual(uuid.UUID(dataset['hash']), uuid.UUID(new_check_sum))
    self.assertEqual(dataset['version'], version.ibllib())
    if download:
        # download again and check the hash matches the locally patched file
        local_file_path.unlink()
        local_file_path = self.one.load(dataset['session'],
                                        dataset_types=dataset['dataset_type'],
                                        download_only=True, clobber=True)[0]
        self.assertEqual(hashfile.md5(local_file_path), new_check_sum)
def register_dataset(self, file_list, created_by='root', server_repository=None, dry=False):
    """
    Registers a set of files belonging to a single session, on the server repository only.

    :param file_list: (str, pathlib.Path or list thereof) files to register; all files
        must belong to the same session
    :param created_by: (str) name of the Alyx user performing the registration
    :param server_repository: (str) name of the data repository in Alyx
    :param dry: (bool) if True, print the REST payload instead of posting it
    :return: Alyx 'register-file' REST response (None when dry)
    """
    if not isinstance(file_list, list):
        file_list = [file_list]
    # bugfix: coerce every entry to Path — previously str elements inside a list were
    # passed through untouched and crashed on the .relative_to()/.stat() calls below
    file_list = [Path(f) for f in file_list]
    # all files must share one session and exist on disk
    assert len(set([alf.io.get_session_path(f) for f in file_list])) == 1
    assert all([f.exists() for f in file_list])
    session_path = alf.io.get_session_path(file_list[0])
    # first register the file; the path sent to Alyx is relative: subject/date/number
    r = {'created_by': created_by,
         'path': str(session_path.relative_to((session_path.parents[2]))),
         'filenames': [str(p.relative_to(session_path)) for p in file_list],
         'name': server_repository,
         'server_only': True,
         'hashes': [md5(p) for p in file_list],
         'filesizes': [p.stat().st_size for p in file_list],
         'versions': [version.ibllib() for _ in file_list]}
    if not dry:
        return self.one.alyx.rest('register-file', 'create', data=r)
    else:
        print(r)
def test_registration_session(self):
    """
    Registers the mock session twice: once as an ephys protocol, once as behaviour
    training, and checks the datasets and session procedures created on Alyx.
    """
    behavior_path = self.session_path.joinpath('raw_behavior_data')
    behavior_path.mkdir()
    settings_file = behavior_path.joinpath('_iblrig_taskSettings.raw.json')
    with open(settings_file, 'w') as fid:
        json.dump(MOCK_SESSION_SETTINGS, fid)
    rc = registration.RegistrationClient(one=one)
    rc.register_session(self.session_path)
    eid = one.search(subjects=SUBJECT, date_range=['2018-04-01', '2018-04-01'])[0]
    # every registered dataset should carry a hash, a file size and the ibllib version
    datasets = one.alyx.rest('datasets', 'list', session=eid)
    for ds in datasets:
        self.assertIsNotNone(ds['hash'])
        self.assertIsNotNone(ds['file_size'])
        self.assertEqual(ds['version'], version.ibllib())
    # checks the procedure of the session
    ses_info = one.alyx.rest('sessions', 'read', id=eid)
    self.assertEqual(ses_info['procedures'], ['Ephys recording with acute probe(s)'])
    one.alyx.rest('sessions', 'delete', id=eid)
    # re-register the session as behaviour this time
    # bugfix: work on a copy of the module-level settings instead of mutating
    # MOCK_SESSION_SETTINGS in place, which leaked state into other tests
    training_settings = dict(MOCK_SESSION_SETTINGS,
                             PYBPOD_PROTOCOL='_iblrig_tasks_trainingChoiceWorld6.3.1')
    with open(settings_file, 'w') as fid:
        json.dump(training_settings, fid)
    rc.register_session(self.session_path)
    eid = one.search(subjects=SUBJECT, date_range=['2018-04-01', '2018-04-01'])[0]
    ses_info = one.alyx.rest('sessions', 'read', id=eid)
    self.assertEqual(ses_info['procedures'], ['Behavior training/tasks'])
def test_single_registration(self):
    """Registers a single file (module-level payload `r`) and checks the dataset record."""
    dataset = one.alyx.rest('register-file', 'create', data=r)
    ds = one.alyx.rest('datasets', 'read', id=dataset[0]['id'])
    # hash / size / version must match the values sent in the payload
    self.assertEqual(ds['hash'], md5_0)
    self.assertEqual(ds['file_size'], 1234)
    self.assertEqual(ds['version'], version.ibllib())
    # NOTE(review): presumably one file record per repository (lab server + flatiron)
    # — confirm against the test database repository setup
    self.assertEqual(len(dataset[0]['file_records']), 2)
    # clean up the created dataset so the test database stays pristine
    one.alyx.rest('datasets', 'delete', id=dataset[0]['id'])
def register_dataset(file_list, one=None, created_by=None, repository=None, server_only=False,
                     versions=None, dry=False, max_md5_size=None):
    """
    Registers a set of files belonging to a session only on the server

    :param file_list: (list of pathlib.Path or pathlib.Path)
    :param one: optional (oneibl.ONE), current one object, will create an instance if not provided
    :param created_by: (string) name of user in Alyx (defaults to the logged-in Alyx user,
     or 'root' if no one instance is available)
    :param repository: optional: (string) name of the repository in Alyx
    :param server_only: optional: (bool) if True only creates on the Flatiron (defaults to False)
    :param versions: optional (list of strings): versions tags (defaults to ibllib version)
    :param dry: (bool) False by default
    :param max_md5_size: (int) maximum file size in bytes to compute md5 sum
     (always computed if None), defaults to None
    :return: Alyx 'register-file' REST response (None when dry or file_list is empty)
    """
    if file_list is None or file_list == '' or file_list == []:
        return
    elif not isinstance(file_list, list):
        file_list = [file_list]
    # bugfix: coerce every entry to Path — str elements inside a list previously crashed
    # on the .stat()/.relative_to() calls below
    file_list = [Path(f) for f in file_list]
    assert len(set([alf.io.get_session_path(f) for f in file_list])) == 1
    assert all([f.exists() for f in file_list])
    # bugfix: instantiate ONE before deriving the default user — the original dereferenced
    # `one._par` ahead of the None check and raised AttributeError when one was not provided
    if one is None and not dry:
        one = ONE()
    if created_by is None:
        created_by = one._par.ALYX_LOGIN if one is not None else 'root'
    if versions is None:
        versions = version.ibllib()
    if isinstance(versions, str):
        versions = [versions for _ in file_list]
    assert isinstance(versions, list) and len(versions) == len(file_list)
    # computing the md5 can be very long, so this is an option to skip if the file is bigger
    # than a certain threshold
    if max_md5_size:
        hashes = [hashfile.md5(p) if p.stat().st_size < max_md5_size else None
                  for p in file_list]
    else:
        hashes = [hashfile.md5(p) for p in file_list]
    session_path = alf.io.get_session_path(file_list[0])
    # first register the file; paths are sent relative to subject/date/number, POSIX-style
    r = {'created_by': created_by,
         'path': session_path.relative_to((session_path.parents[2])).as_posix(),
         'filenames': [p.relative_to(session_path).as_posix() for p in file_list],
         'name': repository,
         'server_only': server_only,
         'hashes': hashes,
         'filesizes': [p.stat().st_size for p in file_list],
         'versions': versions}
    if not dry:
        response = one.alyx.rest('register-file', 'create', data=r)
        for p in file_list:
            _logger.info(f"ALYX REGISTERED DATA: {p}")
        return response
def register_images(self, widths=None, function=None, extra_dict=None):
    """
    Register the plot outputs of this task as snapshot images on Alyx.

    :param widths: image widths forwarded to Snapshot.register_images
    :param function: name recorded in the snapshot json (defaults to this class name)
    :param extra_dict: optional dict merged into each snapshot's json metadata
    :return: response of Snapshot.register_images
    """
    report_tag = '## report ##'
    snapshot = Snapshot(one=self.one, object_id=self.object_id,
                        content_type=self.content_type)
    jsons = []
    texts = []
    for output in self.outputs:
        # one metadata record per output figure
        meta = dict(tag=report_tag,
                    version=version.ibllib(),
                    function=(function or str(self.__class__).split("'")[1]),
                    name=output.stem)
        if extra_dict is not None:
            assert isinstance(extra_dict, dict)
            meta.update(extra_dict)
        jsons.append(meta)
        texts.append(f"{output.stem}")
    return snapshot.register_images(self.outputs, jsons=jsons, texts=texts, widths=widths)
def patch_dataset(self, path, dset_id=None, dry=False):
    """
    Uploads a dataset from an arbitrary location to FlatIron and, on success,
    refreshes the dataset record (hash, file size, version) on Alyx.

    :param path: local path of the file to upload
    :param dset_id: dataset uuid on Alyx
    :param dry: if True, no Alyx update is performed after the transfer
    :return:
    """
    status = self._patch_dataset(path, dset_id=dset_id, dry=dry)
    # only touch the Alyx record after a successful, non-dry upload
    if dry or status != 0:
        return
    payload = {'hash': md5(path),
               'file_size': path.stat().st_size,
               'version': version.ibllib()}
    self.one.alyx.rest('datasets', 'partial_update', id=dset_id, data=payload)
def test_registration_session(self):
    """
    Creates a scratch session folder with mock task settings and two spike datasets,
    registers it through RegistrationClient and checks the dataset records on Alyx.
    """
    # minimal rig settings required by the registration client
    settings = {
        'SESSION_DATE': '2018-04-01',
        'SESSION_DATETIME': '2018-04-01T12:48:26.795526',
        'PYBPOD_CREATOR': ['test_user', 'f092c2d5-c98a-45a1-be7c-df05f129a93c', 'local'],
        'SESSION_NUMBER': '002',
        'SUBJECT_NAME': 'clns0730',
        'PYBPOD_BOARD': '_iblrig_mainenlab_behavior_1',
        'PYBPOD_PROTOCOL': '_iblrig_tasks_ephysChoiceWorld',
        'IBLRIG_VERSION_TAG': '5.4.1',
        'SUBJECT_WEIGHT': 22,
    }
    with tempfile.TemporaryDirectory() as td:
        # creates the local session: subject/yyyy-mm-dd/number layout expected by ALF
        session_path = Path(td).joinpath('clns0730', '2018-04-01', '002')
        alf_path = session_path.joinpath('alf')
        alf_path.mkdir(parents=True)
        # empty placeholder datasets are enough for registration
        alf_path.joinpath('spikes.times.npy').touch()
        alf_path.joinpath('spikes.amps.npy').touch()
        behavior_path = session_path.joinpath('raw_behavior_data')
        behavior_path.mkdir()
        settings_file = behavior_path.joinpath('_iblrig_taskSettings.raw.json')
        with open(settings_file, 'w') as fid:
            json.dump(settings, fid)
        rc = registration.RegistrationClient(one=one)
        rc.register_session(session_path)
        eid = one.search(subjects='clns0730', date_range=['2018-04-01', '2018-04-01'])[0]
        datasets = one.alyx.get('/datasets?subject=clns0730&date=2018-04-01')
        # every dataset record must have a hash, a size and the current ibllib version
        for ds in datasets:
            self.assertTrue(ds['hash'] is not None)
            self.assertTrue(ds['file_size'] is not None)
            self.assertTrue(ds['version'] == version.ibllib())
        # clean up the created session on the test database
        one.alyx.rest('sessions', 'delete', id=eid)
# connection to the test database (credentials masked in this copy)
# NOTE: this rebinds the `one` module name to an ONE instance for the rest of the file
one = one.ONE(base_url='https://test.alyx.internationalbrainlab.org',
              username='******', password='******')
SUBJECT = 'clns0730'
USER = '******'
# alternative local-development configuration, kept for convenience:
# one = one.ONE(base_url='http://localhost:8000')
# SUBJECT = 'CSP013'
# USER = '******'
# NOTE(review): this is not a valid 32-hex md5 digest (contains a dash);
# presumably an intentionally opaque fixture value — confirm the server accepts it
md5_0 = 'add2ab27dbf8428f8140-0870d5080c7f'
# payload for the single-file registration test
r = {'created_by': 'olivier',
     'path': f'{SUBJECT}/2018-08-24/002',
     'filenames': ["raw_behavior_data/_iblrig_encoderTrialInfo.raw.ssv"],
     'hashes': [md5_0],
     'filesizes': [1234],
     'versions': [version.ibllib()]}
# rig task settings used by the session registration tests
MOCK_SESSION_SETTINGS = {
    'SESSION_DATE': '2018-04-01',
    'SESSION_DATETIME': '2018-04-01T12:48:26.795526',
    'PYBPOD_CREATOR': [USER, 'f092c2d5-c98a-45a1-be7c-df05f129a93c', 'local'],
    'SESSION_NUMBER': '002',
    'SUBJECT_NAME': SUBJECT,
    'PYBPOD_BOARD': '_iblrig_mainenlab_behavior_1',
    'PYBPOD_PROTOCOL': '_iblrig_tasks_ephysChoiceWorld',
    'IBLRIG_VERSION_TAG': '5.4.1',
    'SUBJECT_WEIGHT': 22,
}
class Task(abc.ABC): log = "" cpu = 1 gpu = 0 io_charge = 5 # integer percentage priority = 30 # integer percentage, 100 means highest priority ram = 4 # RAM needed to run (Go) one = None # one instance (optional) level = 0 outputs = None time_elapsed_secs = None time_out_secs = None version = version.ibllib() def __init__(self, session_path, parents=None, taskid=None, one=None): self.taskid = taskid self.one = one self.session_path = session_path self.register_kwargs = {} if parents: self.parents = parents else: self.parents = [] @property def name(self): return self.__class__.__name__ def run(self, **kwargs): """ --- do not overload, see _run() below--- wraps the _run() method with - error management - logging to variable """ # if taskid of one properties are not available, local run only without alyx use_alyx = self.one is not None and self.taskid is not None if use_alyx: self.one.alyx.rest('tasks', 'partial_update', id=self.taskid, data={'status': 'Started'}) # setup self.setUp() # Setup the console handler with a StringIO object log_capture_string = io.StringIO() ch = logging.StreamHandler(log_capture_string) str_format = '%(asctime)s,%(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s' ch.setFormatter(logging.Formatter(str_format)) _logger.addHandler(ch) _logger.info(f"Starting job {self.__class__}") # run start_time = time.time() self.status = 0 try: self.outputs = self._run(**kwargs) _logger.info(f"Job {self.__class__} complete") except BaseException: _logger.error(traceback.format_exc()) _logger.info(f"Job {self.__class__} errored") self.status = -1 self.time_elapsed_secs = time.time() - start_time # log the outputs-+ if isinstance(self.outputs, list): nout = len(self.outputs) elif self.outputs is None: nout = 0 else: nout = 1 _logger.info(f"N outputs: {nout}") _logger.info(f"--- {self.time_elapsed_secs} seconds run-time ---") # after the run, capture the log output self.log = log_capture_string.getvalue() log_capture_string.close() 
_logger.removeHandler(ch) # tear down self.tearDown() return self.status def register_datasets(self, one=None, **kwargs): """ Register output datasets form the task to Alyx :param one: :param jobid: :param kwargs: directly passed to the register_dataset function :return: """ assert one if self.outputs: if isinstance(self.outputs, list): versions = [self.version for _ in self.outputs] else: versions = [self.version] return register_dataset(self.outputs, one=one, versions=versions, **kwargs) def rerun(self): self.run(overwrite=True) @abc.abstractmethod def _run(self, overwrite=False): """ This is the method to implement :param overwrite: (bool) if the output already exists, :return: out_files: files to be registered. Could be a list of files (pathlib.Path), a single file (pathlib.Path) an empty list [] or None. Whithin the pipeline, there is a distinction between a job that returns an empty list and a job that returns None. If the function returns None, the job will be labeled as "empty" status in the database, otherwise, the job has an expected behaviour of not returning any dataset. """ def setUp(self): """ Function to optionally overload to check inputs. :return: """ def tearDown(self): """
# expected final Alyx status for each task of the mock pipeline
desired_statuses = {
    'Task00': 'Complete',
    'Task01_void': 'Empty',
    'Task02_error': 'Errored',
    'Task10': 'Complete',
    'Task11': 'Held',
    'TaskIncomplete': 'Incomplete',
    'TaskGpuLock': 'Waiting'
}
# datasets the pipeline run is expected to register
desired_datasets = ['spikes.times.npy', 'spikes.amps.npy', 'spikes.clusters.npy']
# expected version tag per dataset (one task overrides the default ibllib version)
desired_versions = {
    'spikes.times.npy': 'custom_job00',
    'spikes.amps.npy': version.ibllib(),
    'spikes.clusters.npy': version.ibllib()
}
# substring expected in every captured task log
desired_logs = 'Running on machine: testmachine'
# expected number of log sections (RERUN separators + 1) after rerunning each task
desired_logs_rerun = {
    'Task00': 1,
    'Task01_void': 2,
    'Task02_error': 1,
    'Task10': 1,
    'Task11': 1,
    'TaskIncomplete': 1,
    'TaskGpuLock': 2
}
# job to output a single file (pathlib.Path)
def register_session(self, ses_path, file_list=True):
    """
    Register a session and (optionally) its files in Alyx.

    :param ses_path: (str or pathlib.Path) path to the session folder
    :param file_list: bool. Set to False will only create the session and skip
        file registration
    :return: the Alyx session record
    :raises ValueError: when no task settings file can be found under ses_path
    :raises alferr.AlyxSubjectNotFound: when the subject does not exist in Alyx
    """
    if isinstance(ses_path, str):
        ses_path = Path(ses_path)
    # read meta data from the rig for the session from the task settings file
    settings_json_file = list(
        ses_path.glob('**/raw_behavior_data/_iblrig_taskSettings.raw*.json'))
    if not settings_json_file:
        settings_json_file = list(ses_path.glob('**/_iblrig_taskSettings.raw*.json'))
        if not settings_json_file:
            # bugfix: log a plain string (was a one-element list literal)
            _logger.error('could not find _iblrig_taskSettings.raw.json. Abort.')
            raise ValueError(
                f'_iblrig_taskSettings.raw.json not found in {ses_path} Abort.')
        _logger.warning(f'Settings found in a strange place: {settings_json_file}')
        # bugfix: the fallback branch previously left settings_json_file as a list,
        # which was then passed to the json parser below
        settings_json_file = settings_json_file[0]
    else:
        settings_json_file = settings_json_file[0]
    md = _read_settings_json_compatibility_enforced(settings_json_file)
    # query alyx endpoints for subject, error if not found
    try:
        subject = self.one.alyx.rest('subjects?nickname=' + md['SUBJECT_NAME'],
                                     'list', no_cache=True)[0]
    except IndexError:
        _logger.error(f"Subject: {md['SUBJECT_NAME']} doesn't exist in Alyx. ABORT.")
        raise alferr.AlyxSubjectNotFound(md['SUBJECT_NAME'])
    # look for a session from the same subject, same number on the same day
    session_id, session = self.one.search(subject=subject['nickname'],
                                          date_range=md['SESSION_DATE'],
                                          number=md['SESSION_NUMBER'],
                                          details=True, query_type='remote')
    try:
        user = self.one.alyx.rest('users', 'read', id=md["PYBPOD_CREATOR"][0],
                                  no_cache=True)
    except Exception as e:
        _logger.error(f"User: {md['PYBPOD_CREATOR'][0]} doesn't exist in Alyx. ABORT")
        raise e
    username = user['username'] if user else subject['responsible_user']
    # load the trials data to get information about session duration and performance
    ses_data = raw.load_data(ses_path)
    start_time, end_time = _get_session_times(ses_path, md, ses_data)
    n_trials, n_correct_trials = _get_session_performance(md, ses_data)
    # this is the generic relative path: subject/yyyy-mm-dd/NNN
    gen_rel_path = Path(subject['nickname'], md['SESSION_DATE'],
                        '{0:03d}'.format(int(md['SESSION_NUMBER'])))
    # if nothing found create a new session in Alyx
    task_protocol = md['PYBPOD_PROTOCOL'] + md['IBLRIG_VERSION_TAG']
    alyx_procedure = _alyx_procedure_from_task(task_protocol)
    if not session:
        ses_ = {'subject': subject['nickname'],
                'users': [username],
                'location': md['PYBPOD_BOARD'],
                'procedures': [] if alyx_procedure is None else [alyx_procedure],
                'lab': subject['lab'],
                # 'project': project['name'],
                'type': 'Experiment',
                'task_protocol': task_protocol,
                'number': md['SESSION_NUMBER'],
                'start_time': ibllib.time.date2isostr(start_time),
                'end_time': ibllib.time.date2isostr(end_time) if end_time else None,
                'n_correct_trials': n_correct_trials,
                'n_trials': n_trials,
                'json': md,
                }
        session = self.one.alyx.rest('sessions', 'create', data=ses_)
        if md['SUBJECT_WEIGHT']:
            wei_ = {'subject': subject['nickname'],
                    'date_time': ibllib.time.date2isostr(start_time),
                    'weight': md['SUBJECT_WEIGHT'],
                    'user': username}
            self.one.alyx.rest('weighings', 'create', data=wei_)
    else:  # TODO: if session exists and no json partial_upgrade it
        session = self.one.alyx.rest('sessions', 'read', id=session_id[0],
                                     no_cache=True)
    _logger.info(session['url'] + ' ')
    # create associated water administration if not found
    if not session['wateradmin_session_related'] and ses_data:
        wa_ = {'subject': subject['nickname'],
               'date_time': ibllib.time.date2isostr(end_time),
               'water_administered': ses_data[-1]['water_delivered'] / 1000,
               'water_type': md.get('REWARD_TYPE') or 'Water',
               'user': username,
               'session': session['url'][-36:],
               'adlib': False}
        self.one.alyx.rest('water-administrations', 'create', data=wa_)
    # at this point the session has been created. If create only, exit
    if not file_list:
        return session
    # register all files that match the Alyx patterns, warn user when files are encountered
    rename_files_compatibility(ses_path, md['IBLRIG_VERSION_TAG'])
    F = []  # filenames relative to the generic session path
    md5s = []
    file_sizes = []
    for fn in _glob_session(ses_path):
        if fn.suffix in EXCLUDED_EXTENSIONS:
            # bugfix: was _logger.debug('Excluded: ', str(fn)) — a two-positional-arg
            # logging call with no format placeholder, which corrupts the log record
            _logger.debug('Excluded: ' + str(fn))
            continue
        if not _check_filename_for_registration(fn, self.registration_patterns):
            _logger.warning('No matching dataset type for: ' + str(fn))
            continue
        if fn.suffix not in self.file_extensions:
            _logger.warning('No matching dataformat (ie. file extension) for: ' + str(fn))
            continue
        if not _register_bool(fn.name, file_list):
            _logger.debug('Not in filelist: ' + str(fn))
            continue
        try:
            assert (str(gen_rel_path) in str(fn))
        except AssertionError as e:
            strerr = 'ALF folder mismatch: data is in wrong subject/date/number folder. \n'
            strerr += ' Expected ' + str(gen_rel_path) + ' actual was ' + str(fn)
            _logger.error(strerr)
            raise e
        # extract the relative path of the file
        rel_path = Path(str(fn)[str(fn).find(str(gen_rel_path)):])
        F.append(str(rel_path.relative_to(gen_rel_path).as_posix()))
        file_sizes.append(fn.stat().st_size)
        # only hash files below 1 GB to keep registration fast
        md5s.append(hashfile.md5(fn) if fn.stat().st_size < 1024 ** 3 else None)
        _logger.info('Registering ' + str(fn))
    r_ = {'created_by': username,
          'path': str(gen_rel_path.as_posix()),
          'filenames': F,
          'hashes': md5s,
          'filesizes': file_sizes,
          'versions': [version.ibllib() for _ in F]}
    self.one.alyx.post('/register-file', data=r_)
    return session
class Task(abc.ABC):
    """
    Base class for pipeline tasks: wraps a concrete `_run()` implementation with
    Alyx status updates, per-run log capture, GPU lock-file handling and data-handler
    based input/output management.
    """
    log = ""  # place holder to keep the log of the task for registration
    cpu = 1  # CPU resource
    gpu = 0  # GPU resources: as of now, either 0 or 1
    io_charge = 5  # integer percentage
    priority = 30  # integer percentage, 100 means highest priority
    ram = 4  # RAM needed to run (Go)
    one = None  # one instance (optional)
    level = 0  # level in the pipeline hierarchy: level 0 means there is no parent task
    outputs = None  # place holder for a list of Path containing output files
    time_elapsed_secs = None  # wall-clock duration of the last run()
    time_out_secs = 3600 * 2  # time-out after which a task is considered dead
    version = version.ibllib()  # version recorded with registered outputs
    signature = {
        'input_files': [],
        'output_files': []
    }  # list of tuples (filename, collection, required_flag)
    force = False  # whether or not to re-download missing input files on local server if not present

    def __init__(self, session_path, parents=None, taskid=None, one=None,
                 machine=None, clobber=True, location='server'):
        """
        Base task class
        :param session_path: session path
        :param parents: parent tasks this task depends on
        :param taskid: alyx task id
        :param one: one instance
        :param machine: name of the machine running the task (logged only)
        :param clobber: whether or not to overwrite log on rerun
        :param location: location where task is run. Options are 'server' (lab local
            servers), 'remote' (remote compute node, data required for task downloaded
            via one), 'AWS' (remote compute node, data required for task downloaded via
            AWS), or 'SDSC' (SDSC flatiron compute node)
            # TODO 'Globus' (remote compute node, data required for task downloaded via Globus)
        """
        self.taskid = taskid
        self.one = one
        self.session_path = session_path
        self.register_kwargs = {}
        if parents:
            self.parents = parents
        else:
            self.parents = []
        self.machine = machine
        self.clobber = clobber
        self.location = location
        self.plot_tasks = []  # plotting task/tasks to create plot outputs during the task

    @property
    def name(self):
        # task name defaults to the concrete class name
        return self.__class__.__name__

    def run(self, **kwargs):
        """
        --- do not overload, see _run() below---
        wraps the _run() method with
        - error management
        - logging to variable
        - writing a lock file if the GPU is used
        - labels the status property of the object. The status value is labeled as:
        0: Complete
        -1: Errored
        -2: Didn't run as a lock was encountered
        -3: Incomplete
        """
        # if taskid of one properties are not available, local run only without alyx
        use_alyx = self.one is not None and self.taskid is not None
        if use_alyx:
            # check that alyx user is logged in
            if not self.one.alyx.is_logged_in:
                self.one.alyx.authenticate()
            tdict = self.one.alyx.rest('tasks', 'partial_update', id=self.taskid,
                                       data={'status': 'Started'})
            # keep any previous log and append a RERUN separator (clobber decides later
            # whether the new capture replaces or extends this)
            self.log = ('' if not tdict['log'] else tdict['log'] +
                        '\n\n=============================RERUN=============================\n')
        # Setup the console handler with a StringIO object so the run's log can be
        # captured into self.log; the original logger level is restored at the end
        logger_level = _logger.level
        log_capture_string = io.StringIO()
        ch = logging.StreamHandler(log_capture_string)
        str_format = '%(asctime)s,%(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s'
        ch.setFormatter(logging.Formatter(str_format))
        _logger.addHandler(ch)
        _logger.setLevel(logging.INFO)
        _logger.info(f"Starting job {self.__class__}")
        if self.machine:
            _logger.info(f"Running on machine: {self.machine}")
        _logger.info(f"running ibllib version {version.ibllib()}")
        # setup
        start_time = time.time()
        try:
            setup = self.setUp(**kwargs)
            _logger.info(f"Setup value is: {setup}")
            self.status = 0
            if not setup:
                # case where outputs are present but don't have input files locally to rerun task
                # label task as complete
                _, self.outputs = self.assert_expected_outputs()
            else:
                # run task
                if self.gpu >= 1:
                    # GPU tasks are serialized through a lock file; bail out with -2
                    # if another GPU task currently holds the lock
                    if not self._creates_lock():
                        self.status = -2
                        _logger.info(f"Job {self.__class__} exited as a lock was found")
                        new_log = log_capture_string.getvalue()
                        self.log = new_log if self.clobber else self.log + new_log
                        log_capture_string.close()
                        _logger.removeHandler(ch)
                        return self.status
                self.outputs = self._run(**kwargs)
                _logger.info(f"Job {self.__class__} complete")
        except Exception:
            _logger.error(traceback.format_exc())
            _logger.info(f"Job {self.__class__} errored")
            self.status = -1
        self.time_elapsed_secs = time.time() - start_time
        # log the outputs
        if isinstance(self.outputs, list):
            nout = len(self.outputs)
        elif self.outputs is None:
            nout = 0
        else:
            nout = 1
        _logger.info(f"N outputs: {nout}")
        _logger.info(f"--- {self.time_elapsed_secs} seconds run-time ---")
        # after the run, capture the log output, amend to any existing logs if not overwrite
        new_log = log_capture_string.getvalue()
        self.log = new_log if self.clobber else self.log + new_log
        log_capture_string.close()
        _logger.removeHandler(ch)
        _logger.setLevel(logger_level)
        # tear down
        self.tearDown()
        return self.status

    def register_datasets(self, one=None, **kwargs):
        """
        Register output datasets from the task to Alyx via the task's data handler
        (also registers any plot snapshots first).
        :param one: unused here, kept for call-compatibility with older pipelines
        :param kwargs: directly passed to the data handler's uploadData
        :return: response of data_handler.uploadData
        """
        _ = self.register_images()
        return self.data_handler.uploadData(self.outputs, self.version, **kwargs)

    def register_images(self, **kwargs):
        """
        Registers images to alyx database for every attached plotting task;
        failures are logged and skipped so one bad plot does not block registration.
        :return:
        """
        if self.one and len(self.plot_tasks) > 0:
            for plot_task in self.plot_tasks:
                try:
                    _ = plot_task.register_images(widths=['orig'])
                except Exception:
                    _logger.error(traceback.format_exc())
                    continue

    def rerun(self):
        # convenience: re-run the task overwriting any existing outputs
        self.run(overwrite=True)

    def get_signatures(self, **kwargs):
        """
        This is the default but should be overwritten for each task
        :return:
        """
        self.input_files = self.signature['input_files']
        self.output_files = self.signature['output_files']

    @abc.abstractmethod
    def _run(self, overwrite=False):
        """
        This is the method to implement
        :param overwrite: (bool) if the output already exists,
        :return: out_files: files to be registered. Could be a list of files (pathlib.Path),
         a single file (pathlib.Path) an empty list [] or None.
        Within the pipeline, there is a distinction between a job that returns an
        empty list and a job that returns None. If the function returns None, the job
        will be labeled as "empty" status in the database, otherwise, the job has an
        expected behaviour of not returning any dataset.
        """

    def setUp(self, **kwargs):
        """
        Setup method to get the data handler and ensure all data is available locally
        to run the task.
        :param kwargs: forwarded to get_signatures
        :return: True to run the task, False to only validate existing outputs
        """
        if self.location == 'server':
            self.get_signatures(**kwargs)
            input_status, _ = self.assert_expected_inputs(raise_error=False)
            output_status, _ = self.assert_expected(self.output_files, silent=True)
            if input_status:
                self.data_handler = self.get_data_handler()
                _logger.info('All input files found: running task')
                return True
            if not self.force:
                self.data_handler = self.get_data_handler()
                _logger.warning(
                    'Not all input files found locally: will still attempt to rerun task')
                # TODO in the future once we are sure that input output task signatures work properly should return False
                # _logger.info('All output files found but input files required not available locally: task not rerun')
                return True
            else:
                # Attempts to download missing data using globus
                _logger.info(
                    'Not all input files found locally: attempting to re-download required files')
                self.data_handler = self.get_data_handler(location='serverglobus')
                self.data_handler.setUp()
                # Double check we now have the required files to run the task
                # TODO in future should raise error if even after downloading don't have the correct files
                self.assert_expected_inputs(raise_error=False)
                return True
        else:
            # remote/AWS/SDSC locations: the handler is responsible for staging inputs
            self.data_handler = self.get_data_handler()
            self.data_handler.setUp()
            self.get_signatures(**kwargs)
            self.assert_expected_inputs()
            return True

    def tearDown(self):
        """
        Function after runs()
        Does not run if a lock is encountered by the task (status -2)
        """
        # release the GPU lock held by this task, if any
        if self.gpu >= 1:
            if self._lock_file_path().exists():
                self._lock_file_path().unlink()

    def cleanUp(self):
        """
        Function to optionally overload to clean up
        :return:
        """
        self.data_handler.cleanUp()

    def assert_expected_outputs(self, raise_error=True):
        """
        After a run, asserts that all signature files are present at least once in the
        output files. Mainly useful for integration tests
        :return: (everything_is_fine, files)
        """
        assert self.status == 0
        _logger.info('Checking output files')
        everything_is_fine, files = self.assert_expected(self.output_files)
        if not everything_is_fine:
            for out in self.outputs:
                _logger.error(f"{out}")
            if raise_error:
                raise FileNotFoundError("Missing outputs after task completion")
        return everything_is_fine, files

    def assert_expected_inputs(self, raise_error=True):
        """
        Before running a task, check that all the files necessary to run the task have
        been downloaded/are on the local file system already
        :return: (everything_is_fine, files)
        """
        _logger.info('Checking input files')
        everything_is_fine, files = self.assert_expected(self.input_files)
        if not everything_is_fine and raise_error:
            raise FileNotFoundError("Missing inputs to run task")
        return everything_is_fine, files

    def assert_expected(self, expected_files, silent=False):
        # expected_files entries are (filename, collection, required_flag) tuples;
        # a missing file only fails the check when its required_flag is truthy
        everything_is_fine = True
        files = []
        for expected_file in expected_files:
            actual_files = list(Path(self.session_path).rglob(
                str(Path(expected_file[1]).joinpath(expected_file[0]))))
            if len(actual_files) == 0 and expected_file[2]:
                everything_is_fine = False
                if not silent:
                    _logger.error(f"Signature file expected {expected_file} not found")
            else:
                if len(actual_files) != 0:
                    # only the first match per signature entry is returned
                    files.append(actual_files[0])
        return everything_is_fine, files

    def get_data_handler(self, location=None):
        """
        Gets the relevant data handler based on location argument
        :return: a data_handlers.DataHandler instance
        """
        location = location or self.location
        if location == 'local':
            return data_handlers.LocalDataHandler(self.session_path, self.signature,
                                                  one=self.one)
        # all remaining handlers need a ONE instance
        self.one = self.one or ONE()
        if location == 'server':
            dhandler = data_handlers.ServerDataHandler(self.session_path, self.signature,
                                                       one=self.one)
        elif location == 'serverglobus':
            dhandler = data_handlers.ServerGlobusDataHandler(self.session_path,
                                                             self.signature, one=self.one)
        elif location == 'remote':
            dhandler = data_handlers.RemoteHttpDataHandler(self.session_path,
                                                           self.signature, one=self.one)
        elif location == 'AWS':
            dhandler = data_handlers.RemoteAwsDataHandler(self, self.session_path,
                                                          self.signature, one=self.one)
        elif location == 'SDSC':
            dhandler = data_handlers.SDSCDataHandler(self, self.session_path,
                                                     self.signature, one=self.one)
        # NOTE(review): an unrecognised location falls through and raises UnboundLocalError
        # on the return below — consider an explicit ValueError
        return dhandler

    @staticmethod
    def make_lock_file(taskname="", time_out_secs=7200):
        """Creates a GPU lock file with a timeout of"""
        d = {'start': time.time(), 'name': taskname, 'time_out_secs': time_out_secs}
        with open(Task._lock_file_path(), 'w+') as fid:
            json.dump(d, fid)
        return d

    @staticmethod
    def _lock_file_path():
        """the lock file is in ~/.one/gpu.lock"""
        folder = Path.home().joinpath('.one')
        folder.mkdir(exist_ok=True)
        return folder.joinpath('gpu.lock')

    def _make_lock_file(self):
        """creates a lock file with the current time"""
        return Task.make_lock_file(self.name, self.time_out_secs)

    def is_locked(self):
        """Checks if there is a lock file for this given task"""
        lock_file = self._lock_file_path()
        if not lock_file.exists():
            return False
        with open(lock_file) as fid:
            d = json.load(fid)
        now = time.time()
        # a stale lock (older than its own time-out) is removed and ignored
        if (now - d['start']) > d['time_out_secs']:
            lock_file.unlink()
            return False
        else:
            return True

    def _creates_lock(self):
        # returns True when this task acquired the GPU lock, False when already held
        if self.is_locked():
            return False
        else:
            self._make_lock_file()
            return True