def reset_states(config_file, sacred_id, filter_states, batch_id, filter_dict):
    """
    Reset the state of one or several experiments.

    Parameters
    ----------
    config_file: str
        Path to the configuration file; used to look up the database collection.
    sacred_id: int or None
        ID of the single experiment to reset. If None, the remaining arguments are
        used to select (possibly multiple) experiments.
    filter_states: list of strings or None
        States to filter for. None is treated like an empty list when deciding
        whether killed jobs have to be detected first.
    batch_id: int or None
        Batch ID forwarded to `db_utils.build_filter_dict`.
    filter_dict: dict or None
        Additional filter forwarded to `db_utils.build_filter_dict`.

    Returns
    -------
    None

    Raises
    ------
    LookupError
        If `sacred_id` is given but no experiment with that ID exists.
    SystemExit
        If 10 or more experiments are affected and the user does not confirm with "y".

    """
    # Local import: `sys.exit` replaces the `exit()` helper injected by the `site`
    # module, which is intended for interactive use and may be absent (e.g. `python -S`).
    import sys

    collection = db_utils.get_collection_from_config(config_file)

    if sacred_id is not None:
        exp = collection.find_one({'_id': sacred_id})
        if exp is None:
            raise LookupError(f"No experiment found with ID {sacred_id}.")
        print(f"Resetting the state of experiment with ID {sacred_id}.")
        reset_experiment(collection, exp)
        return

    # `filter_states or ()` keeps a None value from raising a TypeError in set().
    if {'PENDING', 'RUNNING', 'KILLED'} & set(filter_states or ()):
        # Run killed-job detection before selecting by one of these states.
        detect_killed(config_file, verbose=False)

    filter_dict = db_utils.build_filter_dict(filter_states, batch_id, filter_dict)
    nreset = collection.count_documents(filter_dict)

    if nreset >= 10:
        # Large resets need an explicit confirmation.
        answer = input(f"Resetting the state of {nreset} experiment{s_if(nreset)}. "
                       f"Are you sure? (y/n) ")
        if answer.lower() != "y":
            sys.exit()
    else:
        print(f"Resetting the state of {nreset} experiment{s_if(nreset)}.")

    # The query is only issued once the user has confirmed.
    for exp in collection.find(filter_dict):
        reset_experiment(collection, exp)
def delete_experiments(config_file, sacred_id, filter_states, batch_id, filter_dict):
    """
    Delete one or several experiments from the database collection.

    Parameters
    ----------
    config_file: str
        Path to the configuration file; used to look up the database collection.
    sacred_id: int or None
        ID of the single experiment to delete. If None, the remaining arguments are
        used to select (possibly multiple) experiments.
    filter_states: list of strings or None
        States to filter for. None is treated like an empty list when deciding
        whether killed jobs have to be detected first.
    batch_id: int or None
        Batch ID forwarded to `db_utils.build_filter_dict`.
    filter_dict: dict or None
        Additional filter forwarded to `db_utils.build_filter_dict`.

    Returns
    -------
    None

    Raises
    ------
    LookupError
        If `sacred_id` is given but no experiment with that ID exists.
    SystemExit
        If 10 or more entries are affected and the user does not confirm with "y".

    """
    # Local import: `sys.exit` replaces the `exit()` helper injected by the `site`
    # module, which is intended for interactive use and may be absent (e.g. `python -S`).
    import sys

    collection = db_utils.get_collection_from_config(config_file)

    if sacred_id is not None:
        if collection.find_one({'_id': sacred_id}) is None:
            raise LookupError(f"No experiment found with ID {sacred_id}.")
        print(f"Deleting experiment with ID {sacred_id}.")
        collection.delete_one({'_id': sacred_id})
        return

    # `filter_states or ()` keeps a None value from raising a TypeError in set().
    if {'PENDING', 'RUNNING', 'KILLED'} & set(filter_states or ()):
        # Run killed-job detection before selecting by one of these states.
        detect_killed(config_file, verbose=False)

    filter_dict = db_utils.build_filter_dict(filter_states, batch_id, filter_dict)
    ndelete = collection.count_documents(filter_dict)

    if ndelete >= 10:
        # Large deletions need an explicit confirmation.
        answer = input(
            f"Deleting {ndelete} configuration{s_if(ndelete)} from database collection. "
            f"Are you sure? (y/n) ")
        if answer.lower() != "y":
            sys.exit()
    else:
        print(
            f"Deleting {ndelete} configuration{s_if(ndelete)} from database collection."
        )

    collection.delete_many(filter_dict)
def start_experiments(config_file, local, sacred_id, batch_id, filter_dict, test, verbose):
    """
    Start experiments from the database collection named in the configuration.

    Parameters
    ----------
    config_file: str
        Path to the configuration file; its first entry provides 'db_collection'.
    local: bool
        If True, Slurm is not used.
    sacred_id: int or None
        ID of a single experiment to start. If None, `batch_id` and `filter_dict`
        are used to build the selection filter.
    batch_id: int or None
        Batch ID forwarded to `db_utils.build_filter_dict`.
    filter_dict: dict or None
        Additional filter forwarded to `db_utils.build_filter_dict`.
    test: int
        Forwarded to `do_work` as `num_exps`. Any value other than -1 forces verbose output.
    verbose: bool
        Verbosity flag forwarded to `do_work`.

    Returns
    -------
    None

    """
    # NOTE(review): another `start_experiments` with a longer signature is defined
    # further down in this file; if both live in the same scope, the later one
    # shadows this definition -- confirm which one is intended.
    db_collection_name = db_utils.read_config(config_file)[0]['db_collection']
    run_on_slurm = not local

    # Any value of `test` other than -1 always runs verbosely.
    be_verbose = True if test != -1 else verbose

    if sacred_id is not None:
        selection = {'_id': sacred_id}
    else:
        selection = db_utils.build_filter_dict([], batch_id, filter_dict)

    do_work(db_collection_name, be_verbose, slurm=run_on_slurm, num_exps=test,
            filter_dict=selection)
def start_experiments(config_file, local, sacred_id, batch_id, filter_dict, test,
                      unobserved, post_mortem, debug, verbose, dry_run):
    """
    Start experiments (or only print their commands) from the configured collection.

    Parameters
    ----------
    config_file: str
        Path to the configuration file; its first entry provides 'db_collection'.
    local: bool
        If True, Slurm is not used.
    sacred_id: int or None
        ID of a single experiment to start. If None, `batch_id` and `filter_dict`
        are used to build the selection filter.
    batch_id: int or None
        Batch ID forwarded to `db_utils.build_filter_dict`.
    filter_dict: dict or None
        Additional filter forwarded to `db_utils.build_filter_dict`.
    test: int
        Forwarded as `num_exps`. Any value other than -1 forces verbose logging.
    unobserved: bool
        Forwarded unchanged to `print_commands` / `do_work` (overridden by `debug`).
    post_mortem: bool
        Forwarded unchanged to `print_commands` / `do_work` (overridden by `debug`).
    debug: bool
        If True, overrides the other options: test=1, no Slurm, unobserved and
        post_mortem enabled.
    verbose: bool
        Forwarded as `log_verbose`.
    dry_run: bool
        If True, `print_commands` is called instead of `do_work`.

    Returns
    -------
    None

    """
    db_collection_name = db_utils.read_config(config_file)[0]['db_collection']
    run_on_slurm = not local

    if debug:
        # Debug mode pins these options regardless of what the caller passed.
        test, run_on_slurm, unobserved, post_mortem = 1, False, True, True

    # Any value of `test` other than -1 always logs verbosely.
    log_verbose = True if test != -1 else verbose

    if sacred_id is not None:
        selection = {'_id': sacred_id}
    else:
        selection = db_utils.build_filter_dict([], batch_id, filter_dict)

    # Keyword arguments shared by both the dry-run and the real execution path.
    shared_options = {
        'log_verbose': log_verbose,
        'unobserved': unobserved,
        'post_mortem': post_mortem,
        'num_exps': test,
        'filter_dict': selection,
    }

    if dry_run:
        print_commands(db_collection_name, **shared_options)
        return

    do_work(db_collection_name, slurm=run_on_slurm, dry_run=dry_run, **shared_options)
def cancel_experiments(config_file, sacred_id, filter_states, batch_id, filter_dict):
    """
    Cancel experiments.

    Parameters
    ----------
    config_file: str
        Path to the configuration YAML file.
    sacred_id: int or None
        ID of the experiment to cancel. If None, will use the other arguments to cancel possibly multiple experiments.
    filter_states: list of strings or None
        List of statuses to filter for. Will cancel all jobs from the database collection
        with one of the given statuses. None is treated like an empty list when deciding
        whether killed jobs have to be detected first.
    batch_id: int or None
        The ID of the batch of experiments to cancel. All experiments that are queued together (i.e. within the same
        command line call) have the same batch ID.
    filter_dict: dict or None
        Arbitrary filter dictionary to use for cancelling experiments. Any experiments whose database entries match all
        keys/values of the dictionary will be cancelled.

    Returns
    -------
    None

    Raises
    ------
    SystemExit
        If 10 or more experiments are affected and the user does not confirm with "y".

    Notes
    -----
    If the `scancel` call fails, only a warning is emitted and the database entries
    are NOT updated, because the update happens after `scancel` inside the same `try`.

    """
    # Local import: `sys.exit` replaces the `exit()` helper injected by the `site`
    # module, which is intended for interactive use and may be absent (e.g. `python -S`).
    import sys

    collection = db_utils.get_collection_from_config(config_file)

    if sacred_id is not None:
        print(f"Cancelling experiment with ID {sacred_id}.")
        cancel_experiment_by_id(collection, sacred_id)
        return

    # no ID is provided: we check whether there are slurm jobs for which after this action no
    # RUNNING experiment remains. These slurm jobs can be killed altogether.
    # However, it is NOT possible right now to cancel a single experiment in a Slurm job with multiple
    # running experiments.
    try:
        # `filter_states or ()` keeps a None value (allowed per the docstring) from
        # raising a TypeError in set().
        if {'PENDING', 'RUNNING', 'KILLED'} & set(filter_states or ()):
            detect_killed(config_file, verbose=False)

        filter_dict = db_utils.build_filter_dict(filter_states, batch_id, filter_dict)

        ncancel = collection.count_documents(filter_dict)
        if ncancel >= 10:
            # Large cancellations need an explicit confirmation.
            answer = input(f"Cancelling {ncancel} experiment{s_if(ncancel)}. "
                           f"Are you sure? (y/n) ")
            if answer.lower() != "y":
                sys.exit()
        else:
            print(f"Cancelling {ncancel} experiment{s_if(ncancel)}.")

        exps = list(collection.find(filter_dict))
        # set of slurm IDs in the database
        slurm_ids = {e['slurm']['id'] for e in exps if "slurm" in e and 'id' in e['slurm']}
        # set of experiment IDs to be cancelled.
        exp_ids = {e['_id'] for e in exps}

        to_cancel = set()
        # iterate over slurm IDs to check which slurm jobs can be cancelled altogether
        for s_id in slurm_ids:
            # find experiments RUNNING under the slurm job
            jobs_running = collection.find({'slurm.id': s_id, 'status': {"$in": ["RUNNING"]}},
                                           {"_id": 1})
            running_exp_ids = {e['_id'] for e in jobs_running}
            if running_exp_ids <= exp_ids:
                # there are no running jobs in this slurm job that should not be canceled.
                to_cancel.add(str(s_id))

        # cancel all Slurm jobs for which no running experiment remains.
        if to_cancel:
            # The shell invocation is kept so that a failing or missing `scancel`
            # still surfaces as CalledProcessError and is handled below.
            subprocess.check_output(f"scancel {' '.join(to_cancel)}", shell=True)

        # update database status and write the stop_time
        collection.update_many(filter_dict, {'$set': {"status": "INTERRUPTED",
                                                      "stop_time": datetime.datetime.utcnow()}})
    except subprocess.CalledProcessError:
        warnings.warn("One or multiple Slurm jobs were no longer running when I tried to cancel them.")