Example #1
File: main.py  Project: akeskiner/seml
def reset_states(config_file, sacred_id, filter_states, batch_id, filter_dict):
    collection = db_utils.get_collection_from_config(config_file)

    if sacred_id is None:
        if len({'PENDING', 'RUNNING', 'KILLED'} & set(filter_states)) > 0:
            detect_killed(config_file, verbose=False)

        filter_dict = db_utils.build_filter_dict(filter_states, batch_id, filter_dict)

        nreset = collection.count_documents(filter_dict)
        exps = collection.find(filter_dict)

        if nreset >= 10:
            if input(f"Resetting the state of {nreset} experiment{s_if(nreset)}. "
                     f"Are you sure? (y/n) ").lower() != "y":
                exit()
        else:
            print(f"Resetting the state of {nreset} experiment{s_if(nreset)}.")
        for exp in exps:
            reset_experiment(collection, exp)
    else:
        exp = collection.find_one({'_id': sacred_id})
        if exp is None:
            raise LookupError(f"No experiment found with ID {sacred_id}.")
        else:
            print(f"Resetting the state of experiment with ID {sacred_id}.")
            reset_experiment(collection, exp)
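All of these snippets use a small pluralization helper s_if that is defined elsewhere in the project and not shown here. A minimal sketch of what it presumably does (an assumption, not the project's exact code):

def s_if(n):
    # Assumed helper: return "s" for plural counts and "" for exactly one,
    # so messages read "1 experiment" vs. "3 experiments".
    return "" if n == 1 else "s"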
Example #2
def delete_experiments(config_file, sacred_id, filter_states, batch_id,
                       filter_dict):
    collection = db_utils.get_collection_from_config(config_file)
    if sacred_id is None:
        if len({'PENDING', 'RUNNING', 'KILLED'} & set(filter_states)) > 0:
            detect_killed(config_file, verbose=False)

        filter_dict = db_utils.build_filter_dict(filter_states, batch_id,
                                                 filter_dict)
        ndelete = collection.count_documents(filter_dict)

        if ndelete >= 10:
            if input(f"Deleting {ndelete} configuration{s_if(ndelete)} from database collection. "
                     f"Are you sure? (y/n) ").lower() != "y":
                exit()
        else:
            print(f"Deleting {ndelete} configuration{s_if(ndelete)} from database collection.")
        collection.delete_many(filter_dict)
    else:
        if collection.find_one({'_id': sacred_id}) is None:
            raise LookupError(f"No experiment found with ID {sacred_id}.")
        else:
            print(f"Deleting experiment with ID {sacred_id}.")
            collection.delete_one({'_id': sacred_id})
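Examples #1 and #2 (and #7 below) build their MongoDB query through db_utils.build_filter_dict, which merges the status list, the batch ID and the user-supplied filter into one query document. The actual implementation lives in seml's db_utils and may differ; a hedged sketch of the kind of filter it is expected to produce:

def build_filter_dict(filter_states, batch_id, filter_dict):
    # Illustrative sketch only: combine the individual filters into a single MongoDB query.
    filter_dict = dict(filter_dict) if filter_dict else {}
    if filter_states:
        filter_dict['status'] = {'$in': filter_states}
    if batch_id is not None:
        filter_dict['batch_id'] = batch_id
    return filter_dict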
Example #3
def detect_killed(config_file, verbose=True):
    collection = db_utils.get_collection_from_config(config_file)
    exps = collection.find({'status': {'$in': ['PENDING', 'RUNNING']}})
    running_jobs = get_slurm_jobs()
    nkilled = 0
    for exp in exps:
        if 'slurm' in exp and 'id' in exp['slurm'] and exp['slurm']['id'] not in running_jobs:
            nkilled += 1
            collection.update_one({'_id': exp['_id']},
                                  {'$set': {'status': 'KILLED'}})
            try:
                with open(exp['slurm']['output_file'], 'r') as f:
                    all_lines = f.readlines()
                collection.update_one({'_id': exp['_id']},
                                      {'$set': {'fail_trace': all_lines[-4:]}})
            except IOError:
                print(f"Warning: file {exp['slurm']['output_file']} could not be read.")
    if verbose:
        print(f"Detected {nkilled} externally killed experiment{s_if(nkilled)}.")
Example #4
def clean_unreferenced_artifacts(config_file, all_collections=False):
    """
    Delete orphaned artifacts from the database. That is, artifacts that were generated by experiments, but whose
    experiment's database entry has been removed. This leads to storage accumulation, and this function cleans this
    excess storage.
    Parameters
    ----------
    config_file: str
        config file containing the collection to be scanned.
    all_collections: bool
        If yes, will scan ALL collections (not just the one provided in the config file) for orphaned artifacts.

    Returns
    -------
    None
    """
    import gridfs
    if all_collections:
        config = db_utils.get_mongodb_config()
        db = db_utils.get_database(**config)
        collection_names = db.list_collection_names()
    else:
        collection = db_utils.get_collection_from_config(config_file)
        db = collection.database
        collection_names = [collection.name]

    fs = gridfs.GridFS(db)
    referenced_artifact_ids = set()
    for collection_name in collection_names:
        collection = db[collection_name]
        all_artifacts = list(collection.find({}, {'artifacts': 1}))
        all_artifacts = [x['artifacts'] for x in all_artifacts if 'artifacts' in x]
        all_artifacts_flat = [art for x in all_artifacts for art in x]
        artifact_ids = {x['file_id'] for x in all_artifacts_flat}
        referenced_artifact_ids = referenced_artifact_ids.union(artifact_ids)

    artifacts_in_db = list(db['fs.files'].find({}, {'_id': 1}))
    artifacts_in_db = {x['_id'] for x in artifacts_in_db}
    not_referenced_artifacts = artifacts_in_db - referenced_artifact_ids
    n_delete = len(not_referenced_artifacts)
    if input(f"Deleting {n_delete} not referenced artifact{s_if(n_delete)} from database {db.name}. "
             f"Are you sure? (y/n) ").lower() != "y":
        exit()
    print('Deleting not referenced artifacts...')
    for to_delete in tqdm(not_referenced_artifacts):
        fs.delete(to_delete)
    print(f'Successfully deleted {n_delete} not referenced artifact{s_if(n_delete)}.')
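A quick way to see whether a cleanup is needed is to count the GridFS files that no experiment references, mirroring the logic above for a single collection. This is an illustrative sketch; 'configs/my_experiment.yaml' is a hypothetical path:

# Illustration only: count orphaned GridFS files for one collection.
collection = db_utils.get_collection_from_config('configs/my_experiment.yaml')
db = collection.database
referenced = {art['file_id']
              for doc in collection.find({}, {'artifacts': 1}) if 'artifacts' in doc
              for art in doc['artifacts']}
stored = {f['_id'] for f in db['fs.files'].find({}, {'_id': 1})}
print(f"{len(stored - referenced)} orphaned artifact{s_if(len(stored - referenced))} found.")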
Example #5
File: main.py  Project: akeskiner/seml
def report_status(config_file):
    detect_killed(config_file, verbose=False)
    collection = db_utils.get_collection_from_config(config_file)
    queued = collection.count_documents({'status': 'QUEUED'})
    pending = collection.count_documents({'status': 'PENDING'})
    failed = collection.count_documents({'status': 'FAILED'})
    killed = collection.count_documents({'status': 'KILLED'})
    interrupted = collection.count_documents({'status': 'INTERRUPTED'})
    running = collection.count_documents({'status': 'RUNNING'})
    completed = collection.count_documents({'status': 'COMPLETED'})
    title = "********** Experiment database collection report **********"
    print(title)
    print(f"*     - {queued:3d} queued experiment{s_if(queued)}")
    print(f"*     - {pending:3d} pending experiment{s_if(pending)}")
    print(f"*     - {running:3d} running experiment{s_if(running)}")
    print(f"*     - {completed:3d} completed experiment{s_if(completed)}")
    print(f"*     - {interrupted:3d} interrupted experiment{s_if(interrupted)}")
    print(f"*     - {failed:3d} failed experiment{s_if(failed)}")
    print(f"*     - {killed:3d} killed experiment{s_if(killed)}")
    print("*" * len(title))
Example #6
def detect_killed(config_file, verbose=True):
    collection = db_utils.get_collection_from_config(config_file)
    exps = collection.find({'status': {'$in': ['PENDING', 'RUNNING']}})
    running_jobs = get_slurm_jobs()
    nkilled = 0
    for exp in exps:
        if 'slurm' in exp and 'id' in exp['slurm'] and exp['slurm']['id'] not in running_jobs:
            nkilled += 1
            collection.update_one({'_id': exp['_id']},
                                  {'$set': {'status': 'KILLED'}})
            try:
                slurm_config = exp['slurm']
                seml_config = exp['seml']
                if 'output_file' in seml_config:
                    output_file = seml_config['output_file']
                elif 'output_file' in slurm_config:  # backward compatibility, we used to store the path in 'slurm'
                    output_file = slurm_config['output_file']
                else:
                    continue
                with open(output_file, 'r') as f:
                    all_lines = f.readlines()
                collection.update_one({'_id': exp['_id']},
                                      {'$set': {'fail_trace': all_lines[-4:]}})
            except IOError:
                # output_file was already resolved above, before open() could raise IOError
                print(f"Warning: file {output_file} could not be read.")
    if verbose:
        print(f"Detected {nkilled} externally killed experiment{s_if(nkilled)}.")
Example #7
File: main.py  Project: akeskiner/seml
def cancel_experiments(config_file, sacred_id, filter_states, batch_id, filter_dict):
    """
    Cancel experiments.

    Parameters
    ----------
    config_file: str
        Path to the configuration YAML file.
    sacred_id: int or None
        ID of the experiment to cancel. If None, the other arguments will be used to cancel possibly multiple experiments.
    filter_states: list of strings or None
        List of statuses to filter for. Will cancel all jobs from the database collection
        with one of the given statuses.
    batch_id: int or None
        The ID of the batch of experiments to cancel. All experiments that are queued together (i.e. within the same
        command line call) have the same batch ID.
    filter_dict: dict or None
        Arbitrary filter dictionary to use for cancelling experiments. Any experiments whose database entries match all
        keys/values of the dictionary will be cancelled.

    Returns
    -------
    None

    """
    collection = db_utils.get_collection_from_config(config_file)
    if sacred_id is None:
        # No ID is provided: check which Slurm jobs will have no RUNNING experiment left after this action;
        # those Slurm jobs can be cancelled altogether. However, it is currently NOT possible to cancel a
        # single experiment inside a Slurm job that runs multiple experiments.
        try:
            if filter_states is not None and len({'PENDING', 'RUNNING', 'KILLED'} & set(filter_states)) > 0:
                detect_killed(config_file, verbose=False)

            filter_dict = db_utils.build_filter_dict(filter_states, batch_id, filter_dict)

            ncancel = collection.count_documents(filter_dict)
            if ncancel >= 10:
                if input(f"Cancelling {ncancel} experiment{s_if(ncancel)}. "
                         f"Are you sure? (y/n) ").lower() != "y":
                    exit()
            else:
                print(f"Cancelling {ncancel} experiment{s_if(ncancel)}.")

            exps = list(collection.find(filter_dict))
            # set of slurm IDs in the database
            slurm_ids = set([e['slurm']['id'] for e in exps if "slurm" in e and 'id' in e['slurm']])
            # set of experiment IDs to be cancelled.
            exp_ids = set([e['_id'] for e in exps])
            to_cancel = set()

            # iterate over slurm IDs to check which slurm jobs can be cancelled altogether
            for s_id in slurm_ids:
                # find experiments RUNNING under the slurm job
                jobs_running = list(collection.find({'slurm.id': s_id,
                                                     'status': {"$in": ["RUNNING"]}},
                                                    {"_id": 1}))
                running_exp_ids = set(e['_id'] for e in jobs_running)
                if len(running_exp_ids.difference(exp_ids)) == 0:
                    # every experiment running under this Slurm job is also being cancelled,
                    # so the whole Slurm job can be cancelled.
                    to_cancel.add(str(s_id))

            # cancel all Slurm jobs for which no running experiment remains.
            if len(to_cancel) > 0:
                subprocess.check_output(f"scancel {' '.join(list(to_cancel))}", shell=True)

            # update database status and write the stop_time
            collection.update_many(filter_dict, {'$set': {"status": "INTERRUPTED",
                                                          "stop_time": datetime.datetime.utcnow()}})
        except subprocess.CalledProcessError:
            warnings.warn("One or multiple Slurm jobs were no longer running when trying to cancel them.")
    else:
        print(f"Cancelling experiment with ID {sacred_id}.")
        cancel_experiment_by_id(collection, sacred_id)
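A hedged usage sketch (the config path and batch ID are hypothetical): cancel every PENDING or RUNNING experiment of one batch. Slurm jobs are only scancel'led once none of their running experiments fall outside the filter.

# Hypothetical arguments for illustration only.
cancel_experiments(config_file='configs/my_experiment.yaml',
                   sacred_id=None,
                   filter_states=['PENDING', 'RUNNING'],
                   batch_id=2,
                   filter_dict=None)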