Exemplo n.º 1
0
def test_model_sizes_for_all_embedder(pytestconfig):
    """Make sure we have the model sizes documented for each model

    The names in ``name_to_embedder`` are compared against the first column
    of the model table found in the text of bio_embeddings/embed/__init__.py.

    If this test is failing, run the following and enter the results in bio_embeddings/embed/__init__.py:

    ```
    python -m bio_embeddings.utilities.model_size_main cpu
    python -m bio_embeddings.utilities.model_size_main gpu
    ```
    """
    # The parsed defaults are not compared against anything here; the file is
    # still read so that an unreadable defaults.yml makes this test fail.
    read_config_file(
        pytestconfig.rootpath.joinpath(
            "bio_embeddings/utilities/defaults.yml"))
    doc_text: str = pytestconfig.rootpath.joinpath(
        "bio_embeddings/embed/__init__.py").read_text()
    # Quick and stupid rst parsing: take the third chunk produced by splitting
    # on the 46-character "=" rule, drop that chunk's first line, and keep the
    # first space-separated token of every remaining line.
    table_rows = doc_text.split("=" * 46)[2].splitlines()[1:]
    documented_embedder = {row.split(" ")[0] for row in table_rows}
    # Every registered embedder must appear in the table
    assert name_to_embedder.keys() - documented_embedder == set()
    # Handle the non-embedder models
    assert documented_embedder - name_to_embedder.keys() == {
        "bert_from_publication",
        "deepblast",
        "pb_tucker",
        "seqvec_from_publication",
    }
Exemplo n.º 2
0
def parse_config_file_and_execute_run(config_file_path: str, **kwargs):
    """Check, load and run the pipeline configuration at ``config_file_path``.

    :param config_file_path: path passed to ``_validate_file`` and then to
        ``read_config_file``
    :param kwargs: forwarded unchanged to ``execute_pipeline_from_config``
    """
    # The check runs before anything is read; its return value is not used.
    _validate_file(config_file_path)

    # Load the configuration and hand it straight to the pipeline runner
    execute_pipeline_from_config(read_config_file(config_file_path), **kwargs)
Exemplo n.º 3
0
def test_wrong_model_param(pytestconfig, tmp_path: Path, caplog):
    """In this config, the protocol esm1b is chosen, but instead of a model_file a model_directory for T5 is given

    The pipeline is run with a mocked embedder registry and a mocked
    ``get_model_file``; the captured log messages must consist of exactly one
    message about the unknown ``model_directory`` option.
    """
    root = pytestconfig.rootpath
    pipeline_config = read_config_file(
        str(root.joinpath("test-data/embed_config_mixup.yml"))
    )

    # Rewrite the input path relative to test-data and the prefix relative to
    # pytest's temporary directory.
    global_section = pipeline_config["global"]
    sequences_path = root.joinpath("test-data").joinpath(
        global_section["sequences_file"]
    )
    global_section["sequences_file"] = str(sequences_path)
    global_section["prefix"] = str(tmp_path.joinpath(global_section["prefix"]))

    embedder_patch = mock.patch(
        "bio_embeddings.embed.pipeline.name_to_embedder", {"esm1b": MockESM1bEmbedder}
    )
    model_file_patch = mock.patch(
        "bio_embeddings.embed.embedder_interfaces.get_model_file",
        return_value="/dev/null",
    )
    with embedder_patch, model_file_patch:
        execute_pipeline_from_config(pipeline_config)

    expected_messages = [
        "You set an unknown option for esm1b: model_directory (value: /mnt/project/bio_embeddings/models/lms/t5)"
    ]
    assert caplog.messages == expected_messages
Exemplo n.º 4
0
def parse_config_file_and_execute_run(config_file_path: str, **kwargs):
    """Load the pipeline configuration at ``config_file_path`` and run it.

    :param config_file_path: path checked with ``_valid_file`` and then read
        with ``read_config_file``
    :param kwargs: forwarded unchanged to ``execute_pipeline_from_config``
    :raises Exception: if ``_valid_file`` returns a falsy value for the path
    """
    path_is_usable = _valid_file(config_file_path)
    if not path_is_usable:
        raise Exception("No config or invalid config was passed.")

    # Load the configuration and hand it straight to the pipeline runner
    execute_pipeline_from_config(read_config_file(config_file_path), **kwargs)
Exemplo n.º 5
0
def read_and_patch_config(
    pytestconfig, tmp_path: Path, yml_file: str
) -> Dict[str, Any]:
    """Read a pipeline config and rewrite the paths in its ``global`` section.

    :param pytestconfig: pytest config object; its ``rootpath`` is the base
        for ``yml_file`` and for the ``test-data`` directory
    :param tmp_path: directory that the configured ``prefix`` is joined onto
    :param yml_file: config file path, joined onto ``pytestconfig.rootpath``
    :return: the config with ``global.sequences_file`` and ``global.prefix``
        replaced by the joined paths, as strings
    """
    root = pytestconfig.rootpath
    pipeline_config = read_config_file(str(root.joinpath(yml_file)))

    global_section = pipeline_config["global"]
    sequences_path = root.joinpath("test-data").joinpath(
        global_section["sequences_file"]
    )
    global_section["sequences_file"] = str(sequences_path)
    global_section["prefix"] = str(tmp_path.joinpath(global_section["prefix"]))
    return pipeline_config
Exemplo n.º 6
0
import shutil

from pathlib import Path
from typing import Dict, Optional
from urllib import request

from appdirs import user_cache_dir
from atomicwrites import atomic_write
from tqdm import tqdm

from bio_embeddings.utilities.config import read_config_file

# Absolute directory containing this module; defaults.yml is looked up in it.
_module_dir: Path = Path(os.path.abspath(__file__)).parent
# Parsed content of defaults.yml, loaded once at import time.
_defaults: Dict[str, Dict[str, str]] = read_config_file(
    _module_dir.joinpath("defaults.yml")
)

logger = logging.getLogger(__name__)


class TqdmUpTo(tqdm):
    """Provides `update_to(n)` which uses `tqdm.update(delta_n)`."""
    def update_to(self, b=1, bsize=1, tsize=None):
        """
        b  : int, optional
            Number of blocks transferred so far [default: 1].
        bsize  : int, optional
            Size of each block (in tqdm units) [default: 1].
        tsize  : int, optional
            Total size (in tqdm units). If [default: None] remains unchanged.
        """
Exemplo n.º 7
0
# Command-line arguments: a single positional path to the pipeline config
parser = argparse.ArgumentParser(
    description='Embeds random subset of a sequence file.')

parser.add_argument(
    'config_path',
    metavar='/path/to/pipeline_definition.yml',
    type=str,
    nargs=1,
    help=
    'The path to the config. For examples, see folder "parameter examples".')
arguments = parser.parse_args()
# Options read from the "global" section of the config
config_path = arguments.config_path[0]  # nargs=1 yields a one-element list
config = read_config_file(config_path)
sequence_path = config['global']['sequences_file']
max_number_of_sequences = config['global'].get('max_number_of_sequences', 250)
max_len = config['global'].get('max_len', 100)
min_len = config['global'].get('min_len', 50)
# Accumulators
filtered_sequences = list()
total_aa = 0

# Keep only records whose length lies strictly between min_len and max_len
for seq_record in SeqIO.parse(sequence_path, "fasta"):
    if max_len > len(seq_record) > min_len:
        filtered_sequences.append(seq_record)

# random.sample raises ValueError when asked for more items than the
# population holds, so cap the request at the number of records that passed
# the length filter.
sample_size = min(max_number_of_sequences, len(filtered_sequences))
random_sample = random.sample(filtered_sequences, sample_size)