def test_model_sizes_for_all_embedder(pytestconfig):
    """Make sure we have the model sizes documented for each model

    If this test is failing, run the following and enter the results in
    bio_embeddings/embed/__init__.py:

    ```
    python -m bio_embeddings.utilities.model_size_main cpu
    python -m bio_embeddings.utilities.model_size_main gpu
    ```
    """
    # Reading the defaults validates that the file exists and parses; the
    # parsed mapping itself is not needed below. (The original additionally
    # built `set(models.keys())` and discarded the result — dead code, removed.)
    read_config_file(
        pytestconfig.rootpath.joinpath("bio_embeddings/utilities/defaults.yml")
    )
    doc_text: str = pytestconfig.rootpath.joinpath(
        "bio_embeddings/embed/__init__.py"
    ).read_text()
    # Quick and stupid rst parsing: the text after the second "="*46 ruler is
    # the table whose first whitespace-separated column is the embedder name.
    documented_embedder = set()
    for line in doc_text.split("=" * 46)[2].splitlines()[1:]:
        documented_embedder.add(line.split(" ")[0])
    # Every registered embedder must appear in the docs ...
    assert name_to_embedder.keys() - set(documented_embedder) == set()
    # ... and the only documented names without an embedder class are the
    # known non-embedder models.
    assert set(documented_embedder) - name_to_embedder.keys() == {
        "bert_from_publication",
        "deepblast",
        "pb_tucker",
        "seqvec_from_publication",
    }
def parse_config_file_and_execute_run(config_file_path: str, **kwargs):
    """Validate the given config file, parse it, and run the pipeline it describes.

    Extra keyword arguments are forwarded verbatim to
    ``execute_pipeline_from_config``.
    """
    _validate_file(config_file_path)

    # Parse the configuration and hand it straight to the pipeline runner.
    execute_pipeline_from_config(read_config_file(config_file_path), **kwargs)
def test_wrong_model_param(pytestconfig, tmp_path: Path, caplog):
    """In this config, the protocol esm1b is chosen, but instead of a model_file
    a model_directory for T5 is given"""
    test_data = pytestconfig.rootpath.joinpath("test-data")
    pipeline_config = read_config_file(
        str(test_data.joinpath("embed_config_mixup.yml"))
    )
    # Resolve the sequences file relative to test-data and redirect the output
    # prefix into the per-test temporary directory.
    pipeline_config["global"]["sequences_file"] = str(
        test_data.joinpath(pipeline_config["global"]["sequences_file"])
    )
    pipeline_config["global"]["prefix"] = str(
        tmp_path.joinpath(pipeline_config["global"]["prefix"])
    )

    patched_registry = mock.patch(
        "bio_embeddings.embed.pipeline.name_to_embedder", {"esm1b": MockESM1bEmbedder}
    )
    patched_model_file = mock.patch(
        "bio_embeddings.embed.embedder_interfaces.get_model_file",
        return_value="/dev/null",
    )
    with patched_registry, patched_model_file:
        execute_pipeline_from_config(pipeline_config)

    # The pipeline should warn about the option esm1b doesn't know, not crash.
    assert caplog.messages == [
        "You set an unknown option for esm1b: model_directory (value: /mnt/project/bio_embeddings/models/lms/t5)"
    ]
def parse_config_file_and_execute_run(config_file_path: str, **kwargs):
    """Validate, read, and execute a pipeline configuration file.

    :param config_file_path: path to the pipeline YAML configuration
    :param kwargs: forwarded verbatim to ``execute_pipeline_from_config``
    :raises ValueError: if the path does not point to a usable config file
    """
    if not _valid_file(config_file_path):
        # ValueError instead of the original bare Exception: any caller
        # catching the former broad `Exception` still catches this, but the
        # failure is now a specific, conventional type.
        raise ValueError("No config or invalid config was passed.")

    # read configuration and execute
    config = read_config_file(config_file_path)
    execute_pipeline_from_config(config, **kwargs)
def read_and_patch_config(
    pytestconfig, tmp_path: Path, yml_file: str
) -> Dict[str, Any]:
    """Load a test pipeline config and rewrite its paths for this test run.

    The sequences_file is resolved relative to the repository's test-data
    directory, and the output prefix is redirected into pytest's tmp_path.
    """
    pipeline_config = read_config_file(str(pytestconfig.rootpath.joinpath(yml_file)))
    global_section = pipeline_config["global"]
    test_data_dir = pytestconfig.rootpath.joinpath("test-data")

    global_section["sequences_file"] = str(
        test_data_dir.joinpath(global_section["sequences_file"])
    )
    global_section["prefix"] = str(tmp_path.joinpath(global_section["prefix"]))
    return pipeline_config
import logging
import os
import shutil
from pathlib import Path
from typing import Dict, Optional
from urllib import request

from appdirs import user_cache_dir
from atomicwrites import atomic_write
from tqdm import tqdm

from bio_embeddings.utilities.config import read_config_file

# Directory containing this module; used to locate the bundled defaults.yml.
_module_dir: Path = Path(os.path.dirname(os.path.abspath(__file__)))
_defaults: Dict[str, Dict[str, str]] = read_config_file(_module_dir / "defaults.yml")

logger = logging.getLogger(__name__)


class TqdmUpTo(tqdm):
    """Provides `update_to(n)` which uses `tqdm.update(delta_n)`."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """
        b : int, optional
            Number of blocks transferred so far [default: 1].
        bsize : int, optional
            Size of each block (in tqdm units) [default: 1].
        tsize : int, optional
            Total size (in tqdm units). If [default: None] remains unchanged.
        """
        # Standard tqdm reporthook recipe for urllib.request.urlretrieve:
        # convert the cumulative (blocks, block size) into a delta update.
        # NOTE(review): the original method body contained only the docstring,
        # making the progress bar a silent no-op; this restores the canonical
        # recipe from the tqdm documentation.
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)  # also sets self.n = b * bsize
# Args parser = argparse.ArgumentParser( description='Embeds random subset of a sequence file.') parser.add_argument( 'config_path', metavar='/path/to/pipeline_definition.yml', type=str, nargs=1, help= 'The path to the config. For examples, see folder "parameter examples".') arguments = parser.parse_args() # Options config_path = arguments.config_path[0] config = read_config_file(config_path) sequence_path = config['global']['sequences_file'] max_number_of_sequences = config['global'].get('max_number_of_sequences', 250) max_len = config['global'].get('max_len', 100) min_len = config['global'].get('min_len', 50) # filtered_sequences = list() total_aa = 0 # for seq_record in SeqIO.parse(sequence_path, "fasta"): if max_len > len(seq_record) > min_len: filtered_sequences.append(seq_record) random_sample = random.sample(filtered_sequences, max_number_of_sequences)