Пример #1
0
def test_model_sizes_for_all_embedder(pytestconfig):
    """Make sure we have the model sizes documented for each model

    The embedder names are scraped from the rst table in
    bio_embeddings/embed/__init__.py and compared against the keys of
    `name_to_embedder`: every embedder must be documented, and the only
    documented names that are not embedders must be the known non-embedder
    models listed at the bottom of this test.

    If this test is failing, run the following and enter the results in bio_embeddings/embed/__init__.py:

    ```
    python -m bio_embeddings.utilities.model_size_main cpu
    python -m bio_embeddings.utilities.model_size_main gpu
    ```
    """
    # The parsed defaults are not inspected by this test; the call is kept so
    # the test still fails if defaults.yml cannot be read.
    read_config_file(
        pytestconfig.rootpath.joinpath(
            "bio_embeddings/utilities/defaults.yml"))
    doc_text: str = pytestconfig.rootpath.joinpath(
        "bio_embeddings/embed/__init__.py").read_text()
    # Quick and stupid rst parsing: take the text between the second and third
    # run of 46 "=" characters, skip the first line of that chunk, and use the
    # first space-separated token of every remaining line as the model name.
    table_rows = doc_text.split("=" * 46)[2].splitlines()[1:]
    documented_embedder = {row.split(" ")[0] for row in table_rows}

    undocumented = name_to_embedder.keys() - documented_embedder
    assert undocumented == set(), (
        f"Embedders without a documented model size: {sorted(undocumented)}")

    # Handle the non-embedder models
    non_embedder_models = documented_embedder - name_to_embedder.keys()
    assert non_embedder_models == {
        "bert_from_publication",
        "deepblast",
        "pb_tucker",
        "seqvec_from_publication",
    }, f"Unexpected set of documented non-embedder models: {sorted(non_embedder_models)}"
def main():
    """Command-line entry point that runs `bisect_embedder_memory`.

    Command-line arguments
    ----------------------
    embedder: Lowercase name of the embedder, or ``all`` to iterate over every
        key of `name_to_embedder`
    --model-directory: optional. For a single embedder it is passed through
        unchanged; for ``all`` each embedder gets the sub-directory named
        after it.
    """
    parser = ArgumentParser()
    parser.add_argument("embedder", help="Lowercase name of the embedder")
    parser.add_argument("--model-directory")
    args = parser.parse_args()
    if args.embedder == "all":
        # argparse leaves `model_directory` as None when the flag is omitted and
        # Path(None) raises TypeError. Forward None instead, exactly like the
        # single-embedder branch below does.
        base_directory = (None if args.model_directory is None else Path(
            args.model_directory))
        for embedder_name in name_to_embedder:
            if base_directory is None:
                model_directory = None
            else:
                model_directory = str(base_directory.joinpath(embedder_name))
            bisect_embedder_memory(embedder_name, model_directory)
    else:
        bisect_embedder_memory(args.embedder, args.model_directory)
Пример #3
0
def main():
    """Command-line entry point that runs `bisect_embedder_memory`.

    Command-line arguments
    ----------------------
    embedder: Lowercase name of the embedder, or ``all`` to iterate over every
        key of `name_to_embedder`
    --model-directory: optional. For a single embedder it is passed through
        unchanged; for ``all`` each embedder gets the sub-directory named
        after it.
    --half-precision-model: flag (default off) forwarded as the
        `half_precision_model` keyword argument
    """
    parser = ArgumentParser()
    parser.add_argument("embedder", help="Lowercase name of the embedder")
    parser.add_argument("--model-directory")
    parser.add_argument("--half-precision-model",
                        action="store_true",
                        default=False)
    args = parser.parse_args()
    # Echo the parsed arguments so they show up next to the results
    print(args)
    if args.embedder == "all":
        # argparse leaves `model_directory` as None when the flag is omitted and
        # Path(None) raises TypeError. Forward None instead, exactly like the
        # single-embedder branch below does.
        base_directory = (None if args.model_directory is None else Path(
            args.model_directory))
        for embedder_name in name_to_embedder:
            if base_directory is None:
                model_directory = None
            else:
                model_directory = str(base_directory.joinpath(embedder_name))
            bisect_embedder_memory(
                embedder_name,
                model_directory,
                half_precision_model=args.half_precision_model,
            )
    else:
        bisect_embedder_memory(
            args.embedder,
            args.model_directory,
            half_precision_model=args.half_precision_model,
        )
Пример #4
0
def run(**kwargs):
    """
    Entry point of the embed stage: validate the options, build the embedder
    and hand over to `embed_and_write_batched`

    Parameters
    ----------
    kwargs arguments (* denotes optional):
        protocol: Which embedder to use; must be a key of `name_to_embedder`
        prefix: Output prefix for all generated files
        stage_name: The stage name
        remapped_sequences_file: required by this stage (presumably the
            remapped sequences to embed -- confirm against the pipeline)
        mapping_file: the mapping file generated by the pipeline when remapping indexes
        use_cpu*: must not be set when the protocol is `unirep`
        max_amino_acids*: defaults to `DEFAULT_MAX_AMINO_ACIDS[protocol]`
        half_precision*: forwarded to `embed_and_write_batched`, defaults to False
        <necessary file/directory>*: any file or directory the embedder class
            lists as necessary; fetched automatically when missing or empty

    Returns
    -------
    The return value of `embed_and_write_batched` (the results of the stage)

    Raises
    ------
    InvalidParameterError
        If the protocol is unknown, if its extra is not installed, or if
        `use_cpu` is configured for `unirep`
    """
    required = ["protocol", "prefix", "stage_name", "remapped_sequences_file", "mapping_file"]
    check_required(kwargs, required)

    protocol = kwargs["protocol"]
    protocol_available = protocol in name_to_embedder

    # A protocol listed in ALL_PROTOCOLS but absent from name_to_embedder gets
    # the "missing extra" message; anything else is reported as invalid
    if not protocol_available and protocol in ALL_PROTOCOLS:
        raise InvalidParameterError(
            f"The extra for the protocol {kwargs['protocol']} is missing. "
            "See https://docs.bioembeddings.com/#installation on how to install all extras"
        )
    if not protocol_available:
        valid_protocols = ", ".join(name_to_embedder.keys())
        raise InvalidParameterError(
            "Invalid protocol selection: {}. Valid protocols are: {}".format(
                protocol, valid_protocols
            )
        )

    embedder_class = name_to_embedder[protocol]

    use_cpu_configured = kwargs.get("use_cpu") is not None
    if protocol == "unirep" and use_cpu_configured:
        raise InvalidParameterError("UniRep does not support configuring `use_cpu`")

    # Work on a copy so that the caller's options are left untouched
    result_kwargs = deepcopy(kwargs)

    # Fill in every model file that was not configured (or configured empty)
    # noinspection PyProtectedMember
    for file in embedder_class._necessary_files:
        if result_kwargs.get(file):
            continue
        result_kwargs[file] = get_model_file(model=embedder_class.name, file=file)

    # Same for the model directories
    # noinspection PyProtectedMember
    for directory in embedder_class._necessary_directories:
        if result_kwargs.get(directory):
            continue
        result_kwargs[directory] = get_model_directories_from_zip(
            model=embedder_class.name, directory=directory
        )

    # The default is looked up unconditionally, even if the user set a value
    default_max_amino_acids = DEFAULT_MAX_AMINO_ACIDS[protocol]
    result_kwargs.setdefault("max_amino_acids", default_max_amino_acids)

    # The file manager is built from the original options, the embedder from
    # the completed ones
    file_manager = get_file_manager(**kwargs)
    embedder: EmbedderInterface = embedder_class(**result_kwargs)
    _check_transform_embeddings_function(embedder, result_kwargs)

    half_precision = kwargs.get("half_precision", False)
    return embed_and_write_batched(embedder, file_manager, result_kwargs, half_precision)
Пример #5
0
def prepare_kwargs(**kwargs):
    """
    Validate the options of the embed stage and select the embedder class

    Parameters
    ----------
    kwargs arguments (* denotes optional):
        protocol: Which embedder to use; must be a key of `name_to_embedder`
        prefix, stage_name, remapped_sequences_file, mapping_file: required
        half_precision_model*: only allowed for `prottrans_t5_bfd` and
            `prottrans_t5_uniref50`, and not together with `half_precision: false`
        use_cpu*: must not be set when the protocol is `unirep`
        max_amino_acids*: defaults to `DEFAULT_MAX_AMINO_ACIDS[protocol]`

    Options that are neither required, global, embed options (see
    parameter_blueprints.yml) nor necessary files/directories of the embedder
    are only reported with a warning, not rejected.

    Returns
    -------
    Tuple of the embedder class and a deep copy of kwargs with
    `max_amino_acids` filled in

    Raises
    ------
    InvalidParameterError
        If the protocol is unknown or its extra is not installed, if `use_cpu`
        is configured for `unirep`, or if the half-precision options are
        inconsistent
    """
    required_kwargs = [
        "protocol",
        "prefix",
        "stage_name",
        "remapped_sequences_file",
        "mapping_file",
    ]
    check_required(kwargs, required_kwargs)

    if kwargs["protocol"] not in name_to_embedder:
        # Known protocol whose embedder could not be registered: the optional
        # dependency ("extra") is not installed
        if kwargs["protocol"] in ALL_PROTOCOLS:
            raise InvalidParameterError(
                f"The extra for the protocol {kwargs['protocol']} is missing. "
                "See https://docs.bioembeddings.com/#installation on how to install all extras"
            )
        raise InvalidParameterError(
            "Invalid protocol selection: {}. Valid protocols are: {}".format(
                kwargs["protocol"], ", ".join(name_to_embedder.keys())))

    embedder_class = name_to_embedder[kwargs["protocol"]]

    if kwargs["protocol"] == "unirep" and kwargs.get("use_cpu") is not None:
        raise InvalidParameterError(
            "UniRep does not support configuring `use_cpu`")
    # See parameter_blueprints.yml
    global_options = {"sequences_file", "simple_remapping", "start_time"}
    embed_options = {
        "decoder",
        "device",
        "discard_per_amino_acid_embeddings",
        "half_precision_model",
        "half_precision",
        "max_amino_acids",
        "reduce",
        "type",
    }
    known_parameters = (set(required_kwargs)
                        | global_options
                        | embed_options
                        | set(embedder_class.necessary_files)
                        | set(embedder_class.necessary_directories))
    # Compare the protocol name: `embedder_class` is a class and never equals
    # the string "seqvec", so the old check could not succeed
    if kwargs["protocol"] == "seqvec":
        # We support two ways of configuration for seqvec
        known_parameters.add("model_directory")
    # Unknown options are only warned about; sorted so that the log output is
    # deterministic
    for option in sorted(set(kwargs) - known_parameters):
        logger.warning(
            f"You set an unknown option for {embedder_class.name}: {option} (value: {kwargs[option]})"
        )

    if kwargs.get("half_precision_model"):
        if kwargs["protocol"] not in [
                "prottrans_t5_bfd", "prottrans_t5_uniref50"
        ]:
            raise InvalidParameterError(
                "`half_precision_model` is only supported with prottrans_t5_bfd and prottrans_t5_uniref50"
            )

        if kwargs.get("half_precision") is False:  # None remains allowed
            raise InvalidParameterError(
                "You can't have `half_precision_model` be true and `half_precision` be false. "
                "We suggest also setting `half_precision` to true, "
                "which will compute and save embeddings as half-precision floats"
            )

    # Copy so that the caller's options are left untouched
    result_kwargs = deepcopy(kwargs)
    result_kwargs.setdefault("max_amino_acids",
                             DEFAULT_MAX_AMINO_ACIDS[kwargs["protocol"]])

    return embedder_class, result_kwargs