示例#1
0
def download_resfam(db_dir: str) -> None:
    """ Download and sanitise the Resfam database.

        If an existing database file matches RESFAM_SIZE and RESFAM_LINES,
        nothing is downloaded. Otherwise the archive is fetched and unpacked,
        run through hmmconvert, rewritten with GA/TC/NC cutoff lines added
        after the CKSUM line of the models that lack them, and then pressed.

        Arguments:
            db_dir: the base database directory, the database itself is kept
                    in the "resfam" subdirectory

        Returns:
            None
    """
    archive_filename = os.path.join(db_dir, "resfam", "Resfams.hmm.gz")
    filename = os.path.splitext(archive_filename)[0]
    url = RESFAM_URL

    # checksum of existing not matched because it has a convert timestamp in it
    # So check size and line count as an approximation
    # NOTE(review): confirm RESFAM_SIZE and RESFAM_LINES describe the file
    # *including* the cutoff lines added below, otherwise this check can
    # never succeed and the database is downloaded on every run
    if present_and_size_matches(filename, RESFAM_SIZE) and \
       present_and_line_count_matches(filename, RESFAM_LINES):
        print("Resfams database present and checked")
        return

    print("Downloading Resfam database")
    check_diskspace(url)
    download_if_not_present(url, archive_filename, RESFAM_ARCHIVE_CHECKSUM)
    filename = unzip_file(archive_filename, gzip, gzip.zlib.error)  # type: ignore
    delete_file(filename + ".gz")
    # remove tabs
    converted = execute(["hmmconvert", filename])
    print("Ensuring all cutoffs are present")
    # add TC to those entries missing them
    # calculated as 10% less than the minimum scoring hit in their own group
    missing_cutoffs = {
        "RF0174": int(374 * 0.9),
        "RF0172": int(85 * 0.9),
        "RF0173": int(295 * 0.9),
        "RF0168": int(691 * 0.9),
    }

    # NOTE(review): execute() is not visible here. A single string is split
    # into lines (iterating it directly would yield characters, which never
    # start with "ACC"); anything else is assumed to already iterate by line.
    output = converted.stdout
    if isinstance(output, str):
        lines = output.splitlines(keepends=True)
    else:
        lines = list(output)

    # the output is fully collected before the file is truncated for writing
    with open(filename, "w") as handle:
        # the cutoff value for the current model, set while its CKSUM line
        # has yet to be seen, None for models that need no changes
        pending = None
        for line in lines:
            # every original line is written exactly once, in order
            handle.write(line)
            if line.startswith("ACC"):
                # a new accession, any cutoff from a previous model is dropped
                fields = line.split()
                pending = missing_cutoffs.get(fields[1]) if len(fields) > 1 else None
            elif pending is not None and line.startswith("CKSUM"):
                # add the cutoffs in the same place as other models have them
                for cutoff in ["GA", "TC", "NC"]:
                    handle.write("%s    %d.00 %d.00\n" % (cutoff, pending, pending))
                pending = None

    ensure_database_pressed(filename)
示例#2
0
def download_pfam(db_dir: str, url: str, version: str, archive_checksum: str, db_checksum: str) -> None:
    """ Download and compile the PFAM database.

        Nothing is done if the unpacked database for the given version is
        already present with the expected checksum.

        Arguments:
            db_dir: the base database directory
            url: the location to download the archive from
            version: the PFAM version, used as a subdirectory name
            archive_checksum: the expected checksum of the downloaded archive
            db_checksum: the expected checksum of the unpacked database

        Returns:
            None
    """
    archive_path = os.path.join(db_dir, "pfam", version, "Pfam-A.hmm.gz")
    # the database sits alongside the archive, without the compression suffix
    hmm_path, _ = os.path.splitext(archive_path)

    if not present_and_checksum_matches(hmm_path, db_checksum):
        print("Downloading PFAM version", version)
        check_diskspace(url)
        download_if_not_present(url, archive_path, archive_checksum)
        unpacked = unzip_file(archive_path, gzip, gzip.zlib.error)  # type: ignore
        ensure_database_pressed(unpacked)
        # the archive is only removed once the database has been pressed
        delete_file(unpacked + ".gz")
        return

    print("PFAM file present and ok for version", version)
示例#3
0
def download_tigrfam(db_dir: str) -> None:
    """ Download the TIGRFam database.

        The download is skipped if the database is already present with the
        expected checksum, but the database is pressed in either case.

        Arguments:
            db_dir: the base database directory, the database itself is kept
                    in the "tigrfam" subdirectory

        Returns:
            None
    """
    archive_path = os.path.join(db_dir, "tigrfam", "TIGRFam.hmm.gz")
    hmm_path, _ = os.path.splitext(archive_path)

    if not present_and_checksum_matches(hmm_path, TIGRFAM_CHECKSUM):
        print("Downloading TIGRFam database")
        check_diskspace(TIGRFAM_URL)
        download_if_not_present(TIGRFAM_URL, archive_path, TIGRFAM_ARCHIVE_CHECKSUM)
        # from here on, use the path reported by the unpacking step
        hmm_path = unzip_file(archive_path, gzip, gzip.zlib.error)  # type: ignore
        delete_file(archive_path)
    else:
        print("TIGRFam database present and checked")

    ensure_database_pressed(hmm_path)
示例#4
0
def prepare_data(logging_only: bool = False) -> List[str]:
    """ Ensures packaged data is fully prepared

        The combined seeds file is rebuilt from the profiles listed by
        get_signature_profiles() whenever it is missing or older than any of
        those profiles, and is then handed to hmmer.ensure_database_pressed().

        Arguments:
            logging_only: whether to return error messages instead of raising exceptions

        Returns:
            a list of error messages (only if logging_only is True)

        Raises:
            ValueError: from get_signature_profiles(), if logging_only is False
            OSError: if the seeds file can't be regenerated and logging_only is False
    """
    failure_messages: List[str] = []

    # Check that hmmdetails.txt is readable and well-formatted
    try:
        profiles = get_signature_profiles()
    except ValueError as err:
        if not logging_only:
            raise
        return [str(err)]

    # the path to the markov model
    seeds_hmm = path.get_full_path(__file__, 'data', 'bgc_seeds.hmm')
    # resolve each component once, so that the freshness check below looks at
    # exactly the same files that are read when regenerating
    hmm_files = [
        path.get_full_path(__file__, os.path.join("data", sig.hmm_file))
        for sig in profiles
    ]
    outdated = False
    if not path.locate_file(seeds_hmm):
        logging.debug("%s: %s doesn't exist, regenerating", NAME, seeds_hmm)
        outdated = True
    else:
        seeds_timestamp = os.path.getmtime(seeds_hmm)
        for component in hmm_files:
            if os.path.getmtime(component) > seeds_timestamp:
                logging.debug("%s out of date, regenerating", seeds_hmm)
                outdated = True
                break

    # regenerate if missing or out of date
    if outdated:
        # try to generate file from all specified profiles in hmmdetails
        try:
            # read every component before opening the target for writing:
            # opening truncates it and refreshes its timestamp, so a failed
            # read afterwards would leave a partial file that the check above
            # treats as up to date on the next run
            contents = []
            for hmm_file in hmm_files:
                with open(hmm_file, 'r') as handle:
                    contents.append(handle.read())
            with open(seeds_hmm, 'w') as all_hmms_handle:
                all_hmms_handle.writelines(contents)
        except OSError:
            if not logging_only:
                raise
            failure_messages.append(
                'Failed to generate file {!r}'.format(seeds_hmm))

    # if regeneration failed, don't try to run hmmpress
    if failure_messages:
        return failure_messages

    failure_messages.extend(
        hmmer.ensure_database_pressed(seeds_hmm,
                                      return_not_raise=logging_only))

    return failure_messages
def prepare_data(logging_only: bool = False) -> List[str]:
    """ Ensures the packaged RREFam database has been pressed

        Arguments:
            logging_only: whether to return error messages instead of raising exceptions

        Returns:
            a list of error messages (only if logging_only is True)
    """
    return ensure_database_pressed(
        path.get_full_path(__file__, 'data', 'RREFam.hmm'),
        return_not_raise=logging_only,
    )
示例#6
0
def prepare_data(logging_only: bool = False) -> List[str]:
    """ Ensures each of the packaged HMM databases has been pressed

        Arguments:
            logging_only: whether to return error messages instead of raising exceptions

        Returns:
            a list of error messages (only if logging_only is True)
    """
    model_names = (
        'abmotifs.hmm',
        'dockingdomains.hmm',
        'ksdomains.hmm',
        'nrpspksdomains.hmm',
    )
    problems: List[str] = []
    # each database is handled in turn, with any messages gathered in order
    for name in model_names:
        problems += hmmer.ensure_database_pressed(
            path.get_full_path(__file__, "data", name),
            return_not_raise=logging_only,
        )
    return problems
示例#7
0
def check_prereqs(options: ConfigType) -> List[str]:
    """ Ensure at least one database exists and is valid

        Arguments:
            options: the config object, of which the executables and
                     database_dir attributes are used

        Returns:
            a list of error messages, one for each failing prerequisite
    """
    problems = [
        f"Failed to locate executable: {binary!r}"
        for binary in ['hmmscan']
        if binary not in options.executables
    ]

    # account for database directories mounted into docker containers
    if "mounted_at_runtime" in options.database_dir:
        return problems

    tigr_dir = os.path.join(options.database_dir, "tigrfam")
    tigr_db = os.path.join(tigr_dir, "TIGRFam.hmm")
    if not path.locate_file(tigr_db):
        problems.append(f"Failed to locate TIGRFam db in {tigr_dir}")

    # pressing is attempted whether or not the file was located above
    problems.extend(
        hmmer.ensure_database_pressed(tigr_db, return_not_raise=True))

    return problems