def download_resfam(db_dir: str) -> None:
    """Download and sanitise the Resfam database.

    The archive is fetched and unpacked, normalised with hmmconvert, then
    profiles known to lack trusted cutoffs get GA/TC/NC lines inserted
    before the result is pressed for use with hmmscan.

    Arguments:
        db_dir: the base directory in which to store the database

    Returns:
        None
    """
    archive_filename = os.path.join(db_dir, "resfam", "Resfams.hmm.gz")
    filename = os.path.splitext(archive_filename)[0]
    url = RESFAM_URL

    # checksum of existing not matched because it has a convert timestamp in it
    # So check size and line count as an approximation
    if present_and_size_matches(filename, RESFAM_SIZE) and \
            present_and_line_count_matches(filename, RESFAM_LINES):
        print("Resfams database present and checked")
        return

    print("Downloading Resfam database")
    check_diskspace(url)
    download_if_not_present(url, archive_filename, RESFAM_ARCHIVE_CHECKSUM)
    filename = unzip_file(archive_filename, gzip, gzip.zlib.error)  # type: ignore
    delete_file(filename + ".gz")

    # remove tabs
    converted = execute(["hmmconvert", filename])

    print("Ensuring all cutoffs are present")
    # add TC to those entries missing them
    # calculated as 10% less than the minimum scoring hit in their own group
    missing_cutoffs = {
        "RF0174": int(374 * 0.9),
        "RF0172": int(85 * 0.9),
        "RF0173": int(295 * 0.9),
        "RF0168": int(691 * 0.9),
    }
    with open(filename, "w") as handle:
        lines = list(converted.stdout)
        i = 0
        while i < len(lines):
            # find an accession
            while i < len(lines) and not lines[i].startswith("ACC"):
                handle.write(lines[i])
                i += 1
            # end of file with no new accession
            if i >= len(lines):
                break
            # write the accession line itself
            handle.write(lines[i])
            acc = lines[i].split()[1]
            # BUGFIX: advance past the accession line here; previously `i` was
            # left unchanged, so the `continue` below re-processed the same
            # ACC line forever (infinite loop) and the CKSUM scan duplicated it
            i += 1
            # add the cutoffs if missing
            if acc not in missing_cutoffs:
                continue
            value = missing_cutoffs[acc]
            # an accession of interest, so add cutoffs in the same place as others
            while not lines[i].startswith("CKSUM"):
                handle.write(lines[i])
                i += 1
            # write the CKSUM line
            handle.write(lines[i])
            # and finally add the cutoffs
            for cutoff in ["GA", "TC", "NC"]:
                handle.write("%s %d.00 %d.00\n" % (cutoff, value, value))
            i += 1
    ensure_database_pressed(filename)
def download_pfam(db_dir: str, url: str, version: str, archive_checksum: str, db_checksum: str) -> None:
    """Download and compile the PFAM database.

    Arguments:
        db_dir: the base directory in which to store the database
        url: the URL to fetch the archive from
        version: the PFAM version being fetched
        archive_checksum: the expected checksum of the compressed archive
        db_checksum: the expected checksum of the uncompressed database

    Returns:
        None
    """
    archive = os.path.join(db_dir, "pfam", version, "Pfam-A.hmm.gz")
    database = os.path.splitext(archive)[0]

    # skip all work when an intact copy of this version is already on disk
    if present_and_checksum_matches(database, db_checksum):
        print("PFAM file present and ok for version", version)
        return

    print("Downloading PFAM version", version)
    check_diskspace(url)
    download_if_not_present(url, archive, archive_checksum)
    unpacked = unzip_file(archive, gzip, gzip.zlib.error)  # type: ignore
    ensure_database_pressed(unpacked)
    delete_file(unpacked + ".gz")
def download_tigrfam(db_dir: str) -> None:
    """Download the TIGRFam database.

    Arguments:
        db_dir: the base directory in which to store the database

    Returns:
        None
    """
    archive = os.path.join(db_dir, "tigrfam", "TIGRFam.hmm.gz")
    database = os.path.splitext(archive)[0]

    # only fetch when the existing copy is absent or fails its checksum
    if not present_and_checksum_matches(database, TIGRFAM_CHECKSUM):
        print("Downloading TIGRFam database")
        check_diskspace(TIGRFAM_URL)
        download_if_not_present(TIGRFAM_URL, archive, TIGRFAM_ARCHIVE_CHECKSUM)
        database = unzip_file(archive, gzip, gzip.zlib.error)  # type: ignore
        delete_file(archive)
    else:
        print("TIGRFam database present and checked")

    ensure_database_pressed(database)
def prepare_data(logging_only: bool = False) -> List[str]:
    """ Ensures packaged data is fully prepared

        Arguments:
            logging_only: whether to return error messages instead of raising exceptions

        Returns:
            a list of error messages (only if logging_only is True)
    """
    failure_messages: List[str] = []

    # Check that hmmdetails.txt is readable and well-formatted
    try:
        profiles = get_signature_profiles()
    except ValueError as err:
        if not logging_only:
            raise
        return [str(err)]

    # the path to the markov model
    seeds_hmm = path.get_full_path(__file__, 'data', 'bgc_seeds.hmm')
    # BUGFIX: resolve each component profile relative to this module up front;
    # previously getmtime() was given cwd-relative "data/..." paths while the
    # reads used module-relative paths, breaking the timestamp check whenever
    # the working directory was not the module directory
    hmm_files = [path.get_full_path(__file__, "data", sig.hmm_file) for sig in profiles]

    outdated = False
    if not path.locate_file(seeds_hmm):
        logging.debug("%s: %s doesn't exist, regenerating", NAME, seeds_hmm)
        outdated = True
    else:
        seeds_timestamp = os.path.getmtime(seeds_hmm)
        for component in hmm_files:
            if os.path.getmtime(component) > seeds_timestamp:
                logging.debug("%s out of date, regenerating", seeds_hmm)
                outdated = True
                break

    # regenerate if missing or out of date
    if outdated:
        # try to generate file from all specified profiles in hmmdetails
        try:
            with open(seeds_hmm, 'w') as all_hmms_handle:
                for hmm_file in hmm_files:
                    with open(hmm_file, 'r') as handle:
                        all_hmms_handle.write(handle.read())
        except OSError:
            if not logging_only:
                raise
            failure_messages.append('Failed to generate file {!r}'.format(seeds_hmm))

    # if regeneration failed, don't try to run hmmpress
    if failure_messages:
        return failure_messages

    failure_messages.extend(
        hmmer.ensure_database_pressed(seeds_hmm, return_not_raise=logging_only))

    return failure_messages
def prepare_data(logging_only: bool = False) -> List[str]:
    """ Ensures packaged data is fully prepared

        Arguments:
            logging_only: whether to return error messages instead of raising exceptions

        Returns:
            a list of error messages (only if logging_only is True)
    """
    # the only packaged data is a single HMM profile set, so just press it
    rrefam_database = path.get_full_path(__file__, 'data', 'RREFam.hmm')
    return ensure_database_pressed(rrefam_database, return_not_raise=logging_only)
def prepare_data(logging_only: bool = False) -> List[str]:
    """ Ensures packaged data is fully prepared

        Arguments:
            logging_only: whether to return error messages instead of raising exceptions

        Returns:
            a list of error messages (only if logging_only is True)
    """
    errors: List[str] = []
    # every packaged profile database must be pressed before use
    models = [
        'abmotifs.hmm',
        'dockingdomains.hmm',
        'ksdomains.hmm',
        'nrpspksdomains.hmm',
    ]
    for model in models:
        database = path.get_full_path(__file__, "data", model)
        errors.extend(hmmer.ensure_database_pressed(database, return_not_raise=logging_only))
    return errors
def check_prereqs(options: ConfigType) -> List[str]:
    """ Ensure at least one database exists and is valid """
    problems = []
    required_binaries = ['hmmscan']
    for binary_name in required_binaries:
        if binary_name not in options.executables:
            problems.append(f"Failed to locate executable: {binary_name!r}")

    # account for database directories mounted into docker containers
    if "mounted_at_runtime" in options.database_dir:
        return problems

    tigrfam_dir = os.path.join(options.database_dir, "tigrfam")
    tigr_db = os.path.join(tigrfam_dir, "TIGRFam.hmm")
    if not path.locate_file(tigr_db):
        problems.append(f"Failed to locate TIGRFam db in {tigrfam_dir}")

    problems.extend(hmmer.ensure_database_pressed(tigr_db, return_not_raise=True))

    return problems