def cblaster(
    query_file=None,
    query_ids=None,
    mode=None,
    json_db=None,
    database=None,
    gap=20000,
    unique=3,
    min_hits=3,
    min_identity=30,
    min_coverage=50,
    max_evalue=0.01,
    entrez_query=None,
    output=None,
    output_hide_headers=False,
    output_delimiter=None,
    output_decimals=4,
    binary=None,
    binary_hide_headers=True,
    binary_delimiter=None,
    binary_key=len,
    binary_attr="identity",
    binary_decimals=4,
    rid=None,
    require=None,
    session_file=None,
    indent=None,
    plot=False,
    recompute=False,
    blast_file=None,
    ipg_file=None,
    hitlist_size=None,
):
    """Run cblaster.

    This function is the central workflow for the entire cblaster package.

    Arguments:
        query_file (str): Path to FASTA format query file
        query_ids (list): NCBI protein sequence identifiers
        mode (str): Search mode ('local' or 'remote')
        json_db (str): JSON database created with cblaster makedb
        database (str): Search database (NCBI if remote, DIAMOND if local)
        gap (int): Maximum gap (bp) between cluster hits
        unique (int): Minimum number of query sequences with hits in clusters
        min_hits (int): Minimum number of hits in clusters
        min_identity (float): Minimum identity (%) cutoff
        min_coverage (float): Minimum coverage (%) cutoff
        max_evalue (float): Maximum e-value threshold
        entrez_query (str): NCBI Entrez query to filter search database
        output (str): Path to cblaster summary output file
        output_hide_headers (bool): Hide headers in summary table
        output_delimiter (str): Delimiter used in summary table
        output_decimals (int): Total decimal places in hit scores in summary table
        binary (str): Path to cblaster binary output file
        binary_hide_headers (bool): Hide headers in binary table
        binary_delimiter (str): Delimiter used in binary table
        binary_key (str): Key function used in binary table (len, max or sum)
        binary_attr (str): Hit attribute used for calculating cell values in binary table
        binary_decimals (int): Total decimal places in cell values in binary table
        rid (str): NCBI BLAST search request identifier (RID)
        require (list): Query sequences that must be in hit clusters
        session_file (list): Path(s) to cblaster session JSON files
        indent (int): Total spaces to indent JSON files
        plot (str): Path to cblaster plot HTML file
        recompute (str): Path to recomputed session JSON file
        blast_file (str): Path to file to save BLAST output
        ipg_file (str): Path to file to save IPG output
        hitlist_size (int): Number of database sequences to keep

    Returns:
        Session: cblaster search Session object
    """
    if session_file and all(Path(sf).exists() for sf in session_file):
        LOG.info("Loading session(s) %s", session_file)
        session = Session.from_files(session_file)

        if recompute:
            LOG.info("Filtering session with new thresholds")
            context.filter_session(
                session,
                min_identity,
                min_coverage,
                max_evalue,
                gap,
                unique,
                min_hits,
                require,
            )
            if recompute is not True:
                LOG.info("Writing recomputed session to %s", recompute)
                with open(recompute, "w") as fp:
                    session.to_json(fp, indent=indent)
    else:
        session = Session(
            queries=query_ids if query_ids else [],
            sequences=helpers.get_sequences(
                query_file=query_file,
                query_ids=query_ids,
            ),
            params={
                "mode": mode,
                "database": database,
                "min_identity": min_identity,
                "min_coverage": min_coverage,
                "max_evalue": max_evalue,
            },
        )
        if query_file:
            # get_sequences() returns OrderedDict, so save keys to
            # preserve query order
            session.queries = list(session.sequences)
            session.params["query_file"] = query_file
        if json_db:
            session.params["json_db"] = json_db

        if mode == "local":
            LOG.info("Starting cblaster in local mode")
            results = local.search(
                database,
                sequences=session.sequences,
                min_identity=min_identity,
                min_coverage=min_coverage,
                max_evalue=max_evalue,
                blast_file=blast_file,
            )
        elif mode == "remote":
            LOG.info("Starting cblaster in remote mode")
            if entrez_query:
                session.params["entrez_query"] = entrez_query
            rid, results = remote.search(
                sequences=session.sequences,
                rid=rid,
                database=database,
                min_identity=min_identity,
                min_coverage=min_coverage,
                max_evalue=max_evalue,
                entrez_query=entrez_query,
                blast_file=blast_file,
                hitlist_size=hitlist_size,
            )
            session.params["rid"] = rid

        LOG.info("Found %i hits meeting score thresholds", len(results))
        LOG.info("Fetching genomic context of hits")

        # Only pass an explicit query order for genome file queries; guard against
        # query_file being None when query_ids were supplied instead
        query_sequence_order = (
            list(session.sequences.keys())
            if query_file and any(
                query_file.endswith(ext)
                for ext in (".gbk", ".gb", ".genbank", ".gbff", ".embl", ".emb")
            )
            else None
        )

        session.organisms = context.search(
            results,
            unique=unique,
            min_hits=min_hits,
            gap=gap,
            require=require,
            json_db=json_db,
            ipg_file=ipg_file,
            query_sequence_order=query_sequence_order,
        )

    if session_file:
        LOG.info("Writing current search session to %s", session_file[0])
        if len(session_file) > 1:
            LOG.warning("Multiple session files specified, using first")
        with open(session_file[0], "w") as fp:
            session.to_json(fp, indent=indent)

    if binary:
        LOG.info("Writing binary summary table to %s", binary)
        session.format(
            "binary",
            open(binary, "w"),
            hide_headers=binary_hide_headers,
            delimiter=binary_delimiter,
            key=binary_key,
            attr=binary_attr,
            decimals=binary_decimals,
        )

    LOG.info("Writing summary to %s", "stdout" if output is None else output)
    session.format(
        "summary",
        fp=open(output, "w") if output else sys.stdout,
        hide_headers=output_hide_headers,
        delimiter=output_delimiter,
        decimals=output_decimals,
    )

    if plot:
        plot = None if plot is True else plot
        plot_session(session, output=plot)

    LOG.info("Done.")
    return session
def cblaster(
    query_file=None,
    query_ids=None,
    query_profiles=None,
    mode=None,
    databases=None,
    database_pfam=None,
    gap=20000,
    unique=3,
    min_hits=3,
    min_identity=30,
    min_coverage=50,
    max_evalue=0.01,
    percentage=None,
    entrez_query=None,
    output=None,
    output_hide_headers=False,
    output_delimiter=None,
    output_decimals=4,
    output_sort_clusters=False,
    binary=None,
    binary_hide_headers=True,
    binary_delimiter=None,
    binary_key=len,
    binary_attr="identity",
    binary_decimals=4,
    rid=None,
    require=None,
    session_file=None,
    indent=None,
    plot=False,
    max_plot_clusters=50,
    recompute=False,
    blast_file=None,
    ipg_file=None,
    hitlist_size=None,
    cpus=None,
    intermediate_genes=False,
    intermediate_gene_distance=5000,
    intermediate_max_clusters=100,
    testing=False,
):
    """Run cblaster.

    This function is the central workflow for the entire cblaster package.

    Arguments:
        query_file (str): Path to FASTA format query file
        query_ids (list): NCBI protein sequence identifiers
        query_profiles (list): Pfam profile identifiers
        mode (str): Search mode ('local' or 'remote')
        databases (list): Search database(s) (NCBI if remote, DIAMOND if local;
            two databases for combi_ modes)
        database_pfam (str): Path to the Pfam database, or where to download it
        gap (int): Maximum gap (bp) between cluster hits
        unique (int): Minimum number of query sequences with hits in clusters
        min_hits (int): Minimum number of hits in clusters
        min_identity (float): Minimum identity (%) cutoff
        min_coverage (float): Minimum coverage (%) cutoff
        max_evalue (float): Maximum e-value threshold
        percentage (int): Minimum percentage of query genes that must be present in a cluster
        entrez_query (str): NCBI Entrez query to filter search database
        output (str): Path to cblaster summary output file
        output_hide_headers (bool): Hide headers in summary table
        output_delimiter (str): Delimiter used in summary table
        output_decimals (int): Total decimal places in hit scores in summary table
        output_sort_clusters (bool): Whether clusters in the final summary table should be sorted
        binary (str): Path to cblaster binary output file
        binary_hide_headers (bool): Hide headers in binary table
        binary_delimiter (str): Delimiter used in binary table
        binary_key (str): Key function used in binary table (len, max or sum)
        binary_attr (str): Hit attribute used for calculating cell values in binary table
        binary_decimals (int): Total decimal places in cell values in binary table
        rid (str): NCBI BLAST search request identifier (RID)
        require (list): Query sequences that must be in hit clusters
        session_file (list): Path(s) to cblaster session JSON files
        indent (int): Total spaces to indent JSON files
        plot (str): Path to cblaster plot HTML file
        max_plot_clusters (int): Maximum number of clusters plotted when the
            -osc (sort on score) argument is used
        recompute (str): Path to recomputed session JSON file
        blast_file (str): Path to file to save BLAST output
        ipg_file (str): Path to file to save IPG output
        cpus (int): Number of CPUs to use when running BLAST
        intermediate_genes (bool): Whether to fetch and show intermediate genes
        hitlist_size (int): Number of database sequences to keep
        intermediate_gene_distance (int): Maximum allowed distance between the
            edge of a cluster and an intermediate gene
        intermediate_max_clusters (int): Maximum number of clusters for which
            intermediate genes will be fetched, since this can become expensive
            for remote searches
        testing (bool): Flag to prevent certain code from running during testing

    Returns:
        Session: cblaster search Session object
    """
    if session_file and all(Path(sf).exists() for sf in session_file):
        LOG.info("Loading session(s) %s", session_file)
        session = Session.from_files(session_file)

        if recompute:
            LOG.info("Filtering session with new thresholds")
            context.filter_session(
                session,
                min_identity,
                min_coverage,
                max_evalue,
                gap,
                unique,
                min_hits,
                require,
                percentage,
            )
            if intermediate_genes:
                find_intermediate_genes(session, intermediate_gene_distance, intermediate_max_clusters)
            if recompute is not True:
                LOG.info("Writing recomputed session to %s", recompute)
                session.params["min_identity"] = min_identity
                session.params["min_coverage"] = min_coverage
                session.params["max_evalue"] = max_evalue
                session.params["require"] = require
                with open(recompute, "w") as fp:
                    session.to_json(fp, indent=indent)
    else:
        # Create a cblaster Cluster object from query input
        query = helpers.parse_query_sequences(
            query_file=query_file,
            query_ids=query_ids,
            query_profiles=query_profiles,
        )

        # Create a cblaster Session
        session = Session(
            query=query,
            queries=query.names,
            params={
                "mode": mode,
                "database": databases,
                "min_identity": min_identity,
                "min_coverage": min_coverage,
                "max_evalue": max_evalue,
                "require": require,
            },
        )
        if query_file:
            # Record the query file path in the session parameters
            session.params["query_file"] = query_file

        sqlite_db = None
        session.params["rid"] = rid

        if "combi" in mode and len(databases) != 2:
            raise RuntimeError("Expected two databases for 'combi_' modes")

        if mode in ("hmm", "combi_local", "combi_remote"):
            sqlite_db = helpers.find_sqlite_db(databases[0])
            results = hmm_search.perform_hmmer(
                fasta=databases[0],
                query_profiles=query_profiles,
                pfam=database_pfam,
                session=session,
            )

            # Delete first (FASTA) database when doing combined searches
            # Expect .dmnd/NCBI database name for local/remote, respectively
            if "combi" in mode:
                del databases[0]

            LOG.info("Found %i hits meeting score thresholds for hmm search", len(results))
            LOG.info("Fetching genomic context of hits")
            organisms = context.search(
                results,
                sqlite_db=sqlite_db,
                unique=unique,
                min_hits=min_hits,
                gap=gap,
                require=require,
                ipg_file=ipg_file,
                query_sequence_order=session.queries,
                percentage=percentage,
            )
            session.organisms.extend(organisms)

        # When running combined modes, run local/remote search right after HMM search
        if mode == "combi_local":
            mode = "local"
        elif mode == "combi_remote":
            mode = "remote"

        if mode == "local":
            LOG.info("Starting cblaster in local mode")
            sqlite_db = helpers.find_sqlite_db(databases[0])
            results = local.search(
                databases[0],
                sequences=session.query.sequences,
                min_identity=min_identity,
                min_coverage=min_coverage,
                max_evalue=max_evalue,
                blast_file=blast_file,
                cpus=cpus,
            )
            LOG.info("Found %i hits meeting score thresholds for local search", len(results))
            LOG.info("Fetching genomic context of hits")
            organisms = context.search(
                results,
                sqlite_db=sqlite_db,
                unique=unique,
                min_hits=min_hits,
                gap=gap,
                require=require,
                ipg_file=ipg_file,
                query_sequence_order=session.queries,
                percentage=percentage,
            )
            session.organisms.extend(organisms)

        elif mode == "remote":
            LOG.info("Starting cblaster in remote mode")
            if entrez_query:
                session.params["entrez_query"] = entrez_query
            rid, results = remote.search(
                sequences=session.query.sequences,
                rid=rid,
                database=databases[0],
                min_identity=min_identity,
                min_coverage=min_coverage,
                max_evalue=max_evalue,
                entrez_query=entrez_query,
                blast_file=blast_file,
                hitlist_size=hitlist_size,
            )
            session.params["rid"] = rid
            LOG.info("Found %i hits meeting score thresholds for remote search", len(results))
            LOG.info("Fetching genomic context of hits")
            organisms = context.search(
                results,
                unique=unique,
                min_hits=min_hits,
                gap=gap,
                require=require,
                ipg_file=ipg_file,
                query_sequence_order=session.queries,
                percentage=percentage,
            )
            session.organisms.extend(organisms)

        if sqlite_db:
            session.params["sqlite_db"] = str(sqlite_db)

        if intermediate_genes:
            find_intermediate_genes(session, intermediate_gene_distance, intermediate_max_clusters)

    if session_file:
        LOG.info("Writing current search session to %s", session_file[0])
        if len(session_file) > 1:
            LOG.warning("Multiple session files specified, using first")
        with open(session_file[0], "w") as fp:
            session.to_json(fp, indent=indent)

    if binary:
        LOG.info("Writing binary summary table to %s", binary)
        session.format(
            "binary",
            open(binary, "w"),
            hide_headers=binary_hide_headers,
            delimiter=binary_delimiter,
            key=binary_key,
            attr=binary_attr,
            decimals=binary_decimals,
            sort_clusters=output_sort_clusters,
        )

    LOG.info("Writing summary to %s", "stdout" if output is None else output)
    session.format(
        "summary",
        fp=open(output, "w") if output else sys.stdout,
        hide_headers=output_hide_headers,
        delimiter=output_delimiter,
        decimals=output_decimals,
        sort_clusters=output_sort_clusters,
    )

    if plot:
        plot = None if plot is True else plot
        plot_session(
            session,
            output=plot,
            sort_clusters=output_sort_clusters,
            max_clusters=max_plot_clusters,
            testing=testing,
        )

    LOG.info("Done.")
    return session
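
# Illustrative usage sketch (not part of the original module): call the newer
# signature above for a local DIAMOND search, where `databases` is a list and
# intermediate-gene and sorting options are available. "query.gbk",
# "local.dmnd" and the output paths are hypothetical placeholders; the DIAMOND
# database is assumed to have been built with cblaster makedb.
def _example_local_search():
    """Minimal example of a local search with intermediate gene fetching."""
    return cblaster(
        query_file="query.gbk",            # hypothetical GenBank query file
        mode="local",
        databases=["local.dmnd"],          # hypothetical DIAMOND database path
        cpus=4,
        output="summary.tsv",              # hypothetical summary output path
        output_sort_clusters=True,         # sort clusters in the summary (-osc)
        binary="binary.tsv",               # hypothetical binary table path
        session_file=["session.json"],     # list of paths; the first one is written
        intermediate_genes=True,           # also fetch genes between cluster hits
        intermediate_gene_distance=5000,
    )
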