def run_mmseqs(seqs1, seqs2):
    """
    Equivalent to blast_seqs() but uses mmseqs and thus is much faster.

    Both sequence lists are written to FASTA files via ``write_seqs_to_file``,
    ``mmseqs easy-search`` is run with a single thread, and the raw tabular
    result is read back into memory.

    :param seqs1: list of sequences to compare (query)
    :param seqs2: list of sequences to be compared against (target)
    :return: tuple of (raw mmseqs output text with trailing newlines stripped,
             list of column names for that output)
    """
    query_fasta = write_seqs_to_file(seqs1)
    target_fasta = write_seqs_to_file(seqs2)
    # NOTE(review): the two FASTA files above are not removed here - confirm
    # whether write_seqs_to_file or the caller is responsible for cleanup.

    # This needs at least mmseqs v8. The explicit version check is currently
    # disabled; calling the bare executable still fails fast (FileNotFoundError)
    # when mmseqs is not on PATH. Output is piped only to keep it off the console.
    subprocess.run(["mmseqs"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # A single private workspace holds both the result file and the scratch
    # directory mmseqs asks for. The context manager guarantees everything is
    # removed again, even if the search fails, and avoids relying on private
    # tempfile helpers to invent an output file name.
    with tempfile.TemporaryDirectory() as workdir_name:
        workdir = Path(workdir_name)
        outfile = workdir / "mmseqs_result.dat"
        mmseqs_tmp = workdir / "tmp"
        mmseqs_tmp.mkdir()

        cmd = (
            f"mmseqs easy-search {query_fasta} {target_fasta} {outfile} {mmseqs_tmp} "
            f"--threads 1 --split-memory-limit {max_mem_use} --search-type 3"
        )
        run_subprocess(cmd, get_stdout=True)

        # Read the result before the workspace is deleted on leaving the block.
        with open(outfile) as f:
            mmseqs_output = f.read().rstrip("\n")

    # I've renamed these for consistency with blast output
    columns = [
        "qseqid", "sseqid", "pident", "alnlen", "mismatch", "gapopen",
        "qstart", "qend", "tstart", "tend", "evalue", "bitscore",
    ]
    return mmseqs_output, columns
def call_muscle(infile: Path) -> Path:
    """
    Align a FASTA file with MUSCLE, writing the alignment over the input.

    The same path is handed to muscle as both ``-in`` and ``-out``, so the
    original file content is replaced by its aligned version.
    https://www.drive5.com/muscle/

    :param infile: Path to the FASTA file to align
    :return: The path that was passed in
    """
    # NOTE(review): a second ``call_muscle`` is defined further down in this
    # file; if both live in the same module the later one replaces this one.
    target = f"{infile}"
    muscle_cmd = " ".join(["muscle", "-in", target, "-out", target, "-maxiters", "1"])
    run_subprocess(muscle_cmd, get_stdout=True)
    return infile
def call_mmseqs_linclust(database: Path, outdir: Path, n_cpu: int, min_seq_id: float, alignment_mode: int,
                         coverage_length: float) -> Path:
    """
    Run ``mmseqs linclust`` on a sequence database.

    A scratch directory ``<outdir>/tmp`` is wiped if it already exists,
    recreated for the run and removed again once the command has returned.
    The captured command output is forwarded to the mmseqs logger.

    linclust option explanation (carried over from the original notes):
        --min-seq-id        only list matches above this sequence identity
        --alignment-mode    0: automatic; 1: only score and end_pos;
                            2: also start_pos and cov; 3: also seq.id
        --cov-mode 1        noted originally as "coverage of query and target",
                            with the coverage threshold given by -c
                            NOTE(review): the MMseqs2 docs appear to list mode 1
                            as target coverage - confirm which is intended
        -c                  coverage threshold (``coverage_length``)
        --sort-results 1    sort by evalue
    The original notes also described ``--max-iterations``, which is not
    passed by this call.

    :param database: Path to the mmseqs sequence database to cluster
    :param outdir: Directory that receives the cluster database and scratch dir
    :param n_cpu: Value passed to ``--threads``
    :param min_seq_id: Value passed to ``--min-seq-id``
    :param alignment_mode: Value passed to ``--alignment-mode``
    :param coverage_length: Value passed to ``-c``
    :return: Path to the cluster database, ``<outdir>/<database name without suffix>_clusterDB``
    """
    cluster_db = outdir / f"{database.with_suffix('').name}_clusterDB"

    # Always start from an empty scratch directory
    scratch = outdir / "tmp"
    if scratch.exists():
        shutil.rmtree(scratch)
    scratch.mkdir(parents=True)

    linclust_cmd = " ".join([
        f"mmseqs linclust {database} {cluster_db} {scratch}",
        f"--threads {n_cpu} --min-seq-id {min_seq_id} --add-self-matches --alignment-mode {alignment_mode}",
        f"--cov-mode 1 -c {coverage_length} --sort-results 1",
    ])
    captured = run_subprocess(linclust_cmd, get_stdout=True)
    log_subprocess_output(captured, logger_instance=mmseqs_log)

    # Scratch data is no longer needed once linclust has finished
    if scratch.exists():
        shutil.rmtree(scratch)
    return cluster_db
def call_mmseqs_createdb(fasta: Path, outdir: Path) -> Path:
    """
    Run ``mmseqs createdb`` to turn a FASTA file into an mmseqs database.

    The captured command output is forwarded to the mmseqs logger.

    :param fasta: Path to the input FASTA file
    :param outdir: Directory in which the database is created
    :return: Path to the database, ``<outdir>/<fasta name without suffix>_DB``
    """
    db_name = fasta.with_suffix("").name + "_DB"
    db_path = outdir / db_name
    captured = run_subprocess(f"mmseqs createdb {fasta} {db_path}", get_stdout=True)
    log_subprocess_output(captured, logger_instance=mmseqs_log)
    return db_path
def call_mmseqs_result2tsv(database: Path, cluster_database: Path, outdir: Path) -> Path:
    """
    Export a cluster database to a .tsv file.

    Despite the function name, the command issued is ``mmseqs createtsv``
    (with ``--first-seq-as-repr true``); ``database`` is supplied as both the
    first and the second positional argument. The captured command output is
    forwarded to the mmseqs logger.

    :param database: Path to the mmseqs sequence database
    :param cluster_database: Path to the cluster database to export
    :param outdir: Directory that receives the .tsv file
    :return: Path to the .tsv, ``<outdir>/<database name with suffix replaced by .cluster.tsv>``
    """
    tsv_name = database.with_suffix(".cluster.tsv").name
    tsv_path = outdir / tsv_name
    createtsv_cmd = " ".join([
        "mmseqs createtsv",
        f"{database}",
        f"{database}",
        f"{cluster_database}",
        f"{tsv_path}",
        "--first-seq-as-repr true",
    ])
    captured = run_subprocess(createtsv_cmd, get_stdout=True)
    log_subprocess_output(captured, logger_instance=mmseqs_log)
    return tsv_path
def call_prokka(fasta_path: Path, sample_id: str, outdir: Path, n_cpu: int) -> Optional[ProkkaObject]:
    """
    Annotate a FASTA file with Prokka and collect the results.

    Prokka is invoked with ``--centre CORE --compliant --kingdom Bacteria``;
    ``sample_id`` is used for both ``--prefix`` and ``--locustag``. The
    captured command output is forwarded to the prokka logger, after which a
    ProkkaObject is built from the output directory.

    :param fasta_path: Path to the FASTA file to annotate
    :param sample_id: Identifier used as Prokka prefix and locus tag
    :param outdir: Directory Prokka writes its results to
    :param n_cpu: Value passed to ``--cpus``
    :return: Whatever ``prokka_obj_from_results_dir`` yields for ``outdir``
    """
    prokka_args = [
        "prokka",
        "--centre", "CORE",
        "--compliant",
        "--kingdom", "Bacteria",
        "--cpus", f"{n_cpu}",
        "--prefix", f"{sample_id}",
        "--locustag", f"{sample_id}",
        "--outdir", f"{outdir}",
        f"{fasta_path}",
    ]
    captured = run_subprocess(" ".join(prokka_args), get_stdout=True)
    log_subprocess_output(captured, logger_instance=prokka_log)

    # cleanup_prokka(prokka_dir=outdir)  # TODO: Turn this on - will remove extraneous Prokka results
    return prokka_obj_from_results_dir(prokka_dir=outdir)
def call_mmseqs_result2repseq(database: Path, cluster_database: Path, outdir: Path, n_cpu: int) -> Path:
    """
    Run ``mmseqs result2repseq`` to keep only the representative sequence of
    each cluster.

    The captured command output is forwarded to the mmseqs logger.

    :param database: Path to the mmseqs sequence database
    :param cluster_database: Path to the cluster database
    :param outdir: Directory that receives the output
    :param n_cpu: Value passed to ``--threads``
    :return: Output path, ``<outdir>/<database name with suffix replaced by .representative_sequences>``
    """
    repseq_name = database.with_suffix(".representative_sequences").name
    repseq_path = outdir / repseq_name
    repseq_cmd = " ".join([
        "mmseqs result2repseq",
        f"{database}",
        f"{cluster_database}",
        f"{repseq_path}",
        f"--threads {n_cpu}",
    ])
    captured = run_subprocess(repseq_cmd, get_stdout=True)
    log_subprocess_output(captured, logger_instance=mmseqs_log)
    return repseq_path
def call_mmseqs_createseqfiledb(database: Path, cluster_database: Path, outdir: Path, min_sequences: int = None,
                                max_sequences: int = None) -> Path:
    """
    Run ``mmseqs createseqfiledb`` for a cluster database.

    ``--min-sequences`` / ``--max-sequences`` are only added to the command
    when the corresponding argument is not None. The captured command output
    is forwarded to the mmseqs logger.

    :param database: Path to the mmseqs sequence database
    :param cluster_database: Path to the cluster database
    :param outdir: Directory that receives the output database
    :param min_sequences: Optional value for ``--min-sequences``
    :param max_sequences: Optional value for ``--max-sequences``
    :return: Output path, ``<outdir>/<database name without suffix>_clusterSEQ``
    """
    seqfile_db = outdir / (database.with_suffix("").name + "_clusterSEQ")

    # Every fragment carries its own trailing space, so a plain concatenation
    # yields the final command line.
    fragments = [f"mmseqs createseqfiledb {database} {cluster_database} {seqfile_db} "]
    optional_flags = (
        ("--min-sequences", min_sequences),
        ("--max-sequences", max_sequences),
    )
    for flag, value in optional_flags:
        if value is not None:
            fragments.append(f"{flag} {value} ")

    captured = run_subprocess("".join(fragments), get_stdout=True)
    log_subprocess_output(captured, logger_instance=mmseqs_log)
    return seqfile_db
def call_snp_sites(aligned_multifasta: Path, outdir: Path) -> Path:
    """
    Calls snp-sites on an aligned multiFASTA file and produces a VCF file as output.
    Will only generate an output file if variants are detected, so the returned
    path is not guaranteed to exist.
    https://github.com/sanger-pathogens/snp-sites

    :param aligned_multifasta: Path to multi-FASTA containing alignment of a core gene
    :param outdir: Path to desired output directory
    :return: Path to VCF, ``<outdir>/<input name with suffix replaced by .vcf>``
    """
    outvcf = outdir / aligned_multifasta.with_suffix(".vcf").name
    cmd = f"snp-sites -v -o {outvcf} {aligned_multifasta}"
    # The command's captured output is not used by this function, so it is
    # intentionally not bound to a local name.
    run_subprocess(cmd, get_stdout=True)
    return outvcf
def call_mmseqs_cluster(database: Path, outdir: Path, n_cpu: int, min_seq_id: float) -> Path:
    """
    Run ``mmseqs cluster`` on a sequence database.

    A scratch directory ``<outdir>/tmp`` is wiped if it already exists,
    recreated for the run and removed again once the command has returned.
    The captured command output is forwarded to the mmseqs logger.

    :param database: Path to the mmseqs sequence database to cluster
    :param outdir: Directory that receives the cluster database and scratch dir
    :param n_cpu: Value passed to ``--threads``
    :param min_seq_id: Value passed to ``--min-seq-id``
    :return: Path to the cluster database, ``<outdir>/<database name without suffix>_clusterDB``
    """
    cluster_db = outdir / f"{database.with_suffix('').name}_clusterDB"

    # Always start from an empty scratch directory
    scratch = outdir / "tmp"
    if scratch.exists():
        shutil.rmtree(scratch)
    scratch.mkdir(parents=True)

    cluster_cmd = " ".join([
        f"mmseqs cluster {database} {cluster_db} {scratch}",
        f"--threads {n_cpu} --min-seq-id {min_seq_id} --add-self-matches",
    ])
    captured = run_subprocess(cluster_cmd, get_stdout=True)
    log_subprocess_output(captured, logger_instance=mmseqs_log)

    # Scratch data is no longer needed once clustering has finished
    if scratch.exists():
        shutil.rmtree(scratch)
    return cluster_db
def call_mmseqs_result2flat(database: Path, outdir: Path, cluster_seqs: Path, representative_sequences: bool = False,
                            use_fasta_header: bool = False) -> Path:
    """
    Run ``mmseqs result2flat`` to produce a FASTA file from a result database.

    ``database`` is supplied as both the first and the second positional
    argument. The output file name is derived from ``database`` and depends on
    ``representative_sequences``. The captured command output is forwarded to
    the mmseqs logger.

    :param database: Path to the mmseqs sequence database
    :param outdir: Directory that receives the FASTA file
    :param cluster_seqs: Path to the result database to flatten
    :param representative_sequences: If True the output suffix is
        ``.representative_sequences.faa``, otherwise ``.cluster_sequences.faa``
    :param use_fasta_header: If True, ``--use-fasta-header`` is appended to the command
    :return: Path to the FASTA file that was requested
    """
    # Produce FASTA
    suffix = ".representative_sequences.faa" if representative_sequences else ".cluster_sequences.faa"
    fasta_out = outdir / database.with_suffix(suffix).name

    # The base command deliberately ends in a space so the optional flag can
    # be appended directly.
    flat_cmd = f"mmseqs result2flat {database} {database} {cluster_seqs} {fasta_out} "
    if use_fasta_header:
        flat_cmd = flat_cmd + "--use-fasta-header"

    captured = run_subprocess(flat_cmd, get_stdout=True)
    log_subprocess_output(captured, logger_instance=mmseqs_log)
    return fasta_out
def call_muscle(infile: Path, outfile: Optional[Path] = None) -> Path:
    """
    Align a FASTA file with MUSCLE (``-maxiters 1``).
    https://www.drive5.com/muscle/

    :param infile: Path to the FASTA file to align
    :param outfile: Path the alignment is written to; when None, defaults to
        ``infile`` with its suffix replaced by ``.align.fasta``
    :return: Path the alignment was requested to be written to
    """
    # NOTE(review): an earlier ``call_muscle(infile)`` in this file aligns in
    # place and returns ``infile``; if both live in the same module this
    # definition is the one that takes effect - confirm which is intended.
    if outfile is None:
        outfile = infile.with_suffix(".align.fasta")
    cmd = f"muscle -in {infile} -out {outfile} -maxiters 1"
    run_subprocess(cmd, get_stdout=True)
    # Hand the resolved path back so callers relying on the default name know
    # where the alignment went.
    return outfile