示例#1
0
def run_mmseqs(seqs1, seqs2):
    """
    Equivalent to blast_seqs() but uses mmseqs and thus is much faster.

    Both sequence collections are written out with write_seqs_to_file() and
    searched against each other with ``mmseqs easy-search`` on a single
    thread, using ``--search-type 3`` and the module-level ``max_mem_use`` as
    the split memory limit. Requires at least mmseqs v8.

    :param seqs1: list of sequences to compare (passed to mmseqs as the query)
    :param seqs2: list of sequences to be compared against (passed as target)
    :return: tuple ``(mmseqs_output, columns)``: the raw text of the mmseqs
        result file with trailing newlines stripped, and the list of column
        names to use for that output
    """
    query_fasta = write_seqs_to_file(seqs1)
    target_fasta = write_seqs_to_file(seqs2)
    # NOTE(review): the files returned by write_seqs_to_file() are not removed
    # here - confirm whether that helper or the caller owns their cleanup.

    # A single private working directory holds both the result file and the
    # scratch space handed to mmseqs. The context manager removes everything
    # on every exit path, so no result files accumulate in the system temp dir.
    with tempfile.TemporaryDirectory() as workdir:
        outfile = Path(workdir) / "result.dat"
        scratch = Path(workdir) / "tmp"
        scratch.mkdir()

        cmd = f"mmseqs easy-search {query_fasta} {target_fasta} {outfile} {scratch} --threads 1 --split-memory-limit {max_mem_use} --search-type 3"
        run_subprocess(cmd, get_stdout=True)

        # Read the results before the working directory disappears.
        with open(outfile) as f:
            mmseqs_output = f.read().rstrip("\n")

    # I've renamed these for consistency with blast output
    columns = [
        "qseqid",
        "sseqid",
        "pident",
        "alnlen",
        "mismatch",
        "gapopen",
        "qstart",
        "qend",
        "tstart",
        "tend",
        "evalue",
        "bitscore",
    ]
    return mmseqs_output, columns
示例#2
0
def call_muscle(infile: Path) -> Path:
    """
    Align a FASTA file in place with MUSCLE.

    The same path is handed to muscle as both input and output, so the
    original content of the file is replaced by the aligned version.

    https://www.drive5.com/muscle/

    :param infile: Path to the FASTA file to align
    :return: The same path that was passed in
    """
    muscle_args = [
        "muscle",
        "-in", f"{infile}",
        "-out", f"{infile}",
        "-maxiters", "1",
    ]
    run_subprocess(" ".join(muscle_args), get_stdout=True)
    return infile
def call_mmseqs_linclust(database: Path, outdir: Path, n_cpu: int,
                         min_seq_id: float, alignment_mode: int,
                         coverage_length: float) -> Path:
    """
    mmseqs linclust system call.

    A fresh ``tmp`` directory is created under ``outdir`` as scratch space for
    mmseqs (any existing one is deleted first) and is removed again once the
    call has finished, whether or not it succeeded.

    :param database: Path to the mmseqs sequence database to cluster
    :param outdir: Directory in which the cluster database is written
    :param n_cpu: Value passed to ``--threads``
    :param min_seq_id: Value passed to ``--min-seq-id``
    :param alignment_mode: Value passed to ``--alignment-mode``
    :param coverage_length: Value passed to ``-c``
    :return: Path of the cluster database (``<database stem>_clusterDB``)
    """
    out_clusterdb = outdir / (database.with_suffix("").name + "_clusterDB")
    tmp_dir = outdir / "tmp"
    if tmp_dir.exists():
        shutil.rmtree(tmp_dir)
    tmp_dir.mkdir(parents=True)

    # linclust option explanation:
    #   --min-seq-id        only list matches above this sequence identity
    #   --alignment-mode    0: automatic; 1: only score and end_pos;
    #                       2: also start_pos and cov; 3: also seq.id
    #   --cov-mode 1        coverage is measured on the target sequence;
    #                       the threshold itself is given with -c
    #   -c                  coverage threshold (fraction, 1 == 100%)
    #   --sort-results 1    sort by evalue
    #   --add-self-matches  add each query's match against itself
    cmd = f"mmseqs linclust {database} {out_clusterdb} {tmp_dir} " \
          f"--threads {n_cpu} --min-seq-id {min_seq_id} --add-self-matches --alignment-mode {alignment_mode} " \
          f"--cov-mode 1 -c {coverage_length} --sort-results 1"

    try:
        output = run_subprocess(cmd, get_stdout=True)
        log_subprocess_output(output, logger_instance=mmseqs_log)
    finally:
        # Never leave the scratch directory behind, even if the call failed.
        if tmp_dir.exists():
            shutil.rmtree(tmp_dir)

    return out_clusterdb
def call_mmseqs_createdb(fasta: Path, outdir: Path) -> Path:
    """
    mmseqs createdb system call.

    :param fasta: Path to the FASTA file to convert into an mmseqs database
    :param outdir: Directory in which the database is created
    :return: Path of the database (``<fasta stem>_DB`` inside ``outdir``)
    """
    db_name = f"{fasta.with_suffix('').name}_DB"
    out_db = outdir / db_name
    createdb_output = run_subprocess(f"mmseqs createdb {fasta} {out_db}",
                                     get_stdout=True)
    log_subprocess_output(createdb_output, logger_instance=mmseqs_log)
    return out_db
def call_mmseqs_result2tsv(database: Path, cluster_database: Path,
                           outdir: Path) -> Path:
    """
    Export a cluster database as a .tsv file (runs ``mmseqs createtsv``).

    The sequence database is passed to mmseqs twice, followed by the cluster
    database and the output path.

    :param database: Path to the mmseqs sequence database
    :param cluster_database: Path to the mmseqs cluster database
    :param outdir: Directory in which the .tsv is written
    :return: Path of the .tsv (database name with suffix ``.cluster.tsv``)
    """
    tsv_name = database.with_suffix(".cluster.tsv").name
    cluster_tsv = outdir / tsv_name

    createtsv_args = (
        "mmseqs createtsv",
        f"{database}",
        f"{database}",
        f"{cluster_database}",
        f"{cluster_tsv}",
        "--first-seq-as-repr true",
    )
    tsv_output = run_subprocess(" ".join(createtsv_args), get_stdout=True)
    log_subprocess_output(tsv_output, logger_instance=mmseqs_log)
    return cluster_tsv
示例#6
0
def call_prokka(fasta_path: Path, sample_id: str, outdir: Path,
                n_cpu: int) -> Optional[ProkkaObject]:
    """
    Run Prokka on a FASTA file and collect its results.

    ``sample_id`` is used both as the output prefix and as the locus tag.

    :param fasta_path: Path to the FASTA file to annotate
    :param sample_id: Value passed to ``--prefix`` and ``--locustag``
    :param outdir: Directory handed to Prokka as ``--outdir``
    :param n_cpu: Value passed to ``--cpus``
    :return: Whatever prokka_obj_from_results_dir() builds from ``outdir``
    """
    prokka_args = (
        "prokka --centre CORE --compliant --kingdom Bacteria",
        f"--cpus {n_cpu}",
        f"--prefix {sample_id}",
        f"--locustag {sample_id}",
        f"--outdir {outdir}",
        f"{fasta_path}",
    )
    prokka_output = run_subprocess(" ".join(prokka_args), get_stdout=True)
    log_subprocess_output(prokka_output, logger_instance=prokka_log)
    # cleanup_prokka(prokka_dir=outdir)  # TODO: Turn this on - will remove extraneous Prokka results
    return prokka_obj_from_results_dir(prokka_dir=outdir)
def call_mmseqs_result2repseq(database: Path, cluster_database: Path,
                              outdir: Path, n_cpu: int) -> Path:
    """
    mmseqs result2repseq system call.

    Only grabs the representative sequence from each cluster.

    :param database: Path to the mmseqs sequence database
    :param cluster_database: Path to the mmseqs cluster database
    :param outdir: Directory in which the output is written
    :param n_cpu: Value passed to ``--threads``
    :return: Path of the output (database name with suffix
        ``.representative_sequences``)
    """
    repseq_name = database.with_suffix(".representative_sequences").name
    representative_sequences = outdir / repseq_name

    repseq_args = (
        "mmseqs result2repseq",
        f"{database}",
        f"{cluster_database}",
        f"{representative_sequences}",
        f"--threads {n_cpu}",
    )
    repseq_output = run_subprocess(" ".join(repseq_args), get_stdout=True)
    log_subprocess_output(repseq_output, logger_instance=mmseqs_log)
    return representative_sequences
def call_mmseqs_createseqfiledb(database: Path,
                                cluster_database: Path,
                                outdir: Path,
                                min_sequences: int = None,
                                max_sequences: int = None) -> Path:
    """
    mmseqs createseqfiledb system call.

    :param database: Path to the mmseqs sequence database
    :param cluster_database: Path to the mmseqs cluster database
    :param outdir: Directory in which the output database is written
    :param min_sequences: If given, passed to mmseqs as ``--min-sequences``
    :param max_sequences: If given, passed to mmseqs as ``--max-sequences``
    :return: Path of the output (``<database stem>_clusterSEQ``)
    """
    cluster_seq = outdir / f"{database.with_suffix('').name}_clusterSEQ"

    # Only the limits that were actually supplied end up on the command line.
    sequence_limits = (
        ("--min-sequences", min_sequences),
        ("--max-sequences", max_sequences),
    )
    optional_flags = "".join(f"{flag} {limit} "
                             for flag, limit in sequence_limits
                             if limit is not None)
    cmd = f"mmseqs createseqfiledb {database} {cluster_database} {cluster_seq} " + optional_flags

    seqfiledb_output = run_subprocess(cmd, get_stdout=True)
    log_subprocess_output(seqfiledb_output, logger_instance=mmseqs_log)
    return cluster_seq
示例#9
0
def call_snp_sites(aligned_multifasta: Path, outdir: Path) -> Path:
    """
    Run snp-sites on an aligned multi-FASTA file, requesting VCF output.

    An output file is only expected when variants are detected, so the
    returned path may not exist on disk.

    https://github.com/sanger-pathogens/snp-sites

    :param aligned_multifasta: Path to multi-FASTA containing alignment of a core gene
    :param outdir: Path to desired output directory
    :return: Path to VCF (input file name with its suffix replaced by ``.vcf``)
    """
    vcf_name = aligned_multifasta.with_suffix(".vcf").name
    outvcf = outdir / vcf_name
    # The value returned by run_subprocess is not needed here.
    run_subprocess(f"snp-sites -v -o {outvcf} {aligned_multifasta}",
                   get_stdout=True)
    return outvcf
def call_mmseqs_cluster(database: Path, outdir: Path, n_cpu: int,
                        min_seq_id: float) -> Path:
    """
    mmseqs cluster system call.

    A fresh ``tmp`` directory is created under ``outdir`` as scratch space for
    mmseqs (any existing one is deleted first) and is removed again once the
    call has finished, whether or not it succeeded.

    :param database: Path to the mmseqs sequence database to cluster
    :param outdir: Directory in which the cluster database is written
    :param n_cpu: Value passed to ``--threads``
    :param min_seq_id: Value passed to ``--min-seq-id``
    :return: Path of the cluster database (``<database stem>_clusterDB``)
    """
    out_clusterdb = outdir / (database.with_suffix("").name + "_clusterDB")
    tmp_dir = outdir / "tmp"
    if tmp_dir.exists():
        shutil.rmtree(tmp_dir)
    tmp_dir.mkdir(parents=True)

    cmd = f"mmseqs cluster {database} {out_clusterdb} {tmp_dir} " \
          f"--threads {n_cpu} --min-seq-id {min_seq_id} --add-self-matches"

    try:
        output = run_subprocess(cmd, get_stdout=True)
        log_subprocess_output(output, logger_instance=mmseqs_log)
    finally:
        # Never leave the scratch directory behind, even if the call failed.
        if tmp_dir.exists():
            shutil.rmtree(tmp_dir)

    return out_clusterdb
def call_mmseqs_result2flat(database: Path,
                            outdir: Path,
                            cluster_seqs: Path,
                            representative_sequences: bool = False,
                            use_fasta_header: bool = False) -> Path:
    """
    mmseqs result2flat system call; produces a FASTA file.

    :param database: Path to the mmseqs sequence database (passed twice)
    :param outdir: Directory in which the FASTA file is written
    :param cluster_seqs: Path to the mmseqs result database to flatten
    :param representative_sequences: Selects the output suffix:
        ``.representative_sequences.faa`` when True, otherwise
        ``.cluster_sequences.faa``
    :param use_fasta_header: When True, ``--use-fasta-header`` is appended
    :return: Path of the FASTA file
    """
    fasta_suffix = (".representative_sequences.faa"
                    if representative_sequences
                    else ".cluster_sequences.faa")
    cluster_fasta = outdir / database.with_suffix(fasta_suffix).name

    # The command keeps its trailing space when the header flag is omitted.
    header_flag = "--use-fasta-header" if use_fasta_header else ""
    cmd = f"mmseqs result2flat {database} {database} {cluster_seqs} {cluster_fasta} {header_flag}"

    flat_output = run_subprocess(cmd, get_stdout=True)
    log_subprocess_output(flat_output, logger_instance=mmseqs_log)
    return cluster_fasta
示例#12
0
def call_muscle(infile: Path, outfile: Path = None):
    """
    Align a FASTA file with MUSCLE, writing the result to a separate file.

    :param infile: Path to the FASTA file to align
    :param outfile: Path for the alignment; when omitted, ``infile`` with its
        suffix replaced by ``.align.fasta`` is used
    """
    outfile = infile.with_suffix(".align.fasta") if outfile is None else outfile
    cmd = " ".join([
        "muscle",
        "-in", f"{infile}",
        "-out", f"{outfile}",
        "-maxiters", "1",
    ])
    run_subprocess(cmd, get_stdout=True)