def mafft(fasta_file, output_dir, output_ext, seq_type, cpus, anysymbol):
    """Align sequences with MAFFT and save the alignment.

    Amino acid input is first passed through ``bio.adjust_aa_seqs``. When the
    input has at least ``bio.SEQ_COUNT_CUTOFF`` records, or its longest
    sequence is at least ``bio.SEQ_LEN_CUTOFF`` long, MAFFT is run with
    ``--auto``; otherwise ``--genafpair --maxiterate MAX_ITERATE`` is used.

    Args:
        fasta_file: Path of the FASTA file to align.
        output_dir: Directory handed to ``util.cd`` while MAFFT runs and the
            alignment is written; also given to ``bio.adjust_aa_seqs``.
        output_ext: Extension passed to ``util.file_name`` to name the output.
        seq_type: ``'aa'`` selects ``--amino``; any other value selects
            ``--nuc``.
        cpus: Value for MAFFT's ``--thread`` option.
        anysymbol: When truthy, ``--anysymbol`` is passed to MAFFT.

    Returns:
        The alignment file name built by ``util.file_name``.

    Raises:
        subprocess.CalledProcessError: If MAFFT exits with a non-zero status.
    """
    is_protein = seq_type == 'aa'

    in_path = fasta_file
    if is_protein:
        in_path = bio.adjust_aa_seqs(fasta_file, output_dir)

    # Build an argument vector rather than a shell string: paths containing
    # spaces or shell metacharacters are passed through untouched, and
    # disabled options need no empty placeholder arguments.
    cmd = ['mafft', '--amino' if is_protein else '--nuc',
           '--thread', str(cpus)]

    # Pass the flag exactly once, whichever strategy is selected below.
    if anysymbol:
        cmd.append('--anysymbol')

    if (bio.fasta_record_count(in_path) >= bio.SEQ_COUNT_CUTOFF
            or bio.longest_fasta_seq(in_path) >= bio.SEQ_LEN_CUTOFF):
        cmd.append('--auto')
    else:
        cmd += ['--genafpair', '--maxiterate', str(MAX_ITERATE)]

    cmd.append(in_path)

    aligned = util.file_name(fasta_file, output_ext)

    with util.cd(output_dir):
        # MAFFT prints the alignment on stdout; capture it as bytes.
        result = subprocess.check_output(cmd)
        with open(aligned, 'wb') as out_file:
            out_file.write(result)

    return aligned
示例#2
0
def treeshrink(tree_file, output_dir, output_ext, quantiles):
    """Run TreeShrink on a tree and save the result without single quotes.

    ``run_treeshrink.py`` is invoked in centroid, per-gene mode with a
    scratch directory named by ``util.file_name(tree_file)`` used as both its
    output and temp directory. The first file matching the expected output
    pattern is copied, with every ``'`` character removed, to the destination
    file, and the scratch directory is then deleted.

    Args:
        tree_file: Path of the input tree.
        output_dir: Directory handed to ``util.cd`` for the whole operation.
        output_ext: Extension prefix; ``EXT_OUT`` is appended to it to name
            the destination file.
        quantiles: Value for the ``--quantiles`` option.

    Returns:
        The destination tree file name built by ``util.file_name``.
    """
    work_dir = util.file_name(tree_file)

    command = (
        'run_treeshrink.py --tree {tree} --centroid --mode per-gene '
        '--quantiles {quantiles} --outdir {work} --tempdir {work}'
    ).format(tree=tree_file, quantiles=quantiles, work=work_dir)

    with util.cd(output_dir):
        subprocess.check_call(command, shell=True)

        # TreeShrink names its output after the scratch directory; take the
        # first file that matches.
        pattern = util.file_name(work_dir + '_*', ext=EXT_IN, dir_=work_dir)
        shrunk_path = glob(pattern)[0]
        tree_dst = util.file_name(tree_file, output_ext + EXT_OUT)

        with open(shrunk_path) as source, open(tree_dst, 'w') as target:
            # Drop the single quotes from the tree text.
            target.write(source.read().replace("'", ''))

        rmtree(work_dir)

    return tree_dst
示例#3
0
def raxml_ng_bs(fasta_file,
                output_dir,
                temp_dir,
                seq_type,
                cpus,
                seed,
                output_ext,
                replicates=100):
    """Build a bootstrapped tree by calling the ``raxml-ng`` executable.

    The command runs inside ``util.cd(temp_dir)``; afterwards the
    ``RAxML_bipartitions.<tree>`` file is moved to ``output_dir``.

    NOTE(review): the option letters used here (-f a, -x, -#, -n) look like
    classic RAxML flags -- confirm that raxml-ng accepts them.

    Args:
        fasta_file: Path of the alignment given to ``-s``.
        output_dir: Directory the finished tree is moved into.
        temp_dir: Directory handed to ``util.cd`` while the program runs.
        seq_type: ``'aa'`` selects the Blosum62 model; anything else GTR.
        cpus: Value for ``-T``.
        seed: Value used for both ``-x`` and ``-p``.
        output_ext: Extension passed to ``util.file_name`` for the tree name.
        replicates: Value for ``-#``.

    Returns:
        The destination path of the moved tree file.
    """
    if seq_type == "aa":
        model = "Blosum62"
    else:
        model = "GTR"

    tree = util.file_name(fasta_file, output_ext, output_dir)

    command = 'raxml-ng -T {} -f a -x {} -p {} -m {} -# {} -s {} -n {}'.format(
        cpus, seed, seed, model, replicates, fasta_file, tree)

    with util.cd(temp_dir):
        subprocess.check_call(command, shell=True)

        bipartitions = join('RAxML_bipartitions.' + tree)
        tree_dst = join(output_dir, tree)
        move(bipartitions, tree_dst)

    return tree_dst
示例#4
0
def mask_tips(tree_file, output_dir, output_ext):
    """Read a Newick tree, mask its monophyletic tips and write it back out.

    Args:
        tree_file: Path of the Newick tree to read.
        output_dir: Directory handed to ``util.cd`` while the tree is written.
        output_ext: Extension passed to ``util.file_name`` for the output.

    Returns:
        The output file name built by ``util.file_name``.
    """
    newick_tree = Phylo.read(tree_file, 'newick')

    # Operates on the tree object in place; its return value is not used.
    mask_monophyletic_tips(newick_tree)

    masked_name = util.file_name(tree_file, output_ext)

    with util.cd(output_dir):
        Phylo.write(newick_tree, masked_name, 'newick')

    return masked_name
示例#5
0
def pxrr(tree_file, output_dir):
    """Unroot a tree with ``pxrr`` and clean up the phyx log file.

    Args:
        tree_file: Path of the tree given to ``--treef``.
        output_dir: Directory handed to ``util.cd`` while ``pxrr`` runs.

    Returns:
        The output name built by ``util.file_name(tree_file)``, which is
        also the value given to ``--outf``.
    """
    unrooted = util.file_name(tree_file)

    command = 'pxrr --unroot --treef {} --outf {}'.format(tree_file, unrooted)

    with util.cd(output_dir):
        subprocess.check_call(command, shell=True)
        # The log file left behind by the run is not wanted.
        util.remove_files('phyx.logfile')

    return unrooted
示例#6
0
def cut_branches(tree_file, output_dir, output_ext, branch_cutoff, min_taxa):
    """Cut long internal branches and write each resulting subtree.

    The tree is read as Newick and split by ``cut_deep``; every subtree it
    yields is written to its own file named from ``<tree_file>_<index>``.

    Args:
        tree_file: Path of the Newick tree to read.
        output_dir: Directory handed to ``util.cd`` while subtrees are
            written.
        output_ext: Extension passed to ``util.file_name`` for each subtree.
        branch_cutoff: Passed through to ``cut_deep``.
        min_taxa: Passed through to ``cut_deep``.

    Returns:
        The file name of the last subtree written, or ``None`` when
        ``cut_deep`` produced no subtrees.
    """
    tree = Phylo.read(tree_file, 'newick')

    subtrees = cut_deep(tree, branch_cutoff, min_taxa)

    # Bound up front so that an empty result returns None instead of raising
    # UnboundLocalError at the return statement.
    output = None

    with util.cd(output_dir):
        for i, subtree in enumerate(subtrees):
            output = util.file_name('{}_{}'.format(tree_file, i), output_ext)
            Phylo.write(subtree, output, 'newick')

    return output
示例#7
0
def fasttree(fasta_file, output_dir, output_ext, seq_type):
    """Build a tree with ``fasttree`` and save what it prints on stdout.

    Args:
        fasta_file: Path of the alignment given to ``fasttree``.
        output_dir: Directory handed to ``util.cd`` while the program runs
            and the tree is written.
        output_ext: Extension passed to ``util.file_name`` for the tree.
        seq_type: ``'aa'`` selects ``-wag``; anything else ``-nt -gtr``.

    Returns:
        The tree file name built by ``util.file_name``.
    """
    if seq_type == 'aa':
        model_flags = ('-wag',)
    else:
        model_flags = ('-nt', '-gtr')

    command = ' '.join(('fasttree', '-quiet') + model_flags + (fasta_file,))

    tree_file = util.file_name(fasta_file, output_ext)

    with util.cd(output_dir):
        # The captured standard output (bytes) becomes the tree file.
        tree_bytes = subprocess.check_output(command, shell=True)
        with open(tree_file, 'wb') as handle:
            handle.write(tree_bytes)

    return tree_file
def raxml(fasta_file, output_dir, output_ext, seq_type, cpus, seed):
    """Build a tree with ``raxml`` and keep only the best tree.

    After the run, ``RAxML_bestTree.<tree>`` is renamed to ``<tree>`` and
    the remaining ``RAxML_*`` files are removed.

    Args:
        fasta_file: Path of the alignment given to ``-s``.
        output_dir: Directory handed to ``util.cd`` for the whole operation.
        output_ext: Extension passed to ``util.file_name`` for the tree name.
        seq_type: ``'aa'`` selects PROTCATWAG; anything else GTRCAT.
        cpus: Value for ``-T``.
        seed: Value for ``-p``.

    Returns:
        The tree file name built by ``util.file_name``.
    """
    if seq_type == "aa":
        model = "PROTCATWAG"
    else:
        model = "GTRCAT"

    tree = util.file_name(fasta_file, output_ext)

    command = 'raxml -T {} -p {} -m {} -s {} -n {}'.format(
        cpus, seed, model, fasta_file, tree)

    with util.cd(output_dir):
        subprocess.check_call(command, shell=True)
        move('RAxML_bestTree.' + tree, tree)
        # Everything else RAxML wrote is discarded.
        util.remove_files('RAxML_*')

    return tree
示例#9
0
def raxml_ng(fasta_file, output_dir, temp_dir, seq_type, cpus, seed,
             output_ext):
    """Build a tree by calling the ``raxml-ng`` executable.

    The command runs inside ``util.cd(temp_dir)``; afterwards the
    ``RAxML_bestTree.<tree>`` file is moved to ``output_dir``.

    NOTE(review): the option letters used here (-T, -p, -s, -n) look like
    classic RAxML flags -- confirm that raxml-ng accepts them.

    Args:
        fasta_file: Path of the alignment given to ``-s``.
        output_dir: Directory the finished tree is moved into.
        temp_dir: Directory handed to ``util.cd`` while the program runs.
        seq_type: ``'aa'`` selects the Blosum62 model; anything else GTR.
        cpus: Value for ``-T``.
        seed: Value for ``-p``.
        output_ext: Extension passed to ``util.file_name`` for the tree name.

    Returns:
        The destination path of the moved tree file.
    """
    if seq_type == "aa":
        model = "Blosum62"
    else:
        model = "GTR"

    tree = util.file_name(fasta_file, output_ext)

    command = 'raxml-ng -T {} -p {} -m {} -s {} -n {}'.format(
        cpus, seed, model, fasta_file, tree)

    with util.cd(temp_dir):
        subprocess.check_call(command, shell=True)

        best_tree = join('RAxML_bestTree.' + tree)
        tree_dst = join(output_dir, tree)
        move(best_tree, tree_dst)

    return tree_dst
示例#10
0
def prank(fasta_file, output_dir, temp_dir, seq_type):
    """Align sequences with PRANK.

    Amino acid input is first passed through ``bio.adjust_aa_seqs``. PRANK
    runs inside ``util.cd(temp_dir)`` and whatever it prints on standard
    output is written to the returned file name.

    NOTE(review): the same name is also given to PRANK through ``-o`` --
    confirm which of the two files is the intended alignment.

    Args:
        fasta_file: Path of the FASTA file to align.
        output_dir: Not used by this function.
        temp_dir: Directory handed to ``util.cd`` while PRANK runs; also
            given to ``bio.adjust_aa_seqs``.
        seq_type: ``'aa'`` selects ``-protein``; anything else ``-DNA``.

    Returns:
        The file name built by ``util.file_name(fasta_file, 'ortho.aln')``.

    Raises:
        subprocess.CalledProcessError: If PRANK exits with a non-zero status.
    """
    in_path = fasta_file
    if seq_type == 'aa':
        in_path = bio.adjust_aa_seqs(fasta_file, temp_dir)

    aligned = util.file_name(fasta_file, 'ortho.aln')

    # Pass an argument list: a single joined string without shell=True is
    # treated as the program name on POSIX and can never be executed. A list
    # also keeps paths containing spaces intact.
    cmd = [
        'prank',
        '-d', in_path,
        '-o', aligned,
        '-protein' if seq_type == 'aa' else '-DNA',
    ]

    with util.cd(temp_dir):
        result = subprocess.check_output(cmd)
        with open(aligned, 'wb') as out_file:
            out_file.write(result)

    return aligned
def pasta(fasta_file, output_dir, output_ext, seq_type, cpus):
    """Align sequences with ``run_pasta.py`` and keep the final alignment.

    Amino acid input is first passed through ``bio.adjust_aa_seqs``. After
    the run, ``pastajob.marker001.<base><EXT>`` is renamed to
    ``<base><output_ext>`` and the remaining ``pastajob*`` files are removed.

    Args:
        fasta_file: Path of the FASTA file to align.
        output_dir: Directory handed to ``util.cd``; its absolute path is
            also given to ``--output-directory``.
        output_ext: Extension appended to the input's base name.
        seq_type: ``'aa'`` selects the Protein datatype; anything else DNA.
        cpus: Value for ``--num-cpus``.

    Returns:
        The alignment file name, ``<base><output_ext>``.
    """
    if seq_type == 'aa':
        in_path = bio.adjust_aa_seqs(fasta_file, output_dir)
        datatype = 'Protein'
    else:
        in_path = fasta_file
        datatype = 'DNA'

    pieces = [
        which('run_pasta.py'),
        '--datatype {}'.format(datatype),
        '--num-cpus {}'.format(cpus),
        # Paths are single-quoted for the shell.
        "--input '{}'".format(in_path),
        "--output-directory '{}'".format(abspath(output_dir)),
    ]
    command = ' '.join(pieces)

    with util.cd(output_dir):
        subprocess.check_call(command, shell=True)

        base_name, _ = splitext(basename(fasta_file))
        aligned = base_name + output_ext
        move('pastajob.marker001.' + base_name + EXT, aligned)

        util.remove_files('pastajob*')

    return aligned
示例#12
0
def pxclsq(fasta_file, output_dir, output_ext, seq_type, min_occupancy,
           min_len):
    """Clean an alignment with ``pxclsq`` and then filter it by length.

    ``pxclsq`` writes an intermediate file; every record of that file whose
    sequence, not counting ``-`` characters, has at least ``min_len``
    characters is copied to the final output file.

    Args:
        fasta_file: Path of the alignment given to ``--seqf``.
        output_dir: Directory handed to ``util.cd`` for the whole operation.
        output_ext: Extension passed to ``util.file_name`` for the result;
            ``EXT_PXCLSQ`` is appended for the intermediate file.
        seq_type: ``'aa'`` adds ``--aminoacid`` to the command.
        min_occupancy: Value for ``--prop``.
        min_len: Minimum number of non-gap characters a sequence must have.

    Returns:
        The cleaned file name built by ``util.file_name``.
    """
    temp_cleaned = util.file_name(fasta_file, output_ext + EXT_PXCLSQ)

    aa_flag = '--aminoacid' if seq_type == 'aa' else ''
    command = 'pxclsq {} --prop {} --seqf {} --outf {}'.format(
        aa_flag, min_occupancy, fasta_file, basename(temp_cleaned))

    cleaned = util.file_name(fasta_file, output_ext)

    with util.cd(output_dir):
        subprocess.check_call(command, shell=True)

        with open(temp_cleaned) as source, open(cleaned, 'w') as target:
            for header, seq in SimpleFastaParser(source):
                # Gap characters do not count towards the length.
                ungapped_len = len(seq) - seq.count('-')
                if ungapped_len < min_len:
                    continue
                bio.write_fasta_record(target, header, seq)

        util.remove_files('phyx.logfile')

    return cleaned
def raxml_bs(fasta_file,
             output_dir,
             output_ext,
             seq_type,
             cpus,
             seed,
             replicates=100):
    """Build a bootstrapped tree with ``raxml``.

    After the run, ``RAxML_bipartitions.<tree>`` is renamed to ``<tree>``
    and the remaining ``RAxML_*`` files are removed.

    Args:
        fasta_file: Path of the alignment given to ``-s``.
        output_dir: Directory handed to ``util.cd`` for the whole operation.
        output_ext: Extension passed to ``util.file_name`` for the tree name.
        seq_type: ``'aa'`` selects PROTCATWAG; anything else GTRCAT.
        cpus: Value for ``-T``.
        seed: Value used for both ``-x`` and ``-p``.
        replicates: Value for ``-#``.

    Returns:
        The tree file name built by ``util.file_name``.
    """
    if seq_type == "aa":
        model = "PROTCATWAG"
    else:
        model = "GTRCAT"

    tree = util.file_name(fasta_file, output_ext)

    command = 'raxml -T {} -f a -x {} -p {} -m {} -# {} -s {} -n {}'.format(
        cpus, seed, seed, model, replicates, fasta_file, tree)

    with util.cd(output_dir):
        subprocess.check_call(command, shell=True)
        move('RAxML_bipartitions.' + tree, tree)
        # Everything else RAxML wrote is discarded.
        util.remove_files('RAxML_*')

    return tree