def mafft(fasta_file, output_dir, output_ext, seq_type, cpus, anysymbol):
    """Align sequences with MAFFT and save the alignment.

    Args:
        fasta_file: Path of the FASTA file to align.
        output_dir: Directory the command is run in; amino acid input is
            first passed through ``bio.adjust_aa_seqs`` with this directory.
        output_ext: Extension handed to ``util.file_name`` to name the output.
        seq_type: ``'aa'`` selects ``--amino``; any other value selects
            ``--nuc``.
        cpus: Value for the ``--thread`` option.
        anysymbol: When truthy, ``--anysymbol`` is added to the command.

    Returns:
        The alignment file name produced by ``util.file_name``. The file is
        opened while ``output_dir`` is the working directory.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    in_path = fasta_file
    if seq_type == 'aa':
        in_path = bio.adjust_aa_seqs(fasta_file, output_dir)

    cmd = [
        'mafft',
        '--amino' if seq_type == 'aa' else '--nuc',
        '--thread {}'.format(cpus),
    ]

    # Emit the flag exactly once, whichever alignment strategy is chosen
    # below, and avoid putting empty tokens into the command line.
    if anysymbol:
        cmd.append('--anysymbol')

    # Big inputs (many or long sequences) let MAFFT pick its own strategy;
    # smaller ones use the iterative --genafpair method.
    if (bio.fasta_record_count(in_path) >= bio.SEQ_COUNT_CUTOFF
            or bio.longest_fasta_seq(in_path) >= bio.SEQ_LEN_CUTOFF):
        cmd.append('--auto')
    else:
        cmd += ['--genafpair', '--maxiterate {}'.format(MAX_ITERATE)]

    cmd.append(in_path)
    cmd = ' '.join(cmd)

    aligned = util.file_name(fasta_file, output_ext)

    with util.cd(output_dir):
        # The alignment is captured from the command's stdout.
        result = subprocess.check_output(cmd, shell=True)
        with open(aligned, 'wb') as out_file:
            out_file.write(result)

    return aligned
def treeshrink(tree_file, output_dir, output_ext, quantiles):
    """Remove long branches from a tree by running ``run_treeshrink.py``.

    The tool writes into a scratch sub-directory named after the tree. The
    first file matching the expected pattern is copied to the destination
    with all single quotes stripped, then the scratch directory is deleted.

    Args:
        tree_file: Path of the input tree.
        output_dir: Directory the command is run in.
        output_ext: Extension prefix; ``EXT_OUT`` is appended to it.
        quantiles: Value for the ``--quantiles`` option.

    Returns:
        The destination tree file name produced by ``util.file_name``.
    """
    work_dir = util.file_name(tree_file)

    args = ['run_treeshrink.py']
    args.append('--tree {}'.format(tree_file))
    args.extend(['--centroid', '--mode per-gene'])
    args.append('--quantiles {}'.format(quantiles))
    args.append('--outdir {}'.format(work_dir))
    args.append('--tempdir {}'.format(work_dir))
    command = ' '.join(args)

    with util.cd(output_dir):
        subprocess.check_call(command, shell=True)

        pattern = util.file_name(work_dir + '_*', ext=EXT_IN, dir_=work_dir)
        matches = glob(pattern)
        shrunk_path = matches[0]

        tree_dst = util.file_name(tree_file, output_ext + EXT_OUT)

        with open(shrunk_path) as in_file, open(tree_dst, 'w') as out_file:
            # Drop the single quotes from the tree text.
            out_file.write(in_file.read().replace("'", ''))

        rmtree(work_dir)

    return tree_dst
def raxml_ng_bs(fasta_file, output_dir, temp_dir, seq_type, cpus, seed,
                output_ext, replicates=100):
    """Build a bootstrapped tree with raxml-ng.

    The command runs inside ``temp_dir``; the resulting bipartitions tree is
    then moved into ``output_dir``.

    Args:
        fasta_file: Path of the alignment passed with ``-s``.
        output_dir: Directory the finished tree is moved to.
        temp_dir: Directory the command is run in.
        seq_type: ``'aa'`` selects the ``Blosum62`` model; any other value
            selects ``GTR``.
        cpus: Value for the ``-T`` option.
        seed: Seed used for both ``-x`` and ``-p``.
        output_ext: Extension handed to ``util.file_name`` to name the tree.
        replicates: Value for the ``-#`` option.

    Returns:
        ``join(output_dir, tree)``, the path the tree was moved to.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    model = "Blosum62" if seq_type == "aa" else "GTR"

    # Use the bare file name (no directory), matching raxml_ng(). The name is
    # used as the run name for -n, as the suffix of the RAxML output file and
    # as the last component of the destination path, so a directory prefix
    # here would corrupt all three.
    tree = util.file_name(fasta_file, output_ext)

    # NOTE(review): these look like classic RAxML flags and output names;
    # confirm that the installed raxml-ng accepts them.
    cmd = ' '.join([
        'raxml-ng',
        '-T {}'.format(cpus),
        '-f a',
        '-x {}'.format(seed),
        '-p {}'.format(seed),
        '-m {}'.format(model),
        '-# {}'.format(replicates),
        '-s {}'.format(fasta_file),
        '-n {}'.format(tree),
    ])

    with util.cd(temp_dir):
        subprocess.check_call(cmd, shell=True)

        tree_src = 'RAxML_bipartitions.' + tree
        tree_dst = join(output_dir, tree)
        move(tree_src, tree_dst)

    return tree_dst
def mask_tips(tree_file, output_dir, output_ext):
    """Read a newick tree, mask its monophyletic tips and write it back out.

    Args:
        tree_file: Path of the newick tree to read.
        output_dir: Directory that is current while the result is written.
        output_ext: Extension handed to ``util.file_name`` to name the output.

    Returns:
        The output file name produced by ``util.file_name``.
    """
    phylo_tree = Phylo.read(tree_file, 'newick')

    # The helper's return value is not used; the same tree object is written.
    mask_monophyletic_tips(phylo_tree)

    masked_name = util.file_name(tree_file, output_ext)
    with util.cd(output_dir):
        Phylo.write(phylo_tree, masked_name, 'newick')

    return masked_name
def pxrr(tree_file, output_dir):
    """Unroot a tree (as returned by treeshrink) with ``pxrr``.

    Args:
        tree_file: Path of the tree passed with ``--treef``.
        output_dir: Directory the command is run in.

    Returns:
        The output file name produced by ``util.file_name(tree_file)``.
    """
    unrooted = util.file_name(tree_file)

    args = ['pxrr', '--unroot']
    args.append('--treef {}'.format(tree_file))
    args.append('--outf {}'.format(unrooted))
    command = ' '.join(args)

    with util.cd(output_dir):
        subprocess.check_call(command, shell=True)
        # Clean up the log file left in the working directory.
        util.remove_files('phyx.logfile')

    return unrooted
def cut_branches(tree_file, output_dir, output_ext, branch_cutoff, min_taxa):
    """Cut long internal branches and write each resulting subtree.

    Each subtree returned by ``cut_deep`` is written as newick to a file
    named from ``'<tree_file>_<index>'`` and ``output_ext``.

    Args:
        tree_file: Path of the newick tree to read.
        output_dir: Directory that is current while subtrees are written.
        output_ext: Extension handed to ``util.file_name`` for each subtree.
        branch_cutoff: Passed through to ``cut_deep``.
        min_taxa: Passed through to ``cut_deep``.

    Returns:
        The name of the last subtree file written, or ``None`` when
        ``cut_deep`` yields no subtrees.
    """
    tree = Phylo.read(tree_file, 'newick')
    subtrees = cut_deep(tree, branch_cutoff, min_taxa)

    # Bind the result up front so an empty subtree list returns None instead
    # of failing with UnboundLocalError at the return statement.
    output = None

    with util.cd(output_dir):
        for i, subtree in enumerate(subtrees):
            numbered = '{}_{}'.format(tree_file, i)
            output = util.file_name(numbered, output_ext)
            Phylo.write(subtree, output, 'newick')

    return output
def fasttree(fasta_file, output_dir, output_ext, seq_type):
    """Build a tree with fasttree and save its stdout as the tree file.

    Args:
        fasta_file: Path of the alignment given to fasttree.
        output_dir: Directory the command is run in.
        output_ext: Extension handed to ``util.file_name`` to name the tree.
        seq_type: ``'aa'`` selects ``-wag``; any other value selects
            ``-nt -gtr``.

    Returns:
        The tree file name produced by ``util.file_name``.
    """
    if seq_type == 'aa':
        model_flags = ['-wag']
    else:
        model_flags = ['-nt', '-gtr']

    command = ' '.join(['fasttree', '-quiet'] + model_flags + [fasta_file])
    tree_file = util.file_name(fasta_file, output_ext)

    with util.cd(output_dir):
        tree_bytes = subprocess.check_output(command, shell=True)
        with open(tree_file, 'wb') as out_file:
            out_file.write(tree_bytes)

    return tree_file
def raxml(fasta_file, output_dir, output_ext, seq_type, cpus, seed):
    """Build a tree with raxml and keep only the best tree.

    Args:
        fasta_file: Path of the alignment passed with ``-s``.
        output_dir: Directory the command is run in.
        output_ext: Extension handed to ``util.file_name`` to name the tree.
        seq_type: ``'aa'`` selects ``PROTCATWAG``; any other value selects
            ``GTRCAT``.
        cpus: Value for the ``-T`` option.
        seed: Value for the ``-p`` option.

    Returns:
        The tree file name produced by ``util.file_name``.
    """
    if seq_type == "aa":
        model = "PROTCATWAG"
    else:
        model = "GTRCAT"

    tree = util.file_name(fasta_file, output_ext)

    args = ['raxml']
    args.append('-T {}'.format(cpus))
    args.append('-p {}'.format(seed))
    args.append('-m {}'.format(model))
    args.append('-s {}'.format(fasta_file))
    args.append('-n {}'.format(tree))
    command = ' '.join(args)

    with util.cd(output_dir):
        subprocess.check_call(command, shell=True)

        # Rename the best tree first, then clear the remaining RAxML_* files.
        best_tree = 'RAxML_bestTree.' + tree
        move(best_tree, tree)
        util.remove_files('RAxML_*')

    return tree
def raxml_ng(fasta_file, output_dir, temp_dir, seq_type, cpus, seed,
             output_ext):
    """Build a tree with raxml-ng and move the best tree to ``output_dir``.

    Args:
        fasta_file: Path of the alignment passed with ``-s``.
        output_dir: Directory the finished tree is moved to.
        temp_dir: Directory the command is run in.
        seq_type: ``'aa'`` selects ``Blosum62``; any other value selects
            ``GTR``.
        cpus: Value for the ``-T`` option.
        seed: Value for the ``-p`` option.
        output_ext: Extension handed to ``util.file_name`` to name the tree.

    Returns:
        ``join(output_dir, tree)``, the path the tree was moved to.
    """
    if seq_type == "aa":
        model = "Blosum62"
    else:
        model = "GTR"

    tree = util.file_name(fasta_file, output_ext)

    # NOTE(review): these look like classic RAxML flags and output names;
    # confirm that the installed raxml-ng accepts them.
    args = ['raxml-ng']
    args.append('-T {}'.format(cpus))
    args.append('-p {}'.format(seed))
    args.append('-m {}'.format(model))
    args.append('-s {}'.format(fasta_file))
    args.append('-n {}'.format(tree))
    command = ' '.join(args)

    with util.cd(temp_dir):
        subprocess.check_call(command, shell=True)

        best_tree = join('RAxML_bestTree.' + tree)
        tree_dst = join(output_dir, tree)
        move(best_tree, tree_dst)

    return tree_dst
def prank(fasta_file, output_dir, temp_dir, seq_type):
    """Align sequences with prank.

    Args:
        fasta_file: Path of the FASTA file to align.
        output_dir: Not used by this function; kept for the call signature.
        temp_dir: Directory the command is run in; amino acid input is first
            passed through ``bio.adjust_aa_seqs`` with this directory.
        seq_type: ``'aa'`` selects ``-protein``; any other value selects
            ``-DNA``.

    Returns:
        The alignment file name produced by
        ``util.file_name(fasta_file, 'ortho.aln')``. The file is opened while
        ``temp_dir`` is the working directory.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    in_path = fasta_file
    if seq_type == 'aa':
        in_path = bio.adjust_aa_seqs(fasta_file, temp_dir)

    aligned = util.file_name(fasta_file, 'ortho.aln')

    cmd = ' '.join([
        'prank',
        '-d {}'.format(in_path),
        '-o {}'.format(aligned),
        '-protein' if seq_type == 'aa' else '-DNA',
    ])

    with util.cd(temp_dir):
        # The command is one space-joined string, so it has to go through the
        # shell. Without shell=True the whole string is treated as the
        # program name on POSIX and the call fails.
        result = subprocess.check_output(cmd, shell=True)
        with open(aligned, 'wb') as out_file:
            out_file.write(result)

    return aligned
def pasta(fasta_file, output_dir, output_ext, seq_type, cpus):
    """Align sequences with ``run_pasta.py``.

    Args:
        fasta_file: Path of the FASTA file to align.
        output_dir: Directory the command is run in and told to write to;
            amino acid input is first passed through ``bio.adjust_aa_seqs``
            with this directory.
        output_ext: Extension appended to the input's base name.
        seq_type: ``'aa'`` selects the ``Protein`` datatype; any other value
            selects ``DNA``.
        cpus: Value for the ``--num-cpus`` option.

    Returns:
        The alignment file name: the input's base name plus ``output_ext``.
    """
    if seq_type == 'aa':
        in_path = bio.adjust_aa_seqs(fasta_file, output_dir)
    else:
        in_path = fasta_file

    args = [which('run_pasta.py')]
    args.append(
        '--datatype {}'.format('Protein' if seq_type == 'aa' else 'DNA'))
    args.append('--num-cpus {}'.format(cpus))
    args.append("--input '{}'".format(in_path))
    args.append("--output-directory '{}'".format(abspath(output_dir)))
    command = ' '.join(args)

    with util.cd(output_dir):
        subprocess.check_call(command, shell=True)

        stem, _ = splitext(basename(fasta_file))
        pasta_result = 'pastajob.marker001.' + stem + EXT
        aligned = stem + output_ext

        # Rename the result first, then clear the remaining pastajob* files.
        move(pasta_result, aligned)
        util.remove_files('pastajob*')

    return aligned
def pxclsq(fasta_file, output_dir, output_ext, seq_type, min_occupancy,
           min_len):
    """Filter aligned sequences for occupancy and length.

    ``pxclsq`` is run with ``--prop min_occupancy``. Its output is then
    re-read, and only records with at least ``min_len`` characters after
    removing ``-`` are written to the final file.

    Args:
        fasta_file: Path of the alignment passed with ``--seqf``.
        output_dir: Directory the command is run in.
        output_ext: Extension handed to ``util.file_name`` for the result;
            the intermediate file uses ``output_ext + EXT_PXCLSQ``.
        seq_type: ``'aa'`` adds ``--aminoacid`` to the command.
        min_occupancy: Value for the ``--prop`` option.
        min_len: Minimum sequence length, not counting ``-`` characters.

    Returns:
        The filtered file name produced by ``util.file_name``.
    """
    temp_cleaned = util.file_name(fasta_file, output_ext + EXT_PXCLSQ)

    if seq_type == 'aa':
        type_flag = '--aminoacid'
    else:
        type_flag = ''

    args = ['pxclsq', type_flag]
    args.append('--prop {}'.format(min_occupancy))
    args.append('--seqf {}'.format(fasta_file))
    args.append('--outf {}'.format(basename(temp_cleaned)))
    command = ' '.join(args)

    cleaned = util.file_name(fasta_file, output_ext)

    with util.cd(output_dir):
        subprocess.check_call(command, shell=True)

        with open(temp_cleaned) as in_file, open(cleaned, 'w') as out_file:
            for header, seq in SimpleFastaParser(in_file):
                # Skip records that are too short once gaps are ignored.
                if len(seq) - seq.count('-') < min_len:
                    continue
                bio.write_fasta_record(out_file, header, seq)

        # NOTE(review): only the log file is removed here; the intermediate
        # pxclsq output stays on disk. Confirm that this is intended.
        util.remove_files('phyx.logfile')

    return cleaned
def raxml_bs(fasta_file, output_dir, output_ext, seq_type, cpus, seed,
             replicates=100):
    """Build a bootstrapped tree with raxml and keep the bipartitions tree.

    Args:
        fasta_file: Path of the alignment passed with ``-s``.
        output_dir: Directory the command is run in.
        output_ext: Extension handed to ``util.file_name`` to name the tree.
        seq_type: ``'aa'`` selects ``PROTCATWAG``; any other value selects
            ``GTRCAT``.
        cpus: Value for the ``-T`` option.
        seed: Seed used for both ``-x`` and ``-p``.
        replicates: Value for the ``-#`` option.

    Returns:
        The tree file name produced by ``util.file_name``.
    """
    if seq_type == "aa":
        model = "PROTCATWAG"
    else:
        model = "GTRCAT"

    tree = util.file_name(fasta_file, output_ext)

    args = ['raxml']
    args.append('-T {}'.format(cpus))
    args.append('-f a')
    args.append('-x {}'.format(seed))
    args.append('-p {}'.format(seed))
    args.append('-m {}'.format(model))
    args.append('-# {}'.format(replicates))
    args.append('-s {}'.format(fasta_file))
    args.append('-n {}'.format(tree))
    command = ' '.join(args)

    with util.cd(output_dir):
        subprocess.check_call(command, shell=True)

        # Rename the wanted tree first, then clear the other RAxML_* files.
        bipartitions = 'RAxML_bipartitions.' + tree
        move(bipartitions, tree)
        util.remove_files('RAxML_*')

    return tree