def aln_to_Newick(path, folders_dict, raxml_timelimit, raxml_path, threads):
    """Build the core-gene SNP tree from the whole SNP alignment.

    input:  <cluster_seq_path>SNP_whole_matrix.aln
    output: <cluster_seq_path>strain_tree.nwk

    Work is done inside a freshly created temporary directory:
      1. FastTree builds an initial tree, whose polytomies are resolved.
      2. If raxml_timelimit > 0, RAxML optimizes the topology for at most
         that many minutes; the best available tree (final result, newest
         checkpoint, or the initial tree) is carried on.
      3. RAxML optimizes branch lengths and the tree is midpoint rooted.

    Parameters:
        path: not used by this function (kept for interface compatibility).
        folders_dict: mapping providing 'cluster_seq_path' and 'log_path';
            both are used as plain string prefixes for file names.
        raxml_timelimit: time budget in minutes for the RAxML topology
            search; a value <= 0 skips the topology search.
        raxml_path: not used by this function (kept for interface
            compatibility).
        threads: thread count passed to RAxML via -T.
    """
    cluster_seq_path = folders_dict['cluster_seq_path']
    log_path = folders_dict['log_path']
    output_path = '_'.join([cluster_seq_path+'temp_coretree',
                            time.strftime('%Y%m%d-%H%M%S', time.gmtime()),
                            str(random.randint(0, 1000000))])
    os.mkdir(output_path)
    SNP_matrix_path = cluster_seq_path+'SNP_whole_matrix.aln'
    cwd = os.getcwd()
    os.chdir(output_path)
    try:
        ## run fasttree
        start = time.time()
        fasttree_program = 'fasttree' if check_dependency('fasttree') else 'FastTree'
        os.system(fasttree_program+' -gtr -nt -gamma -nosupport -mlacc 2 -slownni '+SNP_matrix_path+' > initial_tree.newick0 2> '+log_path+'fasttree.log')
        print(' fasttree time-cost: %s' % (times(start),))

        resolve_polytomies('initial_tree.newick0', 'initial_tree.newick')

        ## run raxml
        start = time.time()
        out_fname = "tree_infer.newick"
        # Resolved up front: the branch-length step below needs it even when
        # the time-limited topology search is skipped.
        raxml_program = 'raxml' if check_dependency('raxml') else 'raxmlHPC'
        if raxml_timelimit > 0:
            print('%s%d%s' % ('RAxML tree optimization within the timelimit of ', raxml_timelimit, ' minutes'))
            end_time = time.time() + int(raxml_timelimit*60)
            # 'exec' replaces the shell with RAxML so that terminate()
            # reaches RAxML itself rather than a wrapping shell.
            process = subprocess.Popen('exec '+raxml_program+' -f d -T '+str(threads)+' -j -s '+SNP_matrix_path+' -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick > '+log_path+'raxml.log', shell=True)
            while time.time() < end_time:
                if os.path.isfile('RAxML_result.topology'):
                    break
                if process.poll() is not None:
                    # RAxML exited without a result; waiting longer is pointless.
                    break
                time.sleep(10)
            if process.poll() is None:
                process.terminate()
            # Reap the child so it does not linger as a zombie.
            process.wait()

            if os.path.isfile('RAxML_result.topology'):
                last_tree_file = 'RAxML_result.topology'
            else:
                # glob order is arbitrary, so pick the most recently written
                # checkpoint explicitly.
                checkpoint_files = glob.glob('RAxML_checkpoint*')
                if checkpoint_files:
                    last_tree_file = max(checkpoint_files, key=os.path.getmtime)
                else:
                    last_tree_file = 'initial_tree.newick'
            shutil.copy(last_tree_file, 'raxml_tree.newick')
        else:
            shutil.copy('initial_tree.newick', 'raxml_tree.newick')

        print('RAxML branch length optimization and rooting')
        # NOTE(review): this redirect overwrites the raxml.log written by the
        # topology search above.
        os.system(raxml_program+' -f e -T '+str(threads)+' -s '+SNP_matrix_path+' -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick > '+log_path+'raxml.log')
        shutil.copy('RAxML_result.branches', out_fname)
        print(' raxml time-cost: %s' % (times(start),))
        midpointRooting(out_fname, 'strain_tree.nwk')
        shutil.copy('strain_tree.nwk', cluster_seq_path+'strain_tree.nwk')
    finally:
        # Always return to the caller's directory, even when a step failed.
        os.chdir(cwd)
    # Only reached on success; on failure the temporary folder is kept
    # so that the intermediate files can be inspected.
    shutil.rmtree(output_path, ignore_errors=True)
def single_RNACluster_align_and_makeTree(fa_files_list, alignFile_path, simple_tree):
    """Align each RNA cluster, build its tree and record its diversity.

    For every file in fa_files_list one line "<clusterID>\\t<diversity>" is
    appended to <alignFile_path>gene_diversity.txt.

    Parameters:
        fa_files_list: paths of RNA cluster nucleotide FASTA files (.fna).
        alignFile_path: string prefix (folder) for gene_diversity.txt and
            for the exported tree/alignment files.
        simple_tree: when False, the tree is built with treetime, followed
            by ancestral reconstruction and refinement; otherwise only the
            plain tree is built.

    A failure on one cluster is reported and the loop goes on with the next.
    """
    fasttree_name = 'fasttree' if check_dependency('fasttree') else 'FastTree'
    diversity_filename = alignFile_path+'gene_diversity.txt'
    for RNA_cluster_nu_filename in fa_files_list:
        try:
            # extract GC_RNA002 from path/GC_RNA002.aln
            clusterID = RNA_cluster_nu_filename.split('/')[-1].split('.')[0]
            cluster_seqs = read_fasta(RNA_cluster_nu_filename)
            if len(cluster_seqs) == 1:
                # nothing to align for singletons: just rewrite the sequence
                ## na.aln
                RNA_cluster_nu_aln_filename = RNA_cluster_nu_filename.replace('.fna', '_na.aln')
                ## RNA SeqID separator '|' is replaced by '-' for msa viewer compatibility
                with open(RNA_cluster_nu_aln_filename, 'wb') as write_file:
                    for SeqID, Sequence in cluster_seqs.iteritems():
                        write_in_fa(write_file, SeqID.replace('|', '-'), Sequence)
                with open(diversity_filename, 'a') as geneDiversity_file:
                    geneDiversity_file.write('%s\t%s\n' % (clusterID, '0.0'))
            else:
                # align and build tree
                print(RNA_cluster_nu_filename)
                myTree = mpm_tree(RNA_cluster_nu_filename)
                myTree.align()
                if simple_tree == False:
                    myTree.build(raxml=False, fasttree_program=fasttree_name, treetime_used=True)
                    myTree.ancestral(translate_tree=True)
                    myTree.refine(CDS=False)
                else:
                    myTree.build(raxml=False, fasttree_program=fasttree_name, treetime_used=False)
                myTree.diversity_statistics_nuc()
                myTree.export(path=alignFile_path, RNA_specific=True)
                RNA_diversity_values = '{0:.3f}'.format(myTree.diversity_nuc)
                # Open/close around the write so the handle is not leaked
                # and the line reaches disk promptly.
                with open(diversity_filename, 'a') as geneDiversity_file:
                    geneDiversity_file.write('%s\t%s\n' % (clusterID, RNA_diversity_values))
                print('%s %s' % (clusterID, RNA_diversity_values))
        except Exception as exc:
            # Best effort per cluster; KeyboardInterrupt/SystemExit still propagate.
            print("Aligning and tree building of RNA %s failed"%RNA_cluster_nu_filename)
            print("  reason: %r" % (exc,))
def cutTree_outputCluster( file_list, file_path, cut_branch_threshold, treefile_used):
    """Cut the tree of each cluster at long branches and output the clades
    as new clusters (process flow intended for parallelization).

    Parameters:
        file_list: paths of tree files (.nwk) when treefile_used is True,
            otherwise paths of cluster sequence files from which a quick
            tree is built first.
        file_path: string prefix (folder) for old_clusters_longSplit.txt and
            passed on to output_cutted_clusters.
        cut_branch_threshold: threshold handed to cut_tree_gather_clades and
            output_cutted_clusters.
        treefile_used: True to read existing trees, otherwise build them.

    A tree file that cannot be read is reported and skipped.
    """
    fasttree_name = 'fasttree' if check_dependency('fasttree') else 'FastTree'
    for input_filepath in file_list:
        if treefile_used == True:
            ## read tree
            input_cluster_filename = input_filepath.split('/')[-1].replace('.nwk', '.fna')
            try:
                tree = Phylo.read(input_filepath, 'newick')
            except Exception:
                print('reading tree failed:  %s' % (input_filepath,))
                # Skip this file: otherwise `tree` is undefined or still
                # holds the tree of the previous file.
                continue
        else:
            ## make tree
            input_cluster_filename = input_filepath.split('/')[-1]
            tree = quick_align_makeTree(input_filepath, fasttree_name)

        ## attempt to cut the tree
        gene_list, rest_genes = cut_tree_gather_clades(tree, cut_branch_threshold)
        # '_r' in the file name marks a "further-split" cluster that stems
        # from an already split cluster.
        further_split = '_r' in input_cluster_filename

        ## add to-be-deleted cluster records:
        ## the original cluster has been split and is not a further-split one
        if len(gene_list) != 0 and not further_split:
            with open(file_path+'old_clusters_longSplit.txt', 'a') as delete_cluster_file:
                delete_cluster_file.write('%s\n' % input_cluster_filename)

        ## output cut clusters
        if len(gene_list) == 0:
            ## nothing can be cut further for the current tree
            if further_split:
                ## the rest genes cannot be split: they become the gene_list
                gene_list = rest_genes
                rest_genes = []
            ## otherwise the tree does not need to be split; gene_list stays
            ## empty and is still handed to output_cutted_clusters below
        elif len(rest_genes) != 0:
            ## further process the left-over genes
            output_cutted_clusters(file_path, input_cluster_filename, rest_genes, cut_branch_threshold, treefile_used=False, cut_leftover=True)

        ## write clades in gene_list into clusters
        output_cutted_clusters(file_path, input_cluster_filename, gene_list, cut_branch_threshold, treefile_used=False, cut_leftover=False)
def align_and_makeTree( fna_file_list, alignFile_path, simple_tree):
    """Align each gene cluster, build its tree and record its diversity.

    For every file in fna_file_list one line "<clusterID>\\t<diversity>" is
    appended to <alignFile_path>gene_diversity.txt.

    Singletons are not aligned: their nucleotide (.fna) and amino-acid
    (.faa) sequences are rewritten to *_na_aln.fa / *_aa_aln.fa, each with
    an additional *_aln_reduced copy, and a diversity of 0.0 is recorded.

    Parameters:
        fna_file_list: paths of gene cluster nucleotide FASTA files (.fna).
        alignFile_path: string prefix (folder) for gene_diversity.txt and
            for the exported tree/alignment files.
        simple_tree: when False, the tree is built with treetime, followed
            by ancestral reconstruction and refinement; otherwise only the
            plain tree is built.

    A failure on one cluster is reported and the loop goes on with the next.
    """
    # Local import: copying is done in-process instead of through a shell `cp`.
    import shutil

    fasttree_name = 'fasttree' if check_dependency('fasttree') else 'FastTree'
    diversity_filename = alignFile_path+'gene_diversity.txt'
    for gene_cluster_nu_filename in fna_file_list:
        try:
            # extract GC_00002 from path/GC_00002.aln
            clusterID = gene_cluster_nu_filename.split('/')[-1].split('.')[0]
            nu_seqs = read_fasta(gene_cluster_nu_filename)
            if len(nu_seqs) == 1:
                # nothing to align for singletons
                ## na_aln.fa
                gene_cluster_nu_aln_filename = gene_cluster_nu_filename.replace('.fna', '_na_aln.fa')
                ## geneSeqID separator '|' is replaced by '-' for msa viewer compatibility
                with open(gene_cluster_nu_aln_filename, 'wb') as write_file:
                    for SeqID, Sequence in nu_seqs.iteritems():
                        write_in_fa(write_file, SeqID.replace('|', '-'), Sequence)
                shutil.copy(gene_cluster_nu_aln_filename,
                            gene_cluster_nu_aln_filename.replace('_aln', '_aln_reduced'))

                ## aa_aln.fa
                gene_cluster_aa_filename = gene_cluster_nu_filename.replace('.fna', '.faa')
                gene_cluster_aa_aln_filename = gene_cluster_nu_filename.replace('.fna', '_aa_aln.fa')
                ## geneSeqID separator '|' is replaced by '-' for msa viewer compatibility
                with open(gene_cluster_aa_aln_filename, 'wb') as write_file:
                    for SeqID, Sequence in read_fasta(gene_cluster_aa_filename).iteritems():
                        write_in_fa(write_file, SeqID.replace('|', '-'), Sequence)
                shutil.copy(gene_cluster_aa_aln_filename,
                            gene_cluster_aa_aln_filename.replace('_aln', '_aln_reduced'))

                diversity_nuc = '0.0'
            else:
                # align and build tree
                myTree = mpm_tree(gene_cluster_nu_filename)
                myTree.codon_align()
                myTree.translate()
                if simple_tree == False:
                    myTree.build(raxml=False, fasttree_program=fasttree_name, treetime_used=True)
                    myTree.ancestral(translate_tree=True)
                    myTree.refine()
                else:
                    myTree.build(raxml=False, fasttree_program=fasttree_name, treetime_used=False)
                myTree.diversity_statistics_nuc()
                myTree.export(path=alignFile_path)
                diversity_nuc = round(myTree.diversity_nuc, 3)

            # Open/close around the write so the handle is not leaked
            # and the line reaches disk promptly.
            with open(diversity_filename, 'a') as geneDiversity_file:
                geneDiversity_file.write('%s\t%s\n' % (clusterID, diversity_nuc))
        except Exception as exc:
            # Best effort per cluster; KeyboardInterrupt/SystemExit still propagate.
            print("Aligning and tree building of %s failed"%gene_cluster_nu_filename)
            print("  reason: %r" % (exc,))
def single_RNACluster_align_and_makeTree(fa_files_list, alignFile_path, simple_tree):
    """Align each RNA cluster, build its tree and record its diversity.

    For every file in fa_files_list one line "<clusterID>\\t<diversity>" is
    appended to <alignFile_path>gene_diversity.txt.

    Parameters:
        fa_files_list: paths of RNA cluster nucleotide FASTA files (.fna).
        alignFile_path: string prefix (folder) for gene_diversity.txt and
            for the exported tree/alignment files.
        simple_tree: when False, the tree is built with treetime, followed
            by ancestral reconstruction and refinement; otherwise only the
            plain tree is built.

    Errors are not caught here: a failure on any cluster propagates to the
    caller and stops the loop.
    """
    fasttree_name = 'fasttree' if check_dependency('fasttree') else 'FastTree'
    diversity_filename = alignFile_path + 'gene_diversity.txt'
    for RNA_cluster_nu_filename in fa_files_list:
        # extract GC_RNA002 from path/GC_RNA002.aln
        clusterID = RNA_cluster_nu_filename.split('/')[-1].split('.')[0]
        cluster_seqs = read_fasta(RNA_cluster_nu_filename)
        if len(cluster_seqs) == 1:
            # nothing to align for singletons: just rewrite the sequence
            ## na.aln
            RNA_cluster_nu_aln_filename = RNA_cluster_nu_filename.replace(
                '.fna', '_na.aln')
            ## RNA SeqID separator '|' is replaced by '-' for msa viewer compatibility
            with open(RNA_cluster_nu_aln_filename, 'wb') as write_file:
                for SeqID, Sequence in cluster_seqs.iteritems():
                    write_in_fa(write_file, SeqID.replace('|', '-'), Sequence)
            with open(diversity_filename, 'a') as geneDiversity_file:
                geneDiversity_file.write('%s\t%s\n' % (clusterID, '0.0'))
        else:
            # align and build tree
            print(RNA_cluster_nu_filename)
            myTree = mpm_tree(RNA_cluster_nu_filename)
            myTree.align()
            if simple_tree == False:
                myTree.build(raxml=False,
                             fasttree_program=fasttree_name,
                             treetime_used=True)
                myTree.ancestral(translate_tree=True)
                myTree.refine()
            else:
                myTree.build(raxml=False,
                             fasttree_program=fasttree_name,
                             treetime_used=False)
            myTree.diversity_statistics_nuc()
            myTree.export(path=alignFile_path, RNA_specific=True)
            RNA_diversity_values = '{0:.3f}'.format(myTree.diversity_nuc)
            # Open/close around the write so the handle is not leaked
            # and the line reaches disk promptly.
            with open(diversity_filename, 'a') as geneDiversity_file:
                geneDiversity_file.write('%s\t%s\n' %
                                         (clusterID, RNA_diversity_values))
            print('%s %s' % (clusterID, RNA_diversity_values))
# Parse the command line and normalize the main folder to an absolute
# path with a trailing '/'.
params = parser.parse_args()
path = os.path.abspath(params.folder_name)+'/'
if params.steps[0]=='all':
    ## run all steps (1 to 11)
    params.steps=range(1,12)
print('Running panX in main folder: %s'%path)

# Required external programs: alias -> original program name.
# Either of the two names satisfies the check.
programs={'mcl':'mcl', 'mafft':'mafft', 'fasttree':'FastTree', 'raxml':'raxmlHPC'}
if params.diamond_path=='':
    # diamond is only looked up on the search path when no explicit
    # diamond path was given
    programs['diamond']='diamond'
for program_alias, program_name in programs.items():
    ## the alias is tried first, then the original program name
    if check_dependency(program_alias) or check_dependency(program_name):
        continue
    ## the program is not installed: report it and stop
    if program_name=='diamond':
        warning='\ndiamond not found:\nplease make sure that diamond is installed '+\
                'and diamond binary file is included in the executable search path (e.g.: /usr/bin/diamond);\n'+\
                'alternatively, one can specify diamond path via the parameter -dmp (e.g.: ./panX.py -dmp /mypath/diamond -fn ...)'
        print(warning)
    else:
        print('program '+program_name+' not found, please install it.')
    # non-zero status so that calling scripts can detect the failure
    exit(1)
def cutTree_outputCluster(file_list, file_path, cut_branch_threshold,
                          treefile_used):
    """Cut the tree of each cluster at long branches and output the clades
    as new clusters (process flow intended for parallelization).

    Parameters:
        file_list: paths of tree files (.nwk) when treefile_used is True,
            otherwise paths of cluster sequence files from which a quick
            tree is built first.
        file_path: string prefix (folder) for old_clusters_longSplit.txt and
            passed on to output_cutted_clusters.
        cut_branch_threshold: threshold handed to cut_tree_gather_clades and
            output_cutted_clusters.
        treefile_used: True to read existing trees, otherwise build them.

    A tree file that cannot be read is reported and skipped.
    """
    fasttree_name = 'fasttree' if check_dependency('fasttree') else 'FastTree'
    for input_filepath in file_list:
        if treefile_used == True:
            ## read tree
            input_cluster_filename = input_filepath.split('/')[-1].replace(
                '.nwk', '.fna')
            try:
                tree = Phylo.read(input_filepath, 'newick')
            except Exception:
                print('reading tree failed:  %s' % (input_filepath,))
                # Skip this file: otherwise `tree` is undefined or still
                # holds the tree of the previous file.
                continue
        else:
            ## make tree
            input_cluster_filename = input_filepath.split('/')[-1]
            tree = quick_align_makeTree(input_filepath, fasttree_name)

        ## attempt to cut the tree
        gene_list, rest_genes = cut_tree_gather_clades(tree,
                                                       cut_branch_threshold)
        # '_r' in the file name marks a "further-split" cluster that stems
        # from an already split cluster.
        further_split = '_r' in input_cluster_filename

        ## add to-be-deleted cluster records:
        ## the original cluster has been split and is not a further-split one
        if len(gene_list) != 0 and not further_split:
            with open(file_path + 'old_clusters_longSplit.txt',
                      'a') as delete_cluster_file:
                delete_cluster_file.write('%s\n' % input_cluster_filename)

        ## output cut clusters
        if len(gene_list) == 0:
            ## nothing can be cut further for the current tree
            if further_split:
                ## the rest genes cannot be split: they become the gene_list
                gene_list = rest_genes
                rest_genes = []
            ## otherwise the tree does not need to be split; gene_list stays
            ## empty and is still handed to output_cutted_clusters below
        elif len(rest_genes) != 0:
            ## further process the left-over genes
            output_cutted_clusters(file_path,
                                   input_cluster_filename,
                                   rest_genes,
                                   cut_branch_threshold,
                                   treefile_used=False,
                                   cut_leftover=True)

        ## write clades in gene_list into clusters
        output_cutted_clusters(file_path,
                               input_cluster_filename,
                               gene_list,
                               cut_branch_threshold,
                               treefile_used=False,
                               cut_leftover=False)