def create_RNACluster_fa(path, folders_dict):
    """Write one nucleotide FASTA file per RNA cluster.

    input:  'all_RNA_seq.cpk' and 'allclusters.cpk' (both in
            folders_dict['RNA_path']), 'RNAID_to_SeqID.cpk' (in path)
    output: path + 'geneCluster/<clusterID>.fna'
            (RNA clusters are written to the geneCluster folder)

    params:
        path: data folder prefix, concatenated directly with file names
        folders_dict: mapping providing the 'RNA_path' entry

    returns:
        the RNA cluster dict loaded from 'allclusters.cpk'
    """
    RNA_path = folders_dict['RNA_path']
    RNA_dict = load_pickle('%s%s' % (RNA_path, 'all_RNA_seq.cpk'))
    ## diamond_RNACluster_dt: {clusterID:[ count_strains,[memb1,...],count_RNAs }
    diamond_RNACluster_dt = load_pickle(RNA_path + 'allclusters.cpk')
    ## maps RNA ID (strain|locusTag) to the sequence ID used in the fasta header
    RNAID_to_RNASeqID_dict = load_pickle(path + 'RNAID_to_SeqID.cpk')
    fasta_path = path + 'geneCluster/'
    for clusterID, RNA in diamond_RNACluster_dt.iteritems():
        RNA_cluster_nu_filename = "%s%s" % (clusterID, '.fna')
        ## the with-block closes the file even when a lookup or write fails
        with open(fasta_path + RNA_cluster_nu_filename, 'wb') as RNA_cluster_nu_write:
            for RNA_memb in RNA[1]:
                ## RNA_memb format: strain_1|locusTag
                strain_name = RNA_memb.split('|')[0]
                RNA_memb_seq = str(RNA_dict[strain_name][RNA_memb])
                RNASeqID = RNAID_to_RNASeqID_dict[RNA_memb]
                write_in_fa(RNA_cluster_nu_write, RNASeqID, RNA_memb_seq)
    return diamond_RNACluster_dt
def create_geneCluster_fa(path, folders_dict):
    """Write nucleotide (.fna) and amino-acid (.faa) fasta files per gene cluster.

    input:  'allclusters.cpk' (clustering_path), 'all_protein_seq.cpk'
            (protein_path), 'all_nucleotide_seq.cpk' (nucleotide_path),
            'geneID_to_geneSeqID.cpk' (path)
    output: path + 'geneCluster/<clusterID>.fna' and '<clusterID>.faa'

    The geneCluster folder is deleted and recreated, so files from an
    earlier run are discarded.

    params:
        path: data folder prefix, concatenated directly with file names
        folders_dict: mapping providing 'clustering_path', 'protein_path'
            and 'nucleotide_path'
    """
    import shutil

    cluster_seqs_path = path + 'geneCluster/'
    ## make sure the geneCluster folder is empty; done without a shell so that
    ## paths containing spaces or shell metacharacters are handled correctly
    shutil.rmtree(cluster_seqs_path, ignore_errors=True)

    clustering_path = folders_dict['clustering_path']
    geneCluster_dt = load_pickle(clustering_path + 'allclusters.cpk')
    protein_path = folders_dict['protein_path']
    nucleotide_path = folders_dict['nucleotide_path']
    geneID_to_geneSeqID_dict = load_pickle(path + 'geneID_to_geneSeqID.cpk')
    gene_aa_dict = load_pickle('%s%s' % (protein_path, 'all_protein_seq.cpk'))
    gene_na_dict = load_pickle('%s%s' % (nucleotide_path, 'all_nucleotide_seq.cpk'))

    ## create the folder for the cluster fasta files
    if not os.path.isdir(cluster_seqs_path):
        os.makedirs(cluster_seqs_path)

    ## write nuc/aa sequences for each cluster
    for clusterID, gene in geneCluster_dt.iteritems():
        gene_cluster_nu_filename = "%s%s" % (clusterID, '.fna')
        gene_cluster_aa_filename = "%s%s" % (clusterID, '.faa')
        with open(cluster_seqs_path + gene_cluster_nu_filename, 'wb') as gene_cluster_nu_write, \
                open(cluster_seqs_path + gene_cluster_aa_filename, 'wb') as gene_cluster_aa_write:
            for gene_memb in gene[1]:
                ## gene_memb format: strain_1|locusTag
                strain_name = gene_memb.split('|')[0]
                geneSeqID = geneID_to_geneSeqID_dict[gene_memb]
                write_in_fa(gene_cluster_nu_write, geneSeqID,
                            gene_na_dict[strain_name][gene_memb])
                write_in_fa(gene_cluster_aa_write, geneSeqID,
                            gene_aa_dict[strain_name][gene_memb])
def create_RNACluster_fa(path, folders_dict):
    """Write one nucleotide FASTA file per RNA cluster.

    input:  'all_RNA_seq.cpk' and 'allclusters.cpk' (both in
            folders_dict['RNA_path']), 'RNAID_to_SeqID.cpk' (in path)
    output: path + 'geneCluster/<clusterID>.fna'
            (RNA clusters are written to the geneCluster folder)

    params:
        path: data folder prefix, concatenated directly with file names
        folders_dict: mapping providing the 'RNA_path' entry

    returns:
        the RNA cluster dict loaded from 'allclusters.cpk'
    """
    RNA_path = folders_dict['RNA_path']
    RNA_dict = load_pickle('%s%s' % (RNA_path, 'all_RNA_seq.cpk'))
    ## diamond_RNACluster_dt: {clusterID:[ count_strains,[memb1,...],count_RNAs }
    diamond_RNACluster_dt = load_pickle(RNA_path + 'allclusters.cpk')
    ## maps RNA ID (strain|locusTag) to the sequence ID used in the fasta header
    RNAID_to_RNASeqID_dict = load_pickle(path + 'RNAID_to_SeqID.cpk')
    fasta_path = path + 'geneCluster/'
    for clusterID, RNA in diamond_RNACluster_dt.iteritems():
        RNA_cluster_nu_filename = "%s%s" % (clusterID, '.fna')
        ## the with-block closes the file even when a lookup or write fails
        with open(fasta_path + RNA_cluster_nu_filename, 'wb') as RNA_cluster_nu_write:
            for RNA_memb in RNA[1]:
                ## RNA_memb format: strain_1|locusTag
                strain_name = RNA_memb.split('|')[0]
                RNA_memb_seq = str(RNA_dict[strain_name][RNA_memb])
                RNASeqID = RNAID_to_RNASeqID_dict[RNA_memb]
                write_in_fa(RNA_cluster_nu_write, RNASeqID, RNA_memb_seq)
    return diamond_RNACluster_dt
def RNAclusters_align_makeTree(path, folders_dict, parallel, simple_tree):
    """Export RNA clusters as fasta files, align them and build their trees.

    Steps:
      1. write the RNA cluster fasta files (create_RNACluster_fa)
      2. run single_RNACluster_align_and_makeTree on every '*RC*.fna' file
         in path + 'geneCluster/' through multips
      3. back up 'allclusters_postprocessed.cpk', then merge the RNA
         clusters into the gene clusters and refresh the diversity file
    """
    rna_clusters = create_RNACluster_fa(path, folders_dict)

    ## align, build_tree, make_RNATree_json
    cluster_dir = path + 'geneCluster/'
    rna_fasta_files = glob.glob(cluster_dir + "*RC*.fna")
    multips(single_RNACluster_align_and_makeTree, parallel, rna_fasta_files,
            cluster_dir, simple_tree)

    ## keep a copy of the gene clusters before RNA clusters are added
    gene_cluster_dir = '%s%s' % (path, 'protein_faa/diamond_matches/')
    backup_command = (
        'cp %sallclusters_postprocessed.cpk %s/allclusters_postprocessed.cpk.bk '
        % (gene_cluster_dir, gene_cluster_dir))
    os.system(backup_command)

    gene_clusters = load_pickle(gene_cluster_dir + 'allclusters_postprocessed.cpk')
    ## update gene cluster with RNA cluster, then the diversity file
    update_gene_cluster_with_RNA(path, rna_clusters, gene_clusters)
    update_diversity_cpk(path)
def postprocess_paralogs_iterative(parallel, path, nstrains, simple_tree,
                                   paralog_branch_cutoff,
                                   disable_long_branch_splitting,
                                   paralog_frac_cutoff=0.3, plot=0):
    """Split paralogous clusters repeatedly until no cluster is split anymore.

    Loads 'allclusters_postprocessed.cpk' (or 'allclusters.cpk' when
    disable_long_branch_splitting is set) from
    path + 'protein_faa/diamond_matches/', calls postprocess_paralogs until it
    reports zero split clusters, then updates the diversity and gene-cluster
    cpk files.

    params:
        parallel, path, nstrains, simple_tree: forwarded to postprocess_paralogs
        paralog_branch_cutoff, paralog_frac_cutoff: forwarded as keywords
        disable_long_branch_splitting: selects which cluster file is loaded
        plot: forwarded to every pass after the first one
    """
    cluster_path = path + 'protein_faa/diamond_matches/'
    if disable_long_branch_splitting:
        clusters_need_split = 'allclusters.cpk'
    else:
        clusters_need_split = 'allclusters_postprocessed.cpk'
    geneCluster_dt = load_pickle(cluster_path + clusters_need_split)

    ## folder that contains old split clusters in paralog splitting step
    geneClusters_fpath = path + 'geneCluster/'
    paralog_splits_path = geneClusters_fpath + 'paralog_splits/'
    if not os.path.isdir(paralog_splits_path):
        os.makedirs(paralog_splits_path)

    ## start with a fresh record of the clusters that got split
    old_clusters_fpath = geneClusters_fpath + 'old_clusters_paralogSplit.txt'
    if os.path.exists(old_clusters_fpath):
        os.remove(old_clusters_fpath)

    ## NOTE(review): the first pass hard-codes plot=0 and ignores the `plot`
    ## argument; kept as in the original -- confirm this is intended
    n_split_clusters, new_fa_files_set = postprocess_paralogs(
        parallel, path, nstrains, simple_tree, geneCluster_dt, set(),
        paralog_branch_cutoff=paralog_branch_cutoff,
        paralog_frac_cutoff=paralog_frac_cutoff, plot=0)
    iteration = 0
    while n_split_clusters:
        print('---- split a total of  %s in iteration %s'
              % (n_split_clusters, iteration))
        n_split_clusters, new_fa_files_set = postprocess_paralogs(
            parallel, path, nstrains, simple_tree, geneCluster_dt,
            new_fa_files_set,
            paralog_branch_cutoff=paralog_branch_cutoff,
            paralog_frac_cutoff=paralog_frac_cutoff, plot=plot)
        iteration += 1

    ## write gene_diversity_Dt cpk file
    update_diversity_cpk(path)

    ## remove old gene cluster and create new split cluster
    update_geneCluster_cpk(path, geneCluster_dt)

    if os.path.exists(old_clusters_fpath):
        with open(old_clusters_fpath, 'r') as delete_cluster_file:
            ## one line per split cluster
            deleted_file_count = sum(1 for _ in delete_cluster_file)
        print('#clusters split during the checking paralogy: %s'
              % deleted_file_count)
def update_geneCluster_dt(path, geneCluster_dt):
    """Merge the statistics of newly split clusters into geneCluster_dt.

    Every '*.cpk' file in path + 'geneCluster/update_long_branch_splits/'
    is loaded and its entries are copied into geneCluster_dt (updated in
    place; existing keys are overwritten).
    """
    split_stats_dir = path + 'geneCluster/update_long_branch_splits/'
    for cpk_fpath in glob.iglob(split_stats_dir + '*.cpk'):
        new_clusters = load_pickle(cpk_fpath)
        geneCluster_dt.update(new_clusters)
def load_sorted_clusters(path):
    '''
    load the post-processed gene clusters as a sorted list

    Clusters are read from
    path + 'protein_faa/diamond_matches/allclusters_postprocessed.cpk'
    and sorted by decreasing v[0], ties broken by increasing v[2]
    (elsewhere in this file the value layout is documented as
    [count_strains, [memb1,...], count_genes]).

    returns: list of (clusterID, cluster_value) tuples
    '''
    geneClusterPath = '%s%s' % (path, 'protein_faa/diamond_matches/')
    geneCluster_dt = load_pickle(geneClusterPath + 'allclusters_postprocessed.cpk')

    def abundance_key(item):
        # item is (clusterID, value); the minus sign gives decreasing order
        # on the first entry while keeping an ascending sort overall
        cluster = item[1]
        return (-cluster[0], cluster[2])

    return sorted(geneCluster_dt.iteritems(), key=abundance_key)
def update_geneCluster_dt(path, geneCluster_dt):
    """Add the cluster statistics stored in update_long_branch_splits.

    geneCluster_dt is modified in place: each entry found in a '*.cpk' file
    of path + 'geneCluster/update_long_branch_splits/' is inserted,
    replacing any entry with the same key.
    """
    splits_folder = '%s%s' % (path, 'geneCluster/update_long_branch_splits/')
    pickle_files = glob.iglob(splits_folder + '*.cpk')
    for pickle_file in pickle_files:
        split_clusters = load_pickle(pickle_file)
        for cluster_id in split_clusters:
            geneCluster_dt[cluster_id] = split_clusters[cluster_id]
def make_genepresence_alignment(path, disable_gain_loss, merged_gain_loss_output):
    '''
    loop over all gene clusters and append 0/1 to strain specific string
    used as pseudo alignment of gene presence absence

    output (all in path + 'geneCluster/'):
        genePresence.aln, dt_genePresence.cpk
        when disable_gain_loss is set: dt_geneEvents.cpk (0 events per
        cluster) plus either geneGainLossEvent.json
        (merged_gain_loss_output) or one '<clusterID>_patterns.json' per
        cluster
    '''
    output_path = '%s%s' % (path, 'geneCluster/')
    ## load strain list and prepare for gene presence/absence
    strain_list = load_pickle('%s%s' % (path, 'strain_list.cpk'))
    set_totalStrain = set(strain_list)
    totalStrain = len(set_totalStrain)
    dt_strainGene = defaultdict(str)
    sorted_genelist = load_sorted_clusters(path)
    ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    for clusterID, gene in sorted_genelist:
        ## append 0/1 to each strain
        create_genePresence(dt_strainGene, totalStrain, set_totalStrain, gene[1])

    with open('%s%s' % (output_path, 'genePresence.aln'), 'wb') as presence_outfile:
        for istkey in dt_strainGene:
            write_in_fa(presence_outfile, istkey, dt_strainGene[istkey])
    write_pickle('%s%s' % (output_path, 'dt_genePresence.cpk'), dt_strainGene)

    ## NOTE(review): the whole export below is kept inside the
    ## disable_gain_loss branch -- confirm against the unflattened source
    if disable_gain_loss:
        geneEvents_dt = {i: 0 for i in range(len(sorted_genelist))}
        write_pickle('%s%s' % (output_path, 'dt_geneEvents.cpk'), geneEvents_dt)
        if merged_gain_loss_output:
            gene_loss_fname = '%s%s' % (output_path, 'geneGainLossEvent.json')
            write_json(dt_strainGene, gene_loss_fname, indent=1)
        else:
            ## strainID as key, presence pattern as value (converted into np.array)
            keylist = sorted(dt_strainGene)
            strainID_keymap = {ind: k for ind, k in enumerate(keylist)}
            ## one row per strain, one single-character column per cluster
            presence_arr = np.array([np.array(dt_strainGene[k], 'c') for k in keylist])
            ## NOTE(review): original comment read "0: present, 3: absent",
            ## but the code recodes '1' as '3' -- confirm the intended encoding
            presence_arr[presence_arr == '1'] = '3'
            for ind, (clusterID, gene) in enumerate(sorted_genelist):
                pattern_dt = {strainID_keymap[strain_ind]: str(patt)
                              for strain_ind, patt in enumerate(presence_arr[:, ind])}
                pattern_fname = '%s%s_patterns.json' % (output_path, clusterID)
                write_json(pattern_dt, pattern_fname, indent=1)
def integrate_clusters(clustering_path, cluster_fpath):
    """Expand the final-round clusters back to the original gene IDs.

    Each '*_dicts.cpk' file in clustering_path maps representative IDs to
    original gene IDs for one subproblem run. Every line of
    'subproblem_finalRound_cluster.output' (tab-separated representative
    IDs) is rewritten to cluster_fpath as the tab-separated original gene
    IDs of all its representatives.
    """
    ## subproblem run number -> {representative ID: original gene IDs}
    origin_by_run = defaultdict()
    for dict_fpath in glob.iglob(clustering_path + "*_dicts.cpk"):
        dict_fname = dict_fpath.split('/')[-1]
        run_number = dict_fname.split('subproblem_')[1].split('_')[0]
        origin_by_run[run_number] = load_pickle(dict_fpath)

    final_round_fpath = '%s%s' % (clustering_path, 'subproblem_finalRound_cluster.output')
    with open(final_round_fpath) as finalRound_cluster, \
            open(cluster_fpath, 'wb') as integrated_cluster:
        for iline in finalRound_cluster:
            gene_ids = []
            for representativeID in iline.rstrip().split('\t'):
                ## the run number is encoded in the representative ID after 'GCs'
                run_number = representativeID.split('GCs')[1].split('_')[0]
                gene_ids.extend(origin_by_run[run_number][representativeID])
            integrated_cluster.write('%s\n' % '\t'.join(gene_ids))
def integrate_clusters(clustering_path, cluster_fpath):
    """Integrate all clusters into one file of original gene IDs.

    Reads the per-subproblem '*_dicts.cpk' lookups (representative ID ->
    original gene IDs) from clustering_path and translates every line of
    'subproblem_finalRound_cluster.output' into a tab-separated line of
    original gene IDs written to cluster_fpath.
    """
    ## representative ID as key, original gene IDs as value (per run number)
    lookup_per_run = defaultdict()
    for cpk_path in glob.iglob(clustering_path + "*_dicts.cpk"):
        basename = cpk_path.split('/')[-1]
        after_prefix = basename.split('subproblem_')[1]
        lookup_per_run[after_prefix.split('_')[0]] = load_pickle(cpk_path)

    def original_gene_ids(line):
        ## collect the members of every representative listed on the line
        members = []
        for rep_id in line.rstrip().split('\t'):
            run_id = rep_id.split('GCs')[1].split('_')[0]
            for gene_id in lookup_per_run[run_id][rep_id]:
                members.append(gene_id)
        return members

    with open('%s%s' % (clustering_path, 'subproblem_finalRound_cluster.output')) \
            as finalRound_cluster, \
            open(cluster_fpath, 'wb') as integrated_cluster:
        for iline in finalRound_cluster:
            integrated_cluster.write('%s\n' % '\t'.join(original_gene_ids(iline)))
def make_genepresence_alignment(path, disable_gain_loss, merged_gain_loss_output):
    '''
    loop over all gene clusters and append 0/1 to strain specific string
    used as pseudo alignment of gene presence absence

    output (all in path + 'geneCluster/'):
        genePresence.aln, dt_genePresence.cpk
        when disable_gain_loss is set: dt_geneEvents.cpk (0 events per
        cluster) plus either geneGainLossEvent.json
        (merged_gain_loss_output) or one '<clusterID>_patterns.json' per
        cluster
    '''
    output_path = '%s%s' % (path, 'geneCluster/')
    ## load strain list and prepare for gene presence/absence
    strain_list = load_pickle('%s%s' % (path, 'strain_list.cpk'))
    set_totalStrain = set(strain_list)
    totalStrain = len(set_totalStrain)
    dt_strainGene = defaultdict(str)
    sorted_genelist = load_sorted_clusters(path)
    ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    for clusterID, gene in sorted_genelist:
        ## append 0/1 to each strain
        create_genePresence(dt_strainGene, totalStrain, set_totalStrain, gene[1])

    with open('%s%s' % (output_path, 'genePresence.aln'), 'wb') as presence_outfile:
        for istkey in dt_strainGene:
            write_in_fa(presence_outfile, istkey, dt_strainGene[istkey])
    write_pickle('%s%s' % (output_path, 'dt_genePresence.cpk'), dt_strainGene)

    ## NOTE(review): the whole export below is kept inside the
    ## disable_gain_loss branch -- confirm against the unflattened source
    if disable_gain_loss:
        geneEvents_dt = {i: 0 for i in range(len(sorted_genelist))}
        write_pickle('%s%s' % (output_path, 'dt_geneEvents.cpk'), geneEvents_dt)
        if merged_gain_loss_output:
            gene_loss_fname = '%s%s' % (output_path, 'geneGainLossEvent.json')
            write_json(dt_strainGene, gene_loss_fname, indent=1)
        else:
            ## strainID as key, presence pattern as value (converted into np.array)
            keylist = sorted(dt_strainGene)
            strainID_keymap = {ind: k for ind, k in enumerate(keylist)}
            ## one row per strain, one single-character column per cluster
            presence_arr = np.array([np.array(dt_strainGene[k], 'c') for k in keylist])
            ## NOTE(review): original comment read "0: present, 3: absent",
            ## but the code recodes '1' as '3' -- confirm the intended encoding
            presence_arr[presence_arr == '1'] = '3'
            for ind, (clusterID, gene) in enumerate(sorted_genelist):
                pattern_dt = {strainID_keymap[strain_ind]: str(patt)
                              for strain_ind, patt in enumerate(presence_arr[:, ind])}
                pattern_fname = '%s%s_patterns.json' % (output_path, clusterID)
                write_json(pattern_dt, pattern_fname, indent=1)
def RNAclusters_align_makeTree(path, folders_dict, parallel, simple_tree):
    """
    create RNA clusters as nucleotide fasta files
    and build individual RNA trees based on fna files

    Afterwards 'allclusters_postprocessed.cpk' is backed up (suffix '.bk'),
    the RNA clusters are merged into the gene clusters and the diversity
    file is updated.
    """
    RNA_clusters = create_RNACluster_fa(path, folders_dict)

    ## align, build_tree, make_RNATree_json
    fasta_dir = '%s%s' % (path, 'geneCluster/')
    RNA_fna_files = glob.glob(fasta_dir + "*RC*.fna")
    multips(single_RNACluster_align_and_makeTree, parallel, RNA_fna_files,
            fasta_dir, simple_tree)

    ## back up and load the gene clusters
    matches_dir = path + 'protein_faa/diamond_matches/'
    os.system(
        'cp %sallclusters_postprocessed.cpk %s/allclusters_postprocessed.cpk.bk '
        % (matches_dir, matches_dir))
    gene_clusters = load_pickle(matches_dir + 'allclusters_postprocessed.cpk')

    ## add the RNA clusters, then refresh the diversity file
    update_gene_cluster_with_RNA(path, RNA_clusters, gene_clusters)
    update_diversity_cpk(path)
def __init__(self, **kwargs):
    """Store all keyword arguments as attributes and set up the folder layout.

    After the default sub-folder names are registered in self.folders_dict,
    organize_folders() and specify_filepath() are called. If the strain
    list file (self.fpaths_dict['strain_cpk']) exists, self.strain_list and
    self.nstrains are set from it.
    """
    for attr_name in kwargs:
        setattr(self, attr_name, kwargs[attr_name])

    ## default sub-folder names; unknown keys resolve to '' (defaultdict(str))
    default_folders = {
        'gbk_path': 'input_GenBank/',
        'nucleotide_path': 'nucleotide_fna/',
        'protein_path': 'protein_faa/',
        'clustering_path': 'protein_faa/diamond_matches/',
        'RNA_path': 'RNA_fna/',
        'cluster_seq_path': 'geneCluster/',
        'tmp_core_seq_path': 'tmp_core/',
        'vis_json_path': 'vis/',
        'vis_cluster_path': 'vis/geneCluster/',
        'log_path': 'log/',
    }
    self.folders_dict = defaultdict(str, default_folders)

    # set up folder structure and files names
    self.organize_folders()
    self.specify_filepath()

    strain_cpk_fpath = self.fpaths_dict['strain_cpk']
    if os.path.exists(strain_cpk_fpath):
        self.strain_list = load_pickle(strain_cpk_fpath)
        self.nstrains = len(self.strain_list)
def __init__(self, **kwargs):
    """Initialise the object from keyword arguments.

    Each keyword becomes an attribute. The default folder names are then
    registered, the folder structure and file paths are set up, and the
    strain list (with its length as nstrains) is loaded when the
    'strain_cpk' file already exists.
    """
    for name, value in kwargs.iteritems():
        setattr(self, name, value)

    folders = defaultdict(str)
    folders['gbk_path'] = 'input_GenBank/'
    folders['nucleotide_path'] = 'nucleotide_fna/'
    folders['protein_path'] = 'protein_faa/'
    folders['clustering_path'] = 'protein_faa/diamond_matches/'
    folders['RNA_path'] = 'RNA_fna/'
    folders['cluster_seq_path'] = 'geneCluster/'
    folders['tmp_core_seq_path'] = 'tmp_core/'
    folders['vis_json_path'] = 'vis/'
    folders['vis_cluster_path'] = 'vis/geneCluster/'
    folders['log_path'] = 'log/'
    self.folders_dict = folders

    # set up folder structure and files names
    self.organize_folders()
    self.specify_filepath()

    if not os.path.exists(self.fpaths_dict['strain_cpk']):
        return
    self.strain_list = load_pickle(self.fpaths_dict['strain_cpk'])
    self.nstrains = len(self.strain_list)
def postprocess_paralogs_iterative(parallel, path, nstrains, simple_tree,
                                   paralog_branch_cutoff,
                                   disable_long_branch_splitting,
                                   paralog_frac_cutoff=0.3, plot=0):
    """Split paralogous clusters repeatedly until no cluster is split anymore.

    Loads 'allclusters_postprocessed.cpk' (or 'allclusters.cpk' when
    disable_long_branch_splitting is set) from
    path + 'protein_faa/diamond_matches/', calls postprocess_paralogs until it
    reports zero split clusters, then updates the diversity and gene-cluster
    cpk files.

    params:
        parallel, path, nstrains, simple_tree: forwarded to postprocess_paralogs
        paralog_branch_cutoff, paralog_frac_cutoff: forwarded as keywords
        disable_long_branch_splitting: selects which cluster file is loaded
        plot: forwarded to every pass after the first one
    """
    cluster_path = path + 'protein_faa/diamond_matches/'
    if disable_long_branch_splitting:
        clusters_need_split = 'allclusters.cpk'
    else:
        clusters_need_split = 'allclusters_postprocessed.cpk'
    geneCluster_dt = load_pickle(cluster_path + clusters_need_split)

    ## folder that contains old split clusters in paralog splitting step
    geneClusters_fpath = path + 'geneCluster/'
    paralog_splits_path = geneClusters_fpath + 'paralog_splits/'
    if not os.path.isdir(paralog_splits_path):
        os.makedirs(paralog_splits_path)

    ## start with a fresh record of the clusters that got split
    old_clusters_fpath = geneClusters_fpath + 'old_clusters_paralogSplit.txt'
    if os.path.exists(old_clusters_fpath):
        os.remove(old_clusters_fpath)

    ## NOTE(review): the first pass hard-codes plot=0 and ignores the `plot`
    ## argument; kept as in the original -- confirm this is intended
    n_split_clusters, new_fa_files_set = postprocess_paralogs(
        parallel, path, nstrains, simple_tree, geneCluster_dt, set(),
        paralog_branch_cutoff=paralog_branch_cutoff,
        paralog_frac_cutoff=paralog_frac_cutoff, plot=0)
    iteration = 0
    while n_split_clusters:
        print('---- split a total of  %s in iteration %s'
              % (n_split_clusters, iteration))
        n_split_clusters, new_fa_files_set = postprocess_paralogs(
            parallel, path, nstrains, simple_tree, geneCluster_dt,
            new_fa_files_set,
            paralog_branch_cutoff=paralog_branch_cutoff,
            paralog_frac_cutoff=paralog_frac_cutoff, plot=plot)
        iteration += 1

    ## write gene_diversity_Dt cpk file
    update_diversity_cpk(path)

    ## remove old gene cluster and create new split cluster
    update_geneCluster_cpk(path, geneCluster_dt)

    if os.path.exists(old_clusters_fpath):
        with open(old_clusters_fpath, 'r') as delete_cluster_file:
            ## one line per split cluster
            deleted_file_count = sum(1 for _ in delete_cluster_file)
        print('#clusters split during the checking paralogy: %s'
              % deleted_file_count)
def create_core_SNP_matrix(path, core_cutoff=1.0, core_gene_strain_fpath=''):
    """ create SNP matrix using core gene SNPs
        input: strain_list.cpk (in path), sorted gene clusters from
            load_sorted_clusters(path), '<clusterID>_na_aln.fa' alignments
            in path+'geneCluster/'
        output (all in path+'geneCluster/'): core_geneList.txt,
            core_geneList.cpk, snp_pos.cpk, SNP_whole_matrix.aln
        core_cutoff: percentage of strains used to decide whether a gene is core
            default: 1.0 (strictly core gene, which is present in all strains)
            customized: 0.9 ( soft core, considered as core if present in 90% of strains)
        core_gene_strain_fpath: optional file with one strain name per line
            ('-' is replaced by '_'); clusters lacking any listed strain are
            not used as core genes
    """
    import os, sys, operator
    import numpy as np
    import numpy.ma as ma
    from collections import defaultdict
    from sf_miscellaneous import read_fasta, write_pickle, load_pickle, write_in_fa
    alnFilePath = '%s%s' % (path, 'geneCluster/')
    output_path = alnFilePath
    ## create core gene list
    corelist = []
    strain_list = load_pickle(path + 'strain_list.cpk')
    totalStrain = len(strain_list)
    sorted_geneList = load_sorted_clusters(path)
    if core_gene_strain_fpath != '':
        ## strains that every accepted core gene must contain
        with open(core_gene_strain_fpath, 'rb') as core_gene_strain_file:
            core_strain_set = set(
                [i.rstrip().replace('-', '_') for i in core_gene_strain_file])
    with open(output_path + 'core_geneList.txt', 'wb') as outfile:
        for clusterID, vg in sorted_geneList:
            ## minimal number of strains for a (soft) core gene
            if core_cutoff == 1.0:
                strain_core_cutoff = totalStrain
            else:
                strain_core_cutoff = int(totalStrain * core_cutoff)
            ## vg[0]==vg[2]: first and third cluster statistics agree
            ## (presumably #strains == #genes, i.e. no duplicates -- see the
            ## cluster layout documented elsewhere in this file)
            if vg[0] == vg[2] and vg[0] >= strain_core_cutoff:
                coreGeneName = '%s%s' % (clusterID, '_na_aln.fa')
                ## sequences might be discarded because of premature stops
                coreGeneName_path = alnFilePath + coreGeneName
                if os.path.exists(coreGeneName_path) and len(
                        read_fasta(coreGeneName_path)) >= strain_core_cutoff:
                    ## skip clusters that miss any of the required strains
                    if core_gene_strain_fpath != '' and len(
                            core_strain_set -
                            set([i.split('|')[0] for i in vg[1]])) != 0:
                        continue
                    outfile.write(coreGeneName + '\n')
                    corelist.append(coreGeneName)
                else:
                    #print '%s%s%s'%('warning: ',coreGeneName_path,' is not a core gene')
                    pass
    write_pickle(output_path + 'core_geneList.cpk', corelist)

    ## sorted strain names label the rows of the final SNP matrix
    refSeqList = load_pickle(path + 'strain_list.cpk')
    refSeqList.sort()
    snp_fre_lst = []  # not used below
    snp_wh_matrix_flag = 0  # becomes 1 once the first gene has been added
    snp_pos_dt = defaultdict(list)  # alignment file name -> SNP column indices
    snp_whole_matrix = np.array([])
    snps_by_gene = []  # not used below
    for align_file in corelist:  ## core genes
        nuc_array = np.array([])  # array to store nucleotides for each gene
        gene_seq_dt = read_fasta(alnFilePath + align_file)
        if core_cutoff != 1.0:
            # set sequences for missing gene (space*gene_length)
            missing_gene_seq = ' ' * len(gene_seq_dt.values()[0])
            totalStrain_sorted_lst = sorted(strain_list)
        # build strain_seq_dt from gene_seq_dt
        strain_seq_dt = defaultdict()
        for gene, seq in gene_seq_dt.iteritems():
            strain_seq_dt[gene.split('-')[0]] = seq  # strain-locus_tag-...
        strain_seq_sorted_lst = sorted(strain_seq_dt.items(),
                                       key=lambda x: x[0])
        start_flag = 0  # 0 until the first row of nuc_array has been set
        if core_cutoff == 1.0:
            ## strict core: stack one row per strain, ordered by strain name
            for ka, va in strain_seq_sorted_lst:
                if start_flag == 0:
                    nuc_array = np.array(np.fromstring(va, dtype='S1'))
                    start_flag = 1
                else:
                    nuc_array = np.vstack(
                        (nuc_array, np.fromstring(va, dtype='S1')))
            ## find SNP positions: columns that differ from the first row
            ## somewhere and contain no gap character
            position_polymorphic = np.any(nuc_array != nuc_array[0, :], axis=0)
            position_has_gap = np.any(nuc_array == '-', axis=0)
            position_SNP = position_polymorphic & (~position_has_gap)
            snp_columns = nuc_array[:, position_SNP]
            snp_pos_dt[align_file] = np.where(position_SNP)[0]
        else:
            ## add '-' for missing genes when dealing with soft core genes
            core_gene_strain = [gene for gene in strain_seq_dt.keys()]
            ## one row per strain of the full list; strains without the gene
            ## get a row of spaces
            for strain in totalStrain_sorted_lst:
                if start_flag == 0:
                    if strain in core_gene_strain:
                        nuc_array = np.array(
                            np.fromstring(strain_seq_dt[strain], dtype='S1'))
                    else:
                        print 'Soft core gene: gene absent in strain %s on cluster %s' % (
                            strain, align_file)
                        nuc_array = np.array(
                            np.fromstring(missing_gene_seq, dtype='S1'))
                    start_flag = 1
                else:
                    if strain in core_gene_strain:
                        nuc_array = np.vstack(
                            (nuc_array,
                             np.fromstring(strain_seq_dt[strain], dtype='S1')))
                    else:
                        print 'Soft core gene: gene absent in strain %s on cluster %s' % (
                            strain, align_file)
                        nuc_array = np.vstack(
                            (nuc_array,
                             np.fromstring(missing_gene_seq, dtype='S1')))
            ## find SNP positions
            ## mask missing genes -- determine rows that have ' ' in every column
            is_missing = np.all(nuc_array == ' ', axis=1)
            masked_non_missing_array = np.ma.masked_array(
                nuc_array, nuc_array == ' ')
            position_polymorphic = np.any(
                masked_non_missing_array != masked_non_missing_array[0, :],
                axis=0)
            position_has_gap = np.any(masked_non_missing_array == '-', axis=0)
            position_SNP = position_polymorphic & (~position_has_gap)
            # the below seems duplicated from 5 lines above??
            if is_missing.sum() > 0:  # with missing genes
                ## rows of absent genes are written out as gaps
                nuc_array[is_missing] = '-'
            snp_columns = nuc_array[:, position_SNP]
            snp_pos_dt[align_file] = np.where(position_SNP)[0]
            #print snp_columns
        ## append the SNP columns of this gene to the whole matrix
        if snp_wh_matrix_flag == 0:
            snp_whole_matrix = snp_columns
            snp_wh_matrix_flag = 1
        else:
            snp_whole_matrix = np.hstack((snp_whole_matrix, snp_columns))
    write_pickle(output_path + 'snp_pos.cpk', snp_pos_dt)
    ## one fasta record per matrix row, labelled with the sorted strain names
    with open(output_path + 'SNP_whole_matrix.aln', 'wb') as outfile:
        for ind, isw in enumerate(snp_whole_matrix):
            write_in_fa(outfile, refSeqList[ind], isw.tostring())
def create_core_SNP_matrix(path, core_cutoff=1.0, core_gene_strain_fpath=''):
    """ create SNP matrix using core gene SNPs
        input: strain_list.cpk (in path), sorted gene clusters from
            load_sorted_clusters(path), '<clusterID>_na_aln.fa' alignments
            in path+'geneCluster/'
        output (all in path+'geneCluster/'): core_geneList.txt,
            core_geneList.cpk, snp_pos.cpk, SNP_whole_matrix.aln
        core_cutoff: percentage of strains used to decide whether a gene is core
            default: 1.0 (strictly core gene, which is present in all strains)
            customized: 0.9 ( soft core, considered as core if present in 90% of strains)
        core_gene_strain_fpath: optional file with one strain name per line
            ('-' is replaced by '_'); clusters lacking any listed strain are
            not used as core genes
    """
    import os,sys,operator
    import numpy as np
    import numpy.ma as ma
    from collections import defaultdict
    from sf_miscellaneous import read_fasta, write_pickle, load_pickle, write_in_fa
    alnFilePath='%s%s'%(path,'geneCluster/')
    output_path= alnFilePath
    ## create core gene list
    corelist=[]
    strain_list=load_pickle(path+'strain_list.cpk')
    totalStrain= len(strain_list)
    sorted_geneList = load_sorted_clusters(path)
    if core_gene_strain_fpath!='':
        ## strains that every accepted core gene must contain
        with open(core_gene_strain_fpath,'rb') as core_gene_strain_file:
            core_strain_set= set([i.rstrip().replace('-','_') for i in core_gene_strain_file])
    with open(output_path+'core_geneList.txt','wb') as outfile:
        for clusterID, vg in sorted_geneList:
            ## minimal number of strains for a (soft) core gene
            if core_cutoff==1.0:
                strain_core_cutoff=totalStrain
            else:
                strain_core_cutoff=int(totalStrain*core_cutoff)
            ## vg[0]==vg[2]: first and third cluster statistics agree
            ## (presumably #strains == #genes, i.e. no duplicates -- see the
            ## cluster layout documented elsewhere in this file)
            if vg[0]==vg[2] and vg[0]>=strain_core_cutoff:
                coreGeneName='%s%s'%(clusterID,'_na_aln.fa')
                ## sequences might be discarded because of premature stops
                coreGeneName_path= alnFilePath+coreGeneName
                if os.path.exists(coreGeneName_path) and len(read_fasta(coreGeneName_path)) >= strain_core_cutoff:
                    ## skip clusters that miss any of the required strains
                    if core_gene_strain_fpath!='' and len(core_strain_set-set([i.split('|')[0] for i in vg[1]]))!=0:
                        continue
                    outfile.write(coreGeneName+'\n')
                    corelist.append(coreGeneName)
                else:
                    #print '%s%s%s'%('warning: ',coreGeneName_path,' is not a core gene')
                    pass
    write_pickle(output_path+'core_geneList.cpk',corelist)

    ## sorted strain names label the rows of the final SNP matrix
    refSeqList=load_pickle(path+'strain_list.cpk');refSeqList.sort()
    ## snp_fre_lst is not used below; the flag becomes 1 once a gene was added
    snp_fre_lst=[]; snp_wh_matrix_flag=0
    ## snp_pos_dt: alignment file name -> SNP column indices
    snp_pos_dt=defaultdict(list); snp_whole_matrix=np.array([])
    snps_by_gene=[] # not used below
    for align_file in corelist:## core genes
        nuc_array=np.array([]) # array to store nucleotides for each gene
        gene_seq_dt=read_fasta(alnFilePath+align_file)
        if core_cutoff!=1.0:
            # set sequences for missing gene (space*gene_length)
            missing_gene_seq=' '*len(gene_seq_dt.values()[0])
            totalStrain_sorted_lst=sorted(strain_list)
        # build strain_seq_dt from gene_seq_dt
        strain_seq_dt=defaultdict()
        for gene, seq in gene_seq_dt.iteritems():
            strain_seq_dt[gene.split('-')[0]]=seq # strain-locus_tag-...
        strain_seq_sorted_lst=sorted(strain_seq_dt.items(), key=lambda x: x[0])
        start_flag=0 # 0 until the first row of nuc_array has been set
        if core_cutoff==1.0:
            ## strict core: stack one row per strain, ordered by strain name
            for ka, va in strain_seq_sorted_lst:
                if start_flag==0:
                    nuc_array=np.array(np.fromstring(va, dtype='S1'))
                    start_flag=1
                else:
                    nuc_array=np.vstack((nuc_array,np.fromstring(va, dtype='S1')))
            ## find SNP positions: columns that differ from the first row
            ## somewhere and contain no gap character
            position_polymorphic = np.any(nuc_array != nuc_array[0, :], axis = 0)
            position_has_gap = np.any(nuc_array=='-', axis=0)
            position_SNP = position_polymorphic&(~position_has_gap)
            snp_columns = nuc_array[:,position_SNP]
            snp_pos_dt[align_file]=np.where(position_SNP)[0]
        else:
            ## add '-' for missing genes when dealing with soft core genes
            core_gene_strain=[ gene for gene in strain_seq_dt.keys()]
            ## one row per strain of the full list; strains without the gene
            ## get a row of spaces
            for strain in totalStrain_sorted_lst:
                if start_flag==0:
                    if strain in core_gene_strain:
                        nuc_array=np.array(np.fromstring(strain_seq_dt[strain], dtype='S1'))
                    else:
                        print 'Soft core gene: gene absent in strain %s on cluster %s'%(strain,align_file)
                        nuc_array=np.array(np.fromstring(missing_gene_seq, dtype='S1'))
                    start_flag=1
                else:
                    if strain in core_gene_strain:
                        nuc_array=np.vstack((nuc_array,np.fromstring(strain_seq_dt[strain], dtype='S1')))
                    else:
                        print 'Soft core gene: gene absent in strain %s on cluster %s'%(strain,align_file)
                        nuc_array=np.vstack((nuc_array,np.fromstring(missing_gene_seq, dtype='S1')))
            ## find SNP positions
            ## mask missing genes -- determine rows that have ' ' in every column
            is_missing = np.all(nuc_array==' ',axis=1)
            masked_non_missing_array= np.ma.masked_array(nuc_array, nuc_array==' ')
            position_polymorphic = np.any(masked_non_missing_array!= masked_non_missing_array[0, :],axis = 0)
            position_has_gap = np.any(masked_non_missing_array=='-',axis=0)
            position_SNP = position_polymorphic&(~position_has_gap)
            # the below seems duplicated from 5 lines above??
            if is_missing.sum()>0: # with missing genes
                ## rows of absent genes are written out as gaps
                nuc_array[is_missing]='-'
            snp_columns = nuc_array[:,position_SNP]
            snp_pos_dt[align_file]=np.where(position_SNP)[0]
            #print snp_columns
        ## append the SNP columns of this gene to the whole matrix
        if snp_wh_matrix_flag==0:
            snp_whole_matrix=snp_columns; snp_wh_matrix_flag=1
        else:
            snp_whole_matrix=np.hstack((snp_whole_matrix, snp_columns))
    write_pickle(output_path+'snp_pos.cpk',snp_pos_dt)
    ## one fasta record per matrix row, labelled with the sorted strain names
    with open(output_path+'SNP_whole_matrix.aln','wb') as outfile:
        for ind, isw in enumerate(snp_whole_matrix):
            write_in_fa( outfile, refSeqList[ind], isw.tostring() )
def clustering_protein(path, folders_dict, threads,
                       blast_fpath, roary_fpath, orthofinder_fpath, other_tool_fpath,
                       diamond_evalue, diamond_max_target_seqs, diamond_identity,
                       diamond_query_cover, diamond_subject_cover, diamond_path,
                       mcl_inflation):
    '''
    Procedure: all-against-all protein comparison + hits filtering + mcl clustering
    By default: DIAMOND -> BS -> MCL
    Alternatives:
    1. Blastp output (user-provided) -> BS -> MCL
    2. Roary
    3. OrthoFinder
    4. Other tools.
    params:
        path: path to directory including data and output
        folders_dict: mapping providing 'protein_path' and 'clustering_path'
        threads: number of parallel threads used to run diamond
        blast_fpath: gene clusters by all-vs-all blast
                     comparison and other clusterings methods
        roary_fpath: gene clusters by roary
        orthofinder_fpath: gene clusters by OrthoFinder
        other_tool_fpath: cluster file of another tool, copied as it is
        diamond_max_target_seqs: Diamond setting: the maximum number of target sequences
                                 per query to keep alignments for. Default:600
                                 #strain * #max_duplication= 50*10= 500
        (the string 'none' marks an alternative input as not provided)
    returns:
        the result of parse_geneCluster on the final 'allclusters.tsv'
    '''
    import glob
    import shutil

    threads = str(threads)
    protein_path = folders_dict['protein_path']
    clustering_path = folders_dict['clustering_path']
    cluster_fpath = '%s%s' % (clustering_path, 'allclusters.tsv')
    cluster_dt_cpk_fpath = '%s%s' % (clustering_path, 'allclusters.cpk')
    external_inputs = [blast_fpath, roary_fpath, orthofinder_fpath, other_tool_fpath]

    if any(i != 'none' for i in external_inputs):
        ## locus_tag -> geneID lookup for external cluster files
        geneID_to_geneSeqID_dict = load_pickle('%sgeneID_to_geneSeqID.cpk' % path)
        locus_tag_to_geneID_dict = defaultdict(list)
        for geneID in geneID_to_geneSeqID_dict.keys():
            locus_tag = geneID.split('|')[1]
            locus_tag_to_geneID_dict[locus_tag] = geneID

    ## using standard pipeline (no external input given)
    if all(i == 'none' for i in external_inputs):
        dmd_ref_file = 'reference.faa'
        ## prepare dmd_query_file (dmd_query_file is dmd_ref_file):
        ## concatenate all protein files without going through a shell
        with open(clustering_path + dmd_ref_file, 'wb') as reference_out:
            for faa_fpath in sorted(glob.glob(protein_path + '*faa')):
                with open(faa_fpath, 'rb') as faa_in:
                    shutil.copyfileobj(faa_in, reference_out)
        ## run diamond
        diamond_run(clustering_path, dmd_ref_file, threads,
                    diamond_evalue, diamond_max_target_seqs, diamond_identity,
                    diamond_query_cover, diamond_subject_cover, diamond_path)
        ## filtering hits via BS score
        filter_hits_single(clustering_path, threads)
        ## running mcl
        mcl_run(clustering_path, threads, mcl_inflation)
        ## clean up diamond_query_file
        for faa_fpath in glob.glob(clustering_path + '*faa'):
            os.remove(faa_fpath)
    elif blast_fpath != 'none':
        ## using user-given cluster file based on blast;
        ## shutil.copy raises if the file cannot be copied
        shutil.copy(blast_fpath, clustering_path + 'blastp.m8')
        ## filtering hits via BS score
        filter_hits_single(clustering_path, threads, input_prefix='blastp')
        ## running mcl
        mcl_run(clustering_path, threads, mcl_inflation, input_prefix='blastp')
    elif roary_fpath != 'none':
        ## using cluster files from roary
        roary_cluster_process(locus_tag_to_geneID_dict, roary_fpath, cluster_fpath)
    elif orthofinder_fpath != 'none':
        process_orthofinder(orthofinder_fpath, cluster_fpath)
    elif other_tool_fpath != 'none':
        shutil.copy(other_tool_fpath, cluster_fpath)
    cleanup_clustering(clustering_path)
    return parse_geneCluster(cluster_fpath, cluster_dt_cpk_fpath)
def estimate_core_gene_diversity(path, folders_dict, strain_list, parallel,
                                 core_cutoff, factor_core_diversity, species):
    """
    Estimate core gene diversity before gene cluster alignment and
    cluster post-processing.

    params:
        path: central data folder; 'geneID_to_geneSeqID.cpk' is read from it
            and the 'tmp_core' folder is moved into it at the end
        folders_dict: must provide 'clustering_path', 'protein_path' and
            'nucleotide_path'
        strain_list: list of strains; only its length is used here
        parallel: forwarded to multips
        core_cutoff: fraction of strains a cluster must cover to count as
            core/soft-core (1.0 means all strains)
        factor_core_diversity: scaling factor applied to the average diversity
        species: forwarded to calculate_diversity through multips
    returns:
        (calculated_core_diversity, refined_core_diversity), where
        refined = round((0.1 + f*d) / (1 + f*d), 4) with f the factor and
        d the calculated average diversity
    """
    totalStrain = len(strain_list)

    ## load clusters
    clustering_path = folders_dict['clustering_path']
    geneCluster_dt = load_pickle(clustering_path + 'allclusters.cpk')
    protein_path = folders_dict['protein_path']
    nucleotide_path = folders_dict['nucleotide_path']
    protein_dict_path = '%s%s' % (protein_path, 'all_protein_seq.cpk')
    nucleotide_dict_path = '%s%s' % (nucleotide_path, 'all_nucleotide_seq.cpk')
    tmp_core_seq_path = '%s%s' % (clustering_path, 'tmp_core/')

    ## load geneID_to_geneSeqID geneSeqID cpk file
    geneID_to_geneSeqID_dict = load_pickle(path + 'geneID_to_geneSeqID.cpk')

    ## minimal number of strains for a core/soft-core cluster
    ## (does not depend on the cluster, so it is computed once)
    if core_cutoff == 1.0:
        strain_core_cutoff = totalStrain
    else:
        strain_core_cutoff = int(totalStrain * core_cutoff)

    ## create core gene list
    ## geneCluster_dt: {clusterID:[ count_strains,[memb1,...],count_genes ]}
    ## keep clusters with #genes == #strains that reach the strain cutoff
    core_geneCluster_dt = {}
    for clusterID, cluster_stats in geneCluster_dt.iteritems():
        strain_count, gene_count = cluster_stats[0], cluster_stats[2]
        if strain_count == gene_count and strain_count >= strain_core_cutoff:
            core_geneCluster_dt[clusterID] = cluster_stats

    ## start from an empty temporary folder
    if os.path.exists(tmp_core_seq_path):
        os.system(''.join(['rm -rf ', tmp_core_seq_path]))
    os.system('mkdir %s' % tmp_core_seq_path)

    ## NOTE(review): the sequence pickles are only loaded here; the disabled
    ## (`if 0:`) code that rebuilt them from the per-strain FASTA files was
    ## unreachable and has been removed.
    gene_aa_dict = load_pickle(protein_dict_path)
    gene_na_dict = load_pickle(nucleotide_dict_path)

    ## write nucleotide and amino-acid sequences for each gene cluster
    export_cluster_seq_tmp(tmp_core_seq_path, core_geneCluster_dt,
                           geneID_to_geneSeqID_dict, gene_na_dict, gene_aa_dict)

    tmp_fa_files = glob.glob(tmp_core_seq_path + "*.fna")
    multips(calculate_diversity, parallel, tmp_fa_files, tmp_core_seq_path, species)

    calculated_core_diversity = tmp_average_core_diversity(tmp_core_seq_path)
    scaled_diversity = factor_core_diversity * calculated_core_diversity
    refined_core_diversity = round((0.1 + scaled_diversity) / (1 + scaled_diversity), 4)
    print('factor used: ' + str(factor_core_diversity))
    print('average core genome diversity: ' + str(calculated_core_diversity))
    print('defined core genome diversity cutoff for splitting long branches: ' +
          str(refined_core_diversity))

    ## move folder tmp_core to the central data folder
    new_clustering_path = '%stmp_core' % path
    if os.path.exists(new_clustering_path):
        os.system(''.join(['rm -r ', new_clustering_path]))
    os.system('mv %s %s' % (tmp_core_seq_path, path))
    return calculated_core_diversity, refined_core_diversity
def postprocess_split_long_branch(parallel, path, simple_tree, cut_branch_threshold=0.3):
    """
    Split tree via breaking up long branches.
    Remote homology leads to over-clustering. This yields tree with long branches.

    params:
        parallel: forwarded to multips
        path: central data folder (must end with a path separator); the trees
            are read from <path>geneCluster/
        simple_tree: forwarded to align_and_makeTree
        cut_branch_threshold: forwarded to cutTree_outputCluster
    returns:
        None. If cutTree_outputCluster produced new_clusters_longSplit.txt,
        the new clusters are aligned and the cluster bookkeeping is updated;
        otherwise allclusters.cpk is copied to allclusters_postprocessed.cpk.

    File operations use os/shutil instead of shell commands so that paths
    containing spaces or shell metacharacters work and failures raise
    instead of being silently ignored.
    """
    import shutil  # local import: only this function needs it

    file_path = path + 'geneCluster/'
    new_split_folder = file_path + 'update_long_branch_splits/'
    deleted_clusters_folder = file_path + 'deleted_clusters_longSplit/'
    ## recreate both working folders (drop leftovers from a previous run)
    for folder in (new_split_folder, deleted_clusters_folder):
        if os.path.exists(folder):
            shutil.rmtree(folder)
        os.mkdir(folder)

    ## load clusters
    cluster_path = path + 'protein_faa/diamond_matches/'
    geneCluster_dt = load_pickle(cluster_path + 'allclusters.cpk')

    ## gather all trees generated before postprocessing
    tree_fname_list = glob.glob(file_path + '*nwk')

    ## ensure that writing to new_clusters_longSplit starts at the beginning (for re-running)
    new_clusters_fpath = file_path + 'new_clusters_longSplit.txt'
    old_clusters_fpath = file_path + 'old_clusters_longSplit.txt'
    for stale_fpath in (new_clusters_fpath, old_clusters_fpath):
        if os.path.exists(stale_fpath):
            os.remove(stale_fpath)

    # =============================================
    # parallelization:
    # "post-clustering workflow for splitting trees on over-clustered records"
    treefile_used = True
    multips(cutTree_outputCluster, parallel, tree_fname_list, file_path,
            cut_branch_threshold, treefile_used)

    if not os.path.exists(new_clusters_fpath):
        # no clusters postprocessed
        shutil.copy(cluster_path + 'allclusters.cpk',
                    cluster_path + 'allclusters_postprocessed.cpk')
        return

    ## gather new clusters from new_clusters_longSplit.txt (over_split records)
    with open(new_clusters_fpath, 'rb') as new_clusters_longSplit:
        new_fa_files_list = [clus.rstrip() for clus in new_clusters_longSplit]
    print('#times of splitting long branches: %s' % (len(new_fa_files_list) - 1))

    with open(old_clusters_fpath, 'rb') as delete_cluster_file:
        deleted_file_count = sum(1 for _ in delete_cluster_file)
    print('#clusters split during the checking of long branches: %s' % deleted_file_count)

    ## parallelization of "align and make tree on new cluster"
    multips(align_and_makeTree, parallel, new_fa_files_list, file_path, simple_tree)
    # =============================================

    ## delete original clusters which are split
    delete_original_clusters(file_path, geneCluster_dt)
    ## add newly split clusters
    update_geneCluster_dt(path, geneCluster_dt)
    ## write updated gene clusters in cpk file
    update_geneCluster_cpk(path, geneCluster_dt)
    ## write gene_diversity_Dt cpk file
    update_diversity_cpk(path)

    ## keep the bookkeeping files under their final names
    shutil.move(new_clusters_fpath, file_path + 'added_clusters_split_long.txt')
    shutil.move(old_clusters_fpath, file_path + 'deleted_clusters_split_long.txt')
def geneCluster_to_json(path, enable_RNA_clustering, store_locus_tag,
                        raw_locus_tag, optional_table_column):
    """
    Create the json file for the gene cluster table visualization.

    input:
        path: central data folder (must end with a path separator); reads
            the precomputed .cpk files below it and the per-cluster .fna
            files in <path>geneCluster/
        enable_RNA_clustering: if true, RNA descriptions are merged into the
            gene descriptions
        store_locus_tag: if true, locus tags are written to
            <path>search_locus_tag.tsv and left empty in the json
        raw_locus_tag: if true, only the locus tag part of each gene ID is
            listed; otherwise the full gene ID with '|' replaced by '_'
        optional_table_column: if truthy, extra fields from
            optional_geneCluster_properties are appended
    output:
        <path>vis/geneCluster.json (one object per cluster, written by hand
        as a json array); returns None

    Both output files are closed even if loading or processing fails.
    """
    # define paths
    geneCluster_path = '%s%s' % (path, 'geneCluster/')
    output_path = '%s%s' % (path, 'vis/')

    # open files
    geneClusterJSON_outfile = open(output_path + 'geneCluster.json', 'wb')
    locus_tag_outfile = None
    try:
        ## store locus_tags in a separate file for large dataset
        if store_locus_tag:
            locus_tag_outfile = open(path + 'search_locus_tag.tsv', 'wb')

        ### load precomputed annotations, diversity, associations etc
        geneID_to_descriptions = load_pickle(path + 'geneID_to_description.cpk')
        if enable_RNA_clustering:
            geneID_to_descriptions.update(load_pickle(path + 'RNAID_to_description.cpk'))

        gene_diversity_Dt = load_pickle(geneCluster_path + 'gene_diversity.cpk')
        ## gain/loss event counts, indexed by position in the sorted cluster list
        dt_geneEvents = load_pickle(geneCluster_path + 'dt_geneEvents.cpk')

        ## associations are optional: missing files mean "no associations"
        branch_associations_path = path + 'branch_association.cpk'
        if os.path.isfile(branch_associations_path):
            branch_associations = load_pickle(branch_associations_path)
        else:
            branch_associations = {}
        presence_absence_associations_path = path + 'presence_absence_association.cpk'
        if os.path.isfile(presence_absence_associations_path):
            presence_absence_associations = load_pickle(presence_absence_associations_path)
        else:
            presence_absence_associations = {}

        ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
        sorted_genelist = load_sorted_clusters(path)

        geneClusterJSON_outfile.write('[')
        for gid, (clusterID, gene) in enumerate(sorted_genelist):
            strain_count, gene_list, gene_count = gene
            if gid != 0:
                ## separator between consecutive cluster objects
                geneClusterJSON_outfile.write(',\n')

            ## annotation majority
            allAnn, majority_annotation = consolidate_annotation(
                path, gene_list, geneID_to_descriptions)
            ## geneName majority
            all_geneName, majority_geneName = consolidate_geneName(
                path, gene_list, geneID_to_descriptions)

            ## extract gain/loss event count
            gene_event = dt_geneEvents[gid]

            ## average length
            seqs = read_fasta(geneCluster_path + '%s.fna' % clusterID).values()
            geneClusterLength = int(np.mean([len(igene) for igene in seqs]))

            ## msa: the cluster ID itself is used as alignment name
            geneCluster_aln = clusterID

            ## check for duplicates
            if gene_count > strain_count:
                duplicated_state = 'yes'
                dup_list = [ig.split('|')[0] for ig in gene_list]
                # "#" delimits strain/count, "@" separates entries;
                # only strains occurring more than once are listed
                dup_detail = '@'.join(
                    '%s#%s' % (kd, vd)
                    for kd, vd in Counter(dup_list).iteritems() if vd > 1)
            else:
                duplicated_state = 'no'
                dup_detail = ''

            ## locus_tag
            if raw_locus_tag:
                # locus tag is field [1] of the '|'-separated gene ID
                all_locus_tags = ' '.join([igl.split('|')[1] for igl in gene_list])
            else:
                # in addition to locus tag, keep strain name (but replace '|')
                all_locus_tags = ' '.join([igl.replace('|', '_') for igl in gene_list])

            ## optionally store locus tags to file, remove from geneClusterJSON
            if store_locus_tag:
                locus_tag_outfile.write('%s\t%s\n' % (clusterID, all_locus_tags))
                all_locus_tags = ''

            ## default cluster json fields
            ## NOTE(review): values are concatenated without json escaping, and
            ## gene_diversity_Dt[clusterID] must already be a string
            cluster_json_line = [
                '"geneId":' + str(gid + 1),
                '"geneLen":' + str(geneClusterLength),
                '"count":' + str(strain_count),
                '"dupli":"' + duplicated_state + '"',
                '"dup_detail":"' + dup_detail + '"',
                '"ann":"' + majority_annotation + '"',
                '"msa":"' + geneCluster_aln + '"',
                '"divers":"' + gene_diversity_Dt[clusterID] + '"',
                '"event":"' + str(gene_event) + '"',
                '"allAnn":"' + allAnn + '"',
                '"GName":"' + majority_geneName + '"',
                '"allGName":"' + all_geneName + '"',
                '"locus":"' + all_locus_tags + '"',
            ]
            if optional_table_column:
                cluster_json_line.extend(
                    optional_geneCluster_properties(gene_list, optional_table_column))
            if clusterID in branch_associations:
                cluster_json_line.extend(
                    geneCluster_associations(branch_associations[clusterID], suffix='BA'))
            if clusterID in presence_absence_associations:
                cluster_json_line.extend(
                    geneCluster_associations(
                        presence_absence_associations[clusterID], suffix='PA'))

            # write file
            geneClusterJSON_outfile.write('{' + ','.join(cluster_json_line) + '}')

        geneClusterJSON_outfile.write(']')
    finally:
        # close files, also when an error interrupted the export
        geneClusterJSON_outfile.close()
        if locus_tag_outfile is not None:
            locus_tag_outfile.close()
def clustering_protein(path, folders_dict, threads,
                       blast_fpath, roary_fpath, orthofinder_fpath, other_tool_fpath,
                       diamond_evalue, diamond_max_target_seqs, diamond_identity,
                       diamond_query_cover, diamond_subject_cover, diamond_path,
                       mcl_inflation):
    '''
    Produce the gene-cluster table (allclusters.tsv) and return the parsed clusters.

    Procedure: all-against-all protein comparison + hits filtering + mcl clustering
    By default (all *_fpath arguments equal the string 'none'): DIAMOND -> BS -> MCL
    Alternatives, tried in this order:
        1. Blastp output (user-provided) -> BS -> MCL
        2. Roary
        3. OrthoFinder
        4. Other tools (the given cluster file is copied as is)
    params:
        path: path to directory including data and output
        folders_dict: must provide 'protein_path' and 'clustering_path'
        threads: number of parallel threads (converted to str before being
            handed to the helpers)
        blast_fpath: user-provided all-vs-all blast output, or 'none'
        roary_fpath: gene clusters by roary, or 'none'
        orthofinder_fpath: gene clusters by OrthoFinder, or 'none'
        other_tool_fpath: cluster file from any other tool, or 'none'
        diamond_evalue, diamond_identity, diamond_query_cover,
        diamond_subject_cover, diamond_path: forwarded unchanged to diamond_run
        diamond_max_target_seqs: Diamond setting: the maximum number of target
            sequences per query to keep alignments for.
            (original note: default 600; #strain * #max_duplication = 50*10 = 500)
        mcl_inflation: forwarded unchanged to mcl_run
    returns:
        whatever parse_geneCluster(allclusters.tsv, allclusters.cpk) returns
    '''
    threads = str(threads)
    protein_path = folders_dict['protein_path']
    clustering_path = folders_dict['clustering_path']
    cluster_fpath = '%sallclusters.tsv' % clustering_path
    cluster_dt_cpk_fpath = '%sallclusters.cpk' % clustering_path

    external_inputs = [blast_fpath, roary_fpath, orthofinder_fpath, other_tool_fpath]
    use_standard_pipeline = all(fpath == 'none' for fpath in external_inputs)

    if use_standard_pipeline:
        ## standard pipeline: the diamond query file is the reference file itself
        dmd_ref_file = 'reference.faa'
        os.system('cat %s*faa > %s%s' % (protein_path, clustering_path, dmd_ref_file))
        ## run diamond
        diamond_run(clustering_path, dmd_ref_file, threads,
                    diamond_evalue, diamond_max_target_seqs, diamond_identity,
                    diamond_query_cover, diamond_subject_cover, diamond_path)
        ## filtering hits via BS score
        filter_hits_single(clustering_path, threads)
        ## running mcl
        mcl_run(clustering_path, threads, mcl_inflation)
        ## clean up diamond_query_file
        os.system('rm %s*faa' % clustering_path)
    else:
        ## externally provided clustering: map each locus tag to its geneID
        ## (geneID format: strain|locusTag; a repeated locus tag keeps the last geneID)
        geneID_to_geneSeqID_dict = load_pickle('%sgeneID_to_geneSeqID.cpk' % path)
        locus_tag_to_geneID_dict = defaultdict(list)
        for geneID in geneID_to_geneSeqID_dict:
            locus_tag_to_geneID_dict[geneID.split('|')[1]] = geneID

        if blast_fpath != 'none':
            ## user-given blast hits take the place of the diamond output
            os.system('cp %s %sblastp.m8' % (blast_fpath, clustering_path))
            ## filtering hits via BS score
            filter_hits_single(clustering_path, threads, input_prefix='blastp')
            ## running mcl
            mcl_run(clustering_path, threads, mcl_inflation, input_prefix='blastp')
        elif roary_fpath != 'none':
            ## using cluster files from roary
            roary_cluster_process(locus_tag_to_geneID_dict, roary_fpath, cluster_fpath)
        elif orthofinder_fpath != 'none':
            process_orthofinder(orthofinder_fpath, cluster_fpath)
        elif other_tool_fpath != 'none':
            os.system('cp %s %s' % (other_tool_fpath, cluster_fpath))

    cleanup_clustering(clustering_path)
    return parse_geneCluster(cluster_fpath, cluster_dt_cpk_fpath)
def postprocess_unclustered_genes(parallel, path, nstrains, simple_tree,
                                  split_long_branch_cutoff, window_size_smoothed=5,
                                  strain_proportion=0.3, sigma_scale=3):
    """
    Merge unclustered genes found at peaks of the cluster-length distribution
    and re-split the merged clusters by cutting long tree branches.

    Outline of the approach (most steps happen inside the helpers called here):
        1) detect suspicious peaks in the distribution of average length of genes
           in gene cluster (aa count)
           np.bincount([1,2,3,34,3]) -> how often each entry is found
           np.convolve([1,1,1,1,1], gene_length_count)
           -> unclustered genes will contribute many small clusters (size 1)
              that result in peaks in the distribution
        2) for each peak detected, align the sequences of all genes in clusters in peak
        3) to cluster aligned genes, build tree. However, to ensure long branches
           between unaligned sub-alignments, gaps could be filled with random
           sequence (skipped, not tested); importantly, this random sequence
           needs to be the same in different columns of the alignment.
           - random sequence a la
             rseq = [alpha[ii] for ii in np.random.randint(len(alpha), size=aln.get_aligmentlength())]
           - for seq in aln: seq[seq=='-'] = rseq[seq=='-']
        4) make and split tree at branches > 0.5
        5) for each subtree (ideally only one big tree), define new gene cluster
           and run maketree_align from standard step 6

    params:
        parallel: forwarded to cut_all_trees_from_merged_clusters
        path: central data folder (must end with a path separator)
        nstrains, window_size_smoothed, strain_proportion, sigma_scale:
            forwarded to find_and_merge_unclustered_genes
        simple_tree: forwarded to cut_all_trees_from_merged_clusters
        split_long_branch_cutoff: branch-length threshold used when cutting
            the trees of merged clusters
    returns:
        None; cluster bookkeeping is only updated when
        find_and_merge_unclustered_genes reports merged clusters.

    File operations use os/shutil instead of shell commands so that paths
    containing spaces or shell metacharacters work and failures raise
    instead of being silently ignored.
    """
    import shutil  # local import: only this function needs it

    geneCluster_fasta_path = path + 'geneCluster/'
    new_split_folder = geneCluster_fasta_path + 'update_long_branch_splits/'
    deleted_clusters_folder = geneCluster_fasta_path + 'deleted_clusters_peaks_splits/'
    ## recreate both working folders (drop leftovers from a previous run)
    for folder in (new_split_folder, deleted_clusters_folder):
        if os.path.exists(folder):
            shutil.rmtree(folder)
        os.mkdir(folder)

    ## load clusters
    ClusterPath = path + 'protein_faa/diamond_matches/'
    geneCluster_dt = load_pickle(ClusterPath + 'allclusters_postprocessed.cpk')

    ## merge unclustered genes
    merged_clusters_dict = find_and_merge_unclustered_genes(
        path, nstrains, window_size_smoothed, strain_proportion, sigma_scale)
    if len(merged_clusters_dict) == 0:
        ## no merged clusters corresponding to cluster peaks: nothing to update
        return

    ## ensure that writing to new_clusters_longSplit starts at the beginning (for re-running)
    for stale_name in ('new_clusters_longSplit.txt', 'old_clusters_longSplit.txt'):
        stale_fpath = geneCluster_fasta_path + stale_name
        if os.path.exists(stale_fpath):
            os.remove(stale_fpath)

    ## cut tree and make new clusters
    cut_branch_threshold = split_long_branch_cutoff
    cut_all_trees_from_merged_clusters(parallel, path, cut_branch_threshold, simple_tree)

    ## delete old clusters
    delete_old_merged_clusters(geneCluster_fasta_path, geneCluster_dt, merged_clusters_dict)
    ## add newly split clusters
    update_geneCluster_dt(path, geneCluster_dt)
    ## write updated gene clusters in cpk file
    update_geneCluster_cpk(path, geneCluster_dt)
    ## write gene_diversity_Dt cpk file
    update_diversity_cpk(path)
def postprocess_unclustered_genes(parallel, path, nstrains, simple_tree,
                                  split_long_branch_cutoff, window_size_smoothed=5,
                                  strain_proportion=0.3, sigma_scale=3):
    """
    Merge unclustered genes found at peaks of the cluster-length distribution
    and re-split the merged clusters by cutting long tree branches.

    Outline of the approach (most steps happen inside the helpers called here):
        1) detect suspicious peaks in the distribution of average length of genes
           in gene cluster (aa count)
           np.bincount([1,2,3,34,3]) -> how often each entry is found
           np.convolve([1,1,1,1,1], gene_length_count)
           -> unclustered genes will contribute many small clusters (size 1)
              that result in peaks in the distribution
        2) for each peak detected, align the sequences of all genes in clusters in peak
        3) to cluster aligned genes, build tree. However, to ensure long branches
           between unaligned sub-alignments, gaps could be filled with random
           sequence (skipped, not tested); importantly, this random sequence
           needs to be the same in different columns of the alignment.
           - random sequence a la
             rseq = [alpha[ii] for ii in np.random.randint(len(alpha), size=aln.get_aligmentlength())]
           - for seq in aln: seq[seq=='-'] = rseq[seq=='-']
        4) make and split tree at branches > 0.5
        5) for each subtree (ideally only one big tree), define new gene cluster
           and run maketree_align from standard step 6

    params:
        parallel: forwarded to cut_all_trees_from_merged_clusters
        path: central data folder (must end with a path separator)
        nstrains, window_size_smoothed, strain_proportion, sigma_scale:
            forwarded to find_and_merge_unclustered_genes
        simple_tree: forwarded to cut_all_trees_from_merged_clusters
        split_long_branch_cutoff: branch-length threshold used when cutting
            the trees of merged clusters
    returns:
        None; cluster bookkeeping is only updated when
        find_and_merge_unclustered_genes reports merged clusters.

    File operations use os/shutil instead of shell commands so that paths
    containing spaces or shell metacharacters work and failures raise
    instead of being silently ignored.
    """
    import shutil  # local import: only this function needs it

    geneCluster_fasta_path = path + 'geneCluster/'
    new_split_folder = geneCluster_fasta_path + 'update_long_branch_splits/'
    deleted_clusters_folder = geneCluster_fasta_path + 'deleted_clusters_peaks_splits/'
    ## recreate both working folders (drop leftovers from a previous run)
    for folder in (new_split_folder, deleted_clusters_folder):
        if os.path.exists(folder):
            shutil.rmtree(folder)
        os.mkdir(folder)

    ## load clusters
    ClusterPath = path + 'protein_faa/diamond_matches/'
    geneCluster_dt = load_pickle(ClusterPath + 'allclusters_postprocessed.cpk')

    ## merge unclustered genes
    merged_clusters_dict = find_and_merge_unclustered_genes(
        path, nstrains, window_size_smoothed, strain_proportion, sigma_scale)
    if len(merged_clusters_dict) == 0:
        ## no merged clusters corresponding to cluster peaks: nothing to update
        return

    ## ensure that writing to new_clusters_longSplit starts at the beginning (for re-running)
    for stale_name in ('new_clusters_longSplit.txt', 'old_clusters_longSplit.txt'):
        stale_fpath = geneCluster_fasta_path + stale_name
        if os.path.exists(stale_fpath):
            os.remove(stale_fpath)

    ## cut tree and make new clusters
    cut_branch_threshold = split_long_branch_cutoff
    cut_all_trees_from_merged_clusters(parallel, path, cut_branch_threshold, simple_tree)

    ## delete old clusters
    delete_old_merged_clusters(geneCluster_fasta_path, geneCluster_dt, merged_clusters_dict)
    ## add newly split clusters
    update_geneCluster_dt(path, geneCluster_dt)
    ## write updated gene clusters in cpk file
    update_geneCluster_cpk(path, geneCluster_dt)
    ## write gene_diversity_Dt cpk file
    update_diversity_cpk(path)
def postprocess_split_long_branch(parallel, path, simple_tree, cut_branch_threshold=0.3):
    """
    Split tree via breaking up long branches.
    Remote homology leads to over-clustering. This yields tree with long branches.

    params:
        parallel: forwarded to multips
        path: central data folder (must end with a path separator); the trees
            are read from <path>geneCluster/
        simple_tree: forwarded to align_and_makeTree
        cut_branch_threshold: forwarded to cutTree_outputCluster
    returns:
        None. If cutTree_outputCluster produced new_clusters_longSplit.txt,
        the new clusters are aligned and the cluster bookkeeping is updated;
        otherwise allclusters.cpk is copied to allclusters_postprocessed.cpk.

    File operations use os/shutil instead of shell commands so that paths
    containing spaces or shell metacharacters work and failures raise
    instead of being silently ignored.
    """
    import shutil  # local import: only this function needs it

    file_path = path + 'geneCluster/'
    new_split_folder = file_path + 'update_long_branch_splits/'
    deleted_clusters_folder = file_path + 'deleted_clusters_longSplit/'
    ## recreate both working folders (drop leftovers from a previous run)
    for folder in (new_split_folder, deleted_clusters_folder):
        if os.path.exists(folder):
            shutil.rmtree(folder)
        os.mkdir(folder)

    ## load clusters
    cluster_path = path + 'protein_faa/diamond_matches/'
    geneCluster_dt = load_pickle(cluster_path + 'allclusters.cpk')

    ## gather all trees generated before postprocessing
    tree_fname_list = glob.glob(file_path + '*nwk')

    ## ensure that writing to new_clusters_longSplit starts at the beginning (for re-running)
    new_clusters_fpath = file_path + 'new_clusters_longSplit.txt'
    old_clusters_fpath = file_path + 'old_clusters_longSplit.txt'
    for stale_fpath in (new_clusters_fpath, old_clusters_fpath):
        if os.path.exists(stale_fpath):
            os.remove(stale_fpath)

    # =============================================
    # parallelization:
    # "post-clustering workflow for splitting trees on over-clustered records"
    treefile_used = True
    multips(cutTree_outputCluster, parallel, tree_fname_list, file_path,
            cut_branch_threshold, treefile_used)

    if not os.path.exists(new_clusters_fpath):
        # no clusters postprocessed
        shutil.copy(cluster_path + 'allclusters.cpk',
                    cluster_path + 'allclusters_postprocessed.cpk')
        return

    ## gather new clusters from new_clusters_longSplit.txt (over_split records)
    with open(new_clusters_fpath, 'rb') as new_clusters_longSplit:
        new_fa_files_list = [clus.rstrip() for clus in new_clusters_longSplit]
    print('#times of splitting long branches: %s' % (len(new_fa_files_list) - 1))

    with open(old_clusters_fpath, 'rb') as delete_cluster_file:
        deleted_file_count = sum(1 for _ in delete_cluster_file)
    print('#clusters split during the checking of long branches: %s' % deleted_file_count)

    ## parallelization of "align and make tree on new cluster"
    multips(align_and_makeTree, parallel, new_fa_files_list, file_path, simple_tree)
    # =============================================

    ## delete original clusters which are split
    delete_original_clusters(file_path, geneCluster_dt)
    ## add newly split clusters
    update_geneCluster_dt(path, geneCluster_dt)
    ## write updated gene clusters in cpk file
    update_geneCluster_cpk(path, geneCluster_dt)
    ## write gene_diversity_Dt cpk file
    update_diversity_cpk(path)

    ## keep the bookkeeping files under their final names
    shutil.move(new_clusters_fpath, file_path + 'added_clusters_split_long.txt')
    shutil.move(old_clusters_fpath, file_path + 'deleted_clusters_split_long.txt')
def estimate_core_gene_diversity(path, folders_dict, strain_list, parallel,
                                 core_cutoff, factor_core_diversity, species):
    """
    Estimate core gene diversity before gene cluster alignment and
    cluster post-processing.

    params:
        path: central data folder; 'geneID_to_geneSeqID.cpk' is read from it
            and the 'tmp_core' folder is moved into it at the end
        folders_dict: must provide 'clustering_path', 'protein_path' and
            'nucleotide_path'
        strain_list: list of strains; only its length is used here
        parallel: forwarded to multips
        core_cutoff: fraction of strains a cluster must cover to count as
            core/soft-core (1.0 means all strains)
        factor_core_diversity: scaling factor applied to the average diversity
        species: forwarded to calculate_diversity through multips
    returns:
        (calculated_core_diversity, refined_core_diversity), where
        refined = round((0.1 + f*d) / (1 + f*d), 4) with f the factor and
        d the calculated average diversity
    """
    totalStrain = len(strain_list)

    ## load clusters
    clustering_path = folders_dict['clustering_path']
    geneCluster_dt = load_pickle(clustering_path + 'allclusters.cpk')
    protein_path = folders_dict['protein_path']
    nucleotide_path = folders_dict['nucleotide_path']
    protein_dict_path = '%s%s' % (protein_path, 'all_protein_seq.cpk')
    nucleotide_dict_path = '%s%s' % (nucleotide_path, 'all_nucleotide_seq.cpk')
    tmp_core_seq_path = '%s%s' % (clustering_path, 'tmp_core/')

    ## load geneID_to_geneSeqID geneSeqID cpk file
    geneID_to_geneSeqID_dict = load_pickle(path + 'geneID_to_geneSeqID.cpk')

    ## minimal number of strains for a core/soft-core cluster
    ## (does not depend on the cluster, so it is computed once)
    if core_cutoff == 1.0:
        strain_core_cutoff = totalStrain
    else:
        strain_core_cutoff = int(totalStrain * core_cutoff)

    ## create core gene list
    ## geneCluster_dt: {clusterID:[ count_strains,[memb1,...],count_genes ]}
    ## keep clusters with #genes == #strains that reach the strain cutoff
    core_geneCluster_dt = {}
    for clusterID, cluster_stats in geneCluster_dt.iteritems():
        strain_count, gene_count = cluster_stats[0], cluster_stats[2]
        if strain_count == gene_count and strain_count >= strain_core_cutoff:
            core_geneCluster_dt[clusterID] = cluster_stats

    ## start from an empty temporary folder
    if os.path.exists(tmp_core_seq_path):
        os.system(''.join(['rm -rf ', tmp_core_seq_path]))
    os.system('mkdir %s' % tmp_core_seq_path)

    ## NOTE(review): the sequence pickles are only loaded here; the disabled
    ## (`if 0:`) code that rebuilt them from the per-strain FASTA files was
    ## unreachable and has been removed.
    gene_aa_dict = load_pickle(protein_dict_path)
    gene_na_dict = load_pickle(nucleotide_dict_path)

    ## write nucleotide and amino-acid sequences for each gene cluster
    export_cluster_seq_tmp(tmp_core_seq_path, core_geneCluster_dt,
                           geneID_to_geneSeqID_dict, gene_na_dict, gene_aa_dict)

    tmp_fa_files = glob.glob(tmp_core_seq_path + "*.fna")
    multips(calculate_diversity, parallel, tmp_fa_files, tmp_core_seq_path, species)

    calculated_core_diversity = tmp_average_core_diversity(tmp_core_seq_path)
    scaled_diversity = factor_core_diversity * calculated_core_diversity
    refined_core_diversity = round((0.1 + scaled_diversity) / (1 + scaled_diversity), 4)
    print('factor used: ' + str(factor_core_diversity))
    print('average core genome diversity: ' + str(calculated_core_diversity))
    print('defined core genome diversity cutoff for splitting long branches: ' +
          str(refined_core_diversity))

    ## move folder tmp_core to the central data folder
    new_clustering_path = '%stmp_core' % path
    if os.path.exists(new_clustering_path):
        os.system(''.join(['rm -r ', new_clustering_path]))
    os.system('mv %s %s' % (tmp_core_seq_path, path))
    return calculated_core_diversity, refined_core_diversity
def geneCluster_to_json(path, enable_RNA_clustering, store_locus_tag,
                        raw_locus_tag, optional_table_column):
    """
    Create the json file for the gene cluster table visualization.

    input:
        path: central data folder (must end with a path separator); reads
            the precomputed .cpk files below it and the per-cluster .fna
            files in <path>geneCluster/
        enable_RNA_clustering: if true, RNA descriptions are merged into the
            gene descriptions
        store_locus_tag: if true, locus tags are written to
            <path>search_locus_tag.tsv and left empty in the json
        raw_locus_tag: if true, only the locus tag part of each gene ID is
            listed; otherwise the full gene ID with '|' replaced by '_'
        optional_table_column: if truthy, extra fields from
            optional_geneCluster_properties are appended
    output:
        <path>vis/geneCluster.json (one object per cluster, written by hand
        as a json array); returns None

    Both output files are closed even if loading or processing fails.
    """
    # define paths
    geneCluster_path = '%s%s' % (path, 'geneCluster/')
    output_path = '%s%s' % (path, 'vis/')

    # open files
    geneClusterJSON_outfile = open(output_path + 'geneCluster.json', 'wb')
    locus_tag_outfile = None
    try:
        ## store locus_tags in a separate file for large dataset
        if store_locus_tag:
            locus_tag_outfile = open(path + 'search_locus_tag.tsv', 'wb')

        ### load precomputed annotations, diversity, associations etc
        geneID_to_descriptions = load_pickle(path + 'geneID_to_description.cpk')
        if enable_RNA_clustering:
            geneID_to_descriptions.update(load_pickle(path + 'RNAID_to_description.cpk'))

        gene_diversity_Dt = load_pickle(geneCluster_path + 'gene_diversity.cpk')
        ## gain/loss event counts, indexed by position in the sorted cluster list
        dt_geneEvents = load_pickle(geneCluster_path + 'dt_geneEvents.cpk')

        ## associations are optional: missing files mean "no associations"
        branch_associations_path = path + 'branch_association.cpk'
        if os.path.isfile(branch_associations_path):
            branch_associations = load_pickle(branch_associations_path)
        else:
            branch_associations = {}
        presence_absence_associations_path = path + 'presence_absence_association.cpk'
        if os.path.isfile(presence_absence_associations_path):
            presence_absence_associations = load_pickle(presence_absence_associations_path)
        else:
            presence_absence_associations = {}

        ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
        sorted_genelist = load_sorted_clusters(path)

        geneClusterJSON_outfile.write('[')
        for gid, (clusterID, gene) in enumerate(sorted_genelist):
            strain_count, gene_list, gene_count = gene
            if gid != 0:
                ## separator between consecutive cluster objects
                geneClusterJSON_outfile.write(',\n')

            ## annotation majority
            allAnn, majority_annotation = consolidate_annotation(
                path, gene_list, geneID_to_descriptions)
            ## geneName majority
            all_geneName, majority_geneName = consolidate_geneName(
                path, gene_list, geneID_to_descriptions)

            ## extract gain/loss event count
            gene_event = dt_geneEvents[gid]

            ## average length
            seqs = read_fasta(geneCluster_path + '%s.fna' % clusterID).values()
            geneClusterLength = int(np.mean([len(igene) for igene in seqs]))

            ## msa: the cluster ID itself is used as alignment name
            geneCluster_aln = clusterID

            ## check for duplicates
            if gene_count > strain_count:
                duplicated_state = 'yes'
                dup_list = [ig.split('|')[0] for ig in gene_list]
                # "#" delimits strain/count, "@" separates entries;
                # only strains occurring more than once are listed
                dup_detail = '@'.join(
                    '%s#%s' % (kd, vd)
                    for kd, vd in Counter(dup_list).iteritems() if vd > 1)
            else:
                duplicated_state = 'no'
                dup_detail = ''

            ## locus_tag
            if raw_locus_tag:
                # locus tag is field [1] of the '|'-separated gene ID
                all_locus_tags = ' '.join([igl.split('|')[1] for igl in gene_list])
            else:
                # in addition to locus tag, keep strain name (but replace '|')
                all_locus_tags = ' '.join([igl.replace('|', '_') for igl in gene_list])

            ## optionally store locus tags to file, remove from geneClusterJSON
            if store_locus_tag:
                locus_tag_outfile.write('%s\t%s\n' % (clusterID, all_locus_tags))
                all_locus_tags = ''

            ## default cluster json fields
            ## NOTE(review): values are concatenated without json escaping, and
            ## gene_diversity_Dt[clusterID] must already be a string
            cluster_json_line = [
                '"geneId":' + str(gid + 1),
                '"geneLen":' + str(geneClusterLength),
                '"count":' + str(strain_count),
                '"dupli":"' + duplicated_state + '"',
                '"dup_detail":"' + dup_detail + '"',
                '"ann":"' + majority_annotation + '"',
                '"msa":"' + geneCluster_aln + '"',
                '"divers":"' + gene_diversity_Dt[clusterID] + '"',
                '"event":"' + str(gene_event) + '"',
                '"allAnn":"' + allAnn + '"',
                '"GName":"' + majority_geneName + '"',
                '"allGName":"' + all_geneName + '"',
                '"locus":"' + all_locus_tags + '"',
            ]
            if optional_table_column:
                cluster_json_line.extend(
                    optional_geneCluster_properties(gene_list, optional_table_column))
            if clusterID in branch_associations:
                cluster_json_line.extend(
                    geneCluster_associations(branch_associations[clusterID], suffix='BA'))
            if clusterID in presence_absence_associations:
                cluster_json_line.extend(
                    geneCluster_associations(
                        presence_absence_associations[clusterID], suffix='PA'))

            # write file
            geneClusterJSON_outfile.write('{' + ','.join(cluster_json_line) + '}')

        geneClusterJSON_outfile.write(']')
    finally:
        # close files, also when an error interrupted the export
        geneClusterJSON_outfile.close()
        if locus_tag_outfile is not None:
            locus_tag_outfile.close()