def build_representative_cluster(clustering_path, threads, input_prefix):
    """ build representative cluster

    Reads clustering_path + input_prefix + '_cluster.output' (one cluster
    per line, gene IDs separated by tabs), gives every line a cluster ID and
    appends the sequence of the first gene of that line to
    clustering_path + input_prefix + '_representative.faa' under the new ID.
    The dict {clusterID: [gene IDs]} is pickled to
    clustering_path + input_prefix + '_dicts.cpk'.

    params:
        clustering_path: folder prefix; joined by plain string concatenation,
            so it has to end with the path separator
        threads: not used in this function (kept for the callers)
        input_prefix: sub-problem prefix; the text following 'subproblem_'
            becomes part of each cluster ID (IndexError if it is missing)
    """
    start = time.time()
    cluster_file = ''.join([clustering_path, input_prefix, '_cluster.output'])
    representative_outputfile = ''.join(
        [clustering_path, input_prefix, '_representative', '.faa'])
    subproblem_merged_faa = ''.join([clustering_path, input_prefix, '.faa'])
    subproblem_faa_dict = read_fasta(subproblem_merged_faa)

    with open(cluster_file, 'rb') as cluster_input:
        cluster_input_lines = list(cluster_input)

    subproblem_geneCluster_dt = {}
    subproblem_run_number = input_prefix.split('subproblem_')[1]
    ## the output is opened once (append mode) instead of once per cluster;
    ## nothing is opened when there are no clusters, as before
    if cluster_input_lines:
        with open(representative_outputfile, 'a') as representative_output:
            for gid, iline in enumerate(cluster_input_lines):
                ## use time to avoid clusterID conflict
                clusterID = "GCs%s_%07d%s" % (
                    subproblem_run_number, gid,
                    time.strftime('%M%S', time.gmtime()))
                gene_ids = iline.rstrip().split('\t')
                subproblem_geneCluster_dt[clusterID] = gene_ids
                ## the first gene of the line represents the cluster
                representative_seq = subproblem_faa_dict[gene_ids[0]]
                write_in_fa(representative_output, clusterID,
                            representative_seq)

    ## write subproblem_geneCluster_dt
    write_pickle(''.join([clustering_path, input_prefix, '_dicts.cpk']),
                 subproblem_geneCluster_dt)
    ## single-string form of the former multi-item print statement;
    ## the emitted text is unchanged
    print('build representative clusters for %s :  %s \n'
          % (input_prefix, times(start)))
def infer_presence_absence_associations(path, total_strains_count,
                                        min_strain_fraction_association,
                                        max_strain_fraction_association):
    """ score the association of gene presence/absence with metadata

    For every cluster that passes the size filter, the gain/loss pattern is
    loaded and an association score is computed for each metadata category
    whose description row has associate == 'yes'. The absolute scores
    (0.0 for infinite scores) are pickled as {clusterID: {category: score}}
    to path + '/presence_absence_association.cpk'.

    params:
        path: data folder prefix (joined by concatenation)
        total_strains_count: number of strains
        min_strain_fraction_association, max_strain_fraction_association:
            fractions of total_strains_count used as exclusive lower and
            upper bound of the size filter
    """
    from sf_geneCluster_align_makeTree import load_sorted_clusters
    ## imported by the original code although not used below
    from sf_coreTree_json import metadata_load

    meta_table_fpath = '%s%s' % (path, 'metainfo.tsv')
    meta_description_fpath = '%s%s' % (path, 'meta_tidy.tsv')
    association_dict = defaultdict(dict)
    metadata = Metadata(meta_table_fpath, meta_description_fpath)
    metadata_dict = metadata.to_dict()

    lower_bound = total_strains_count * min_strain_fraction_association
    upper_bound = total_strains_count * max_strain_fraction_association

    sorted_genelist = load_sorted_clusters(path)
    ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    # TODO fix vis
    tree = Phylo.read("%sgeneCluster/strain_tree.nwk" % (path), 'newick')
    assoc = PresenceAbsenceAssociation(tree, metadata_dict)

    for clusterID, gene in sorted_genelist:
        ## NOTE(review): gene[-1] is count_genes according to the layout
        ## above, while the bounds are derived from strain counts -- confirm
        if not (lower_bound < gene[-1] < upper_bound):
            continue
        print(clusterID)
        gl = load_gain_loss(path, clusterID)
        for col, d in metadata.data_description.iterrows():
            if d['associate'] != 'yes':
                continue
            use_log = 'log_scale' in d and d['log_scale'] == 'yes'
            t = (lambda x: np.log(x)) if use_log else (lambda x: x)
            assoc.set_gain_loss(gl)
            category = d["meta_category"]
            score = assoc.calc_association_simple(category, transform=t)
            ## infinite scores are stored as 0.0, the others as magnitude
            association_dict[clusterID][category] = (
                0.0 if np.isinf(score) else np.abs(score))

    write_pickle("%s/presence_absence_association.cpk" % path,
                 association_dict)
def build_representative_cluster(clustering_path, threads, input_prefix):
    """ build representative cluster

    Every line of clustering_path + input_prefix + '_cluster.output' is one
    cluster (tab-separated gene IDs). Each line gets a cluster ID, and the
    sequence of its first gene is appended under that ID to
    clustering_path + input_prefix + '_representative.faa'. The mapping
    {clusterID: [gene IDs]} is pickled to
    clustering_path + input_prefix + '_dicts.cpk'.

    params:
        clustering_path: folder prefix, concatenated as is (needs a trailing
            path separator)
        threads: unused here; kept so that callers do not have to change
        input_prefix: prefix of the sub-problem files; has to contain
            'subproblem_' (IndexError otherwise), the part behind it is
            embedded in the cluster IDs
    """
    start = time.time()
    cluster_file = ''.join([clustering_path, input_prefix, '_cluster.output'])
    representative_outputfile = ''.join(
        [clustering_path, input_prefix, '_representative', '.faa'])
    subproblem_merged_faa = ''.join([clustering_path, input_prefix, '.faa'])
    subproblem_faa_dict = read_fasta(subproblem_merged_faa)

    with open(cluster_file, 'rb') as cluster_input:
        cluster_input_lines = [iline for iline in cluster_input]

    subproblem_run_number = input_prefix.split('subproblem_')[1]
    subproblem_geneCluster_dt = {}
    if cluster_input_lines:
        ## one append-mode handle for all records instead of re-opening the
        ## file for every cluster; no file is touched without clusters
        with open(representative_outputfile, 'a') as representative_output:
            for gid, iline in enumerate(cluster_input_lines):
                ## use time to avoid clusterID conflict
                time_tag = time.strftime('%M%S', time.gmtime())
                clusterID = "GCs%s_%07d%s" % (subproblem_run_number, gid,
                                              time_tag)
                gene_ids = iline.rstrip().split('\t')
                subproblem_geneCluster_dt[clusterID] = gene_ids
                ## representative_seq: sequence of the first listed gene
                representative_seq = subproblem_faa_dict[gene_ids[0]]
                ## write in representative strain
                write_in_fa(representative_output, clusterID,
                            representative_seq)

    ## write subproblem_geneCluster_dt
    write_pickle(''.join([clustering_path, input_prefix, '_dicts.cpk']),
                 subproblem_geneCluster_dt)
    ## same text as the former multi-item print statement, as one string
    print('build representative clusters for %s :  %s \n'
          % (input_prefix, times(start)))
def infer_branch_associations(path, total_strains_count,
                              strain_fraction_branch_association):
    """ score the association of gene tree branches with metadata

    For every cluster that passes the size filter, the gene tree
    path + '/geneCluster/' + clusterID + '.nwk' is read, and for each
    metadata category whose description row has associate == 'yes' the
    value returned by BranchAssociation.calc_significance() is stored.
    The result {clusterID: {category: value}} is pickled to
    path + '/branch_association.cpk'.

    params:
        path: data folder prefix (joined by concatenation)
        total_strains_count: number of strains
        strain_fraction_branch_association: fraction of total_strains_count
            a cluster has to reach (inclusive) to be analysed
    """
    from sf_geneCluster_align_makeTree import load_sorted_clusters
    ## imported by the original code although not used below
    from sf_coreTree_json import metadata_load

    meta_table_fpath = '%s%s' % (path, 'metainfo.tsv')
    meta_description_fpath = '%s%s' % (path, 'meta_tidy.tsv')
    association_dict = defaultdict(dict)
    metadata = Metadata(meta_table_fpath, meta_description_fpath)
    metadata_dict = metadata.to_dict()

    sorted_genelist = load_sorted_clusters(path)
    ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    size_threshold = total_strains_count * strain_fraction_branch_association
    for clusterID, gene in sorted_genelist:
        ## gene[-1] is the last entry of the statistics list shown above
        if gene[-1] < size_threshold:
            continue
        print(clusterID)
        tree = Phylo.read("%s/geneCluster/%s.nwk" % (path, clusterID),
                          'newick')
        assoc = BranchAssociation(tree, metadata_dict)
        for col, d in metadata.data_description.iterrows():
            if d['associate'] != 'yes':
                continue
            use_log = 'log_scale' in d and d['log_scale'] == 'yes'
            t = (lambda x: np.log(x)) if use_log else (lambda x: x)
            category = d["meta_category"]
            assoc.calc_up_down_averages(category, transform=t)
            association_dict[clusterID][category] = assoc.calc_significance()

    write_pickle("%s/branch_association.cpk" % path, association_dict)
def update_gene_cluster_with_RNA(path, diamond_RNACluster_dt,
                                 diamond_geneCluster_dt):
    """ add the RNA clusters to the gene clusters and pickle the result

    diamond_geneCluster_dt is updated in place (the caller's dict gains the
    RNA entries; equal keys are overwritten) and then written to
    path + 'protein_faa/diamond_matches/allclusters_postprocessed.cpk'.
    """
    output_fpath = ''.join([path, 'protein_faa/diamond_matches/',
                            'allclusters_postprocessed.cpk'])
    diamond_geneCluster_dt.update(diamond_RNACluster_dt)
    write_pickle(output_fpath, diamond_geneCluster_dt)
def update_diversity_cpk(path):
    """ convert geneCluster/gene_diversity.txt into gene_diversity.cpk

    Each line of the text file is split at tabs; the first field becomes the
    key and the second field the value (kept as text, not converted to a
    number). Later lines overwrite earlier ones with the same first field.
    """
    output_path = path + 'geneCluster/'
    with open(output_path + 'gene_diversity.txt', 'rb') as infile:
        diversity_by_key = {}
        for line in infile:
            fields = line.rstrip().split('\t')
            diversity_by_key[fields[0]] = fields[1]
        write_pickle(output_path + 'gene_diversity.cpk', diversity_by_key)
def update_gene_cluster_with_RNA(path, diamond_RNACluster_dt,
                                 diamond_geneCluster_dt):
    """ merge RNA clusters into the gene cluster dict and store it

    The gene cluster dict is extended in place with the RNA cluster dict
    (RNA entries win on equal keys) and pickled as
    allclusters_postprocessed.cpk inside
    path + 'protein_faa/diamond_matches/'.
    """
    target_fpath = '%s%s%s' % (path, 'protein_faa/diamond_matches/',
                               'allclusters_postprocessed.cpk')
    ## in-place update: visible to the caller as well
    diamond_geneCluster_dt.update(diamond_RNACluster_dt)
    write_pickle(target_fpath, diamond_geneCluster_dt)
def make_genepresence_alignment(path, disable_gain_loss,
                                merged_gain_loss_output):
    '''
    loop over all gene clusters and append 0/1 to strain specific
    string used as pseudo alignment of gene presence absence

    Always written (in path + 'geneCluster/'): genePresence.aln and
    dt_genePresence.cpk. Only when disable_gain_loss is true:
    dt_geneEvents.cpk with 0 events for every cluster index, plus either
    geneGainLossEvent.json (merged_gain_loss_output true) or one
    clusterID + '_patterns.json' file per cluster.
    '''
    output_path = '%s%s' % (path, 'geneCluster/')

    ## load strain list and prepare for gene presence/absence
    strain_list = load_pickle('%s%s' % (path, 'strain_list.cpk'))
    set_totalStrain = set(strain_list)
    totalStrain = len(set_totalStrain)
    dt_strainGene = defaultdict(str)

    sorted_genelist = load_sorted_clusters(path)
    ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    for clusterID, gene in sorted_genelist:
        ## append 0/1 to each strain
        create_genePresence(dt_strainGene, totalStrain, set_totalStrain,
                            gene[1])

    aln_fpath = '%s%s' % (output_path, 'genePresence.aln')
    with open(aln_fpath, 'wb') as presence_outfile:
        for strain, presence in dt_strainGene.items():
            write_in_fa(presence_outfile, strain, presence)
    write_pickle('%s%s' % (output_path, 'dt_genePresence.cpk'), dt_strainGene)

    if not disable_gain_loss:
        return

    ## no gain/loss inference: record zero events for every cluster index
    geneEvents_dt = dict.fromkeys(range(len(sorted_genelist)), 0)
    write_pickle('%s%s' % (output_path, 'dt_geneEvents.cpk'), geneEvents_dt)

    if merged_gain_loss_output:
        gene_loss_fname = '%s%s' % (output_path, 'geneGainLossEvent.json')
        write_json(dt_strainGene, gene_loss_fname, indent=1)
        return

    ## strainID as key, presence pattern as value (converted into np.array)
    keylist = sorted(dt_strainGene.keys())
    presence_arr = np.array(
        [np.array(dt_strainGene[key], 'c') for key in keylist])
    ## every '1' is recoded as '3'. The original note said
    ## "0: present, 3: absent"; NOTE(review): if '1' marks presence (as the
    ## 0/1 wording of the docstring suggests) that note is inverted -- confirm
    presence_arr[presence_arr == '1'] = '3'
    for ind, (clusterID, gene) in enumerate(sorted_genelist):
        pattern_dt = {}
        for strain_ind, patt in enumerate(presence_arr[:, ind]):
            pattern_dt[keylist[strain_ind]] = str(patt)
        pattern_fname = '%s%s_patterns.json' % (output_path, clusterID)
        write_json(pattern_dt, pattern_fname, indent=1)
def parse_RNACluster(path, inputfile):
    """ store clusters as dictionary in cpk file

    Every line of path + inputfile is one cluster with tab-separated
    members. Line number n (starting at 1) becomes cluster 'RC%05d' % n with
    the value [number of distinct strains, member list, number of distinct
    members]; the strain is the part of a member before the first '|'.
    The dict is pickled to path + 'allclusters.cpk'.
    """
    cluster_fpath = "%s%s" % (path, inputfile)
    RNACluster_dt = defaultdict(list)
    with open(cluster_fpath, 'rb') as infile:
        for gid, iline in enumerate(infile, 1):
            ##format: NC_022226|1-1956082:1956435
            members = iline.rstrip().split('\t')
            strains = set(member.split('|')[0] for member in members)
            RNACluster_dt["RC%05d" % gid] = [
                len(strains), list(members), len(set(members))]
    write_pickle(path + 'allclusters.cpk', RNACluster_dt)
def parse_geneCluster(input_fpath, output_fpath, cluster_log=False):
    """ store clusters as dictionary in cpk file

    Every line of input_fpath is one cluster with tab-separated members.
    Line number n (starting at 1) becomes cluster 'GC%08d' % n with the
    value [number of distinct strains, member list, number of distinct
    members]; the strain is the part of a member before the first '|'.
    The dict is pickled to output_fpath and returned. cluster_log is not
    used.
    """
    geneCluster_dt = defaultdict(list)
    with open(input_fpath, 'rb') as infile:
        for gid, iline in enumerate(infile, 1):
            ##format: NC_022226|1-1956082:1956435
            members = iline.rstrip().split('\t')
            strains = set(member.split('|')[0] for member in members)
            geneCluster_dt["GC%08d" % gid] = [
                len(strains), list(members), len(set(members))]
    write_pickle(output_fpath, geneCluster_dt)
    return geneCluster_dt
def make_strain_list(self):
    """ make strainID list and harmonize input filenames

    Strain names are the file names (without folder) of the .gbk files
    (self.gbk_present == 1) or .faa files, cut at the first occurrence of
    that suffix. Files found directly in self.path are passed to
    harmonize_filename and moved into the input folders with shell 'mv';
    if self.path holds none, the names are taken from the input folder.
    The list is pickled to self.fpaths_dict['strain_cpk'] and stored in
    self.strain_list / self.nstrains.
    """
    path = self.path
    folders_dict = self.folders_dict

    def collect_strains(folder, suffix):
        ## file name without folder, cut at the first occurrence of suffix
        return [
            fpath.split('/')[-1].split(suffix)[0]
            for fpath in glob.iglob('%s*%s' % (folder, suffix))
        ]

    ## load input strains from all gbk or fasta files in self.path
    if self.gbk_present == 1:
        glob_item = '.gbk'
        gbk_path = folders_dict['gbk_path']
        glob_list = glob.glob('%s*%s' % (path, glob_item))
        if glob_list:
            harmonize_filename(path, glob_list)
            ## globbed again after harmonize_filename (presumably because it
            ## renames files -- confirm)
            strain_list = collect_strains(path, glob_item)
            ## move gbk files in folder input_GenBank
            os.system(' '.join(['mv', path + '*gbk', gbk_path]))
        else:
            strain_list = collect_strains(gbk_path, glob_item)
    else:
        glob_item = '.faa'
        glob_list = glob.glob('%s*%s' % (path, glob_item))
        if glob_list:
            harmonize_filename(path, glob_list)
            strain_list = collect_strains(path, glob_item)
        else:
            strain_list = collect_strains(folders_dict['protein_path'],
                                          glob_item)
        command_organize_aa_input = 'mv %s*.faa %s' % (
            path, folders_dict['protein_path'])
        command_organize_nuc_input = 'mv %s*.fna %s' % (
            path, folders_dict['nucleotide_path'])
        ## nucleotide files are moved first, then the protein files
        os.system(command_organize_nuc_input)
        os.system(command_organize_aa_input)

    # write the list of strains to a pickle file and store the list in self
    write_pickle(self.fpaths_dict['strain_cpk'], strain_list)
    self.strain_list = strain_list
    self.nstrains = len(strain_list)
def parse_RNACluster(path, inputfile):
    """ store clusters as dictionary in cpk file

    Input: path + inputfile, one cluster per line, members separated by
    tabs. Output: path + 'allclusters.cpk' holding
    {clusterID: [num_strains, members, num_RNAs]}, where clusterID is
    'RC' plus the zero-padded 1-based line number, num_strains counts the
    distinct prefixes before the first '|' and num_RNAs the distinct
    members.
    """
    RNACluster_dt = defaultdict(list)
    with open("%s%s" % (path, inputfile), 'rb') as infile:
        for gid, iline in enumerate(infile, 1):
            ##format: NC_022226|1-1956082:1956435
            RNA_mem = iline.rstrip().split('\t')
            strain_names = set()
            for member in RNA_mem:
                strain_names.add(member.split('|')[0])
            num_strains = len(strain_names)
            num_RNAs = len(set(RNA_mem))
            clusterID = "RC%05d" % gid
            RNACluster_dt[clusterID] = [num_strains, RNA_mem[:], num_RNAs]
    write_pickle(path + 'allclusters.cpk', RNACluster_dt)
def parse_geneCluster(input_fpath, output_fpath, cluster_log=False):
    """ store clusters as dictionary in cpk file

    Input: input_fpath, one cluster per line, members separated by tabs.
    Output: output_fpath holding
    {clusterID: [num_strains, members, num_genes]}, where clusterID is
    'GC' plus the zero-padded 1-based line number, num_strains counts the
    distinct prefixes before the first '|' and num_genes the distinct
    members. The same dict is returned. cluster_log is ignored.
    """
    geneCluster_dt = defaultdict(list)
    with open(input_fpath, 'rb') as infile:
        for gid, iline in enumerate(infile, 1):
            ##format: NC_022226|1-1956082:1956435
            gene_mem = iline.rstrip().split('\t')
            strain_names = set()
            for member in gene_mem:
                strain_names.add(member.split('|')[0])
            num_strains = len(strain_names)
            num_genes = len(set(gene_mem))
            clusterID = "GC%08d" % gid
            geneCluster_dt[clusterID] = [num_strains, gene_mem[:], num_genes]
    write_pickle(output_fpath, geneCluster_dt)
    return geneCluster_dt
def make_genepresence_alignment(path, disable_gain_loss,
                                merged_gain_loss_output):
    '''
    loop over all gene clusters and append 0/1 to strain specific
    string used as pseudo alignment of gene presence absence

    Files in path + 'geneCluster/':
      genePresence.aln, dt_genePresence.cpk   -- always
      dt_geneEvents.cpk (all counts 0)         -- if disable_gain_loss
      geneGainLossEvent.json                   -- if additionally
                                                  merged_gain_loss_output
      clusterID + '_patterns.json' per cluster -- if disable_gain_loss and
                                                  not merged_gain_loss_output
    '''
    output_path = '%s%s' % (path, 'geneCluster/')

    ## load strain list and prepare for gene presence/absence
    set_totalStrain = set(load_pickle('%s%s' % (path, 'strain_list.cpk')))
    totalStrain = len(set_totalStrain)

    dt_strainGene = defaultdict(str)
    sorted_genelist = load_sorted_clusters(path)
    ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    for clusterID, gene in sorted_genelist:
        ## append 0/1 to each strain
        create_genePresence(dt_strainGene, totalStrain, set_totalStrain,
                            gene[1])

    with open('%s%s' % (output_path, 'genePresence.aln'),
              'wb') as presence_outfile:
        for istkey in dt_strainGene:
            write_in_fa(presence_outfile, istkey, dt_strainGene[istkey])
    write_pickle('%s%s' % (output_path, 'dt_genePresence.cpk'), dt_strainGene)

    if disable_gain_loss:
        ## without gain/loss inference each cluster index gets 0 events
        geneEvents_dt = {}
        for cluster_index in range(len(sorted_genelist)):
            geneEvents_dt[cluster_index] = 0
        write_pickle('%s%s' % (output_path, 'dt_geneEvents.cpk'),
                     geneEvents_dt)

        if merged_gain_loss_output:
            write_json(dt_strainGene,
                       '%s%s' % (output_path, 'geneGainLossEvent.json'),
                       indent=1)
        else:
            ## strainID as key, presence pattern as value (converted into np.array)
            keylist = list(dt_strainGene.keys())
            keylist.sort()
            presence_arr = np.array(
                [np.array(dt_strainGene[k], 'c') for k in keylist])
            ## '1' is rewritten to '3'; the original note "0: present,
            ## 3: absent" looks inverted if '1' stands for presence
            ## -- NOTE(review): confirm
            presence_arr[presence_arr == '1'] = '3'
            for ind, (clusterID, gene) in enumerate(sorted_genelist):
                column = presence_arr[:, ind]
                pattern_dt = dict(
                    (keylist[strain_ind], str(patt))
                    for strain_ind, patt in enumerate(column))
                write_json(pattern_dt,
                           '%s%s_patterns.json' % (output_path, clusterID),
                           indent=1)
def make_strain_list(self):
    """ make strainID list and harmonize input filenames

    Depending on self.gbk_present the strain names come from .gbk or .faa
    files: from self.path if it contains such files (they are handed to
    harmonize_filename and afterwards moved away with shell 'mv'),
    otherwise from the matching input folder in self.folders_dict. A strain
    name is the bare file name cut at the first occurrence of the suffix.
    Results: pickle at self.fpaths_dict['strain_cpk'], self.strain_list,
    self.nstrains.
    """
    path = self.path
    folders_dict = self.folders_dict

    ## load input strains from all gbk or fasta files in self.path
    use_gbk = self.gbk_present == 1
    glob_item = '.gbk' if use_gbk else '.faa'
    if use_gbk:
        input_folder = folders_dict['gbk_path']
    glob_list = glob.glob('%s*%s' % (path, glob_item))
    found_in_path = len(glob_list) != 0
    if found_in_path:
        harmonize_filename(path, glob_list)
        ## second glob after harmonize_filename, as in the original flow
        strain_source = path
    elif use_gbk:
        strain_source = input_folder
    else:
        strain_source = folders_dict['protein_path']

    strain_list = []
    for fpath in glob.iglob('%s*%s' % (strain_source, glob_item)):
        strain_list.append(fpath.split('/')[-1].split(glob_item)[0])

    if use_gbk:
        if found_in_path:
            ## move gbk files in folder input_GenBank
            os.system(' '.join(['mv', path + '*gbk', input_folder]))
    else:
        command_organize_aa_input = 'mv %s*.faa %s' % (
            path, folders_dict['protein_path'])
        command_organize_nuc_input = 'mv %s*.fna %s' % (
            path, folders_dict['nucleotide_path'])
        ## executed in this order: nucleotide files, then protein files
        os.system(command_organize_nuc_input)
        os.system(command_organize_aa_input)

    # write the list of strains to a pickle file and store the list in self
    write_pickle(self.fpaths_dict['strain_cpk'], strain_list)
    self.strain_list = strain_list
    self.nstrains = len(strain_list)
def create_core_SNP_matrix(path, core_cutoff=1.0, core_gene_strain_fpath=''):
    """ create SNP matrix using core gene SNPs
    input: strain_list.cpk and the clusterID + '_na_aln.fa' alignments in
        path + 'geneCluster/'
    output (all in path + 'geneCluster/'):
        core_geneList.txt / core_geneList.cpk: accepted alignment files
        snp_pos.cpk: {alignment file: SNP column indices}
        SNP_whole_matrix.aln: concatenated SNP columns, one record per
            strain of the sorted strain list
    core_cutoff: percentage of strains used to decide whether a gene is core
        default: 1.0 (strictly core gene, which is present in all strains)
        customized: 0.9 ( soft core, considered as core if present in 90% of strains)
    core_gene_strain_fpath: optional file with one strain name per line
        ('-' is replaced by '_'); if given, a cluster is only accepted
        when it contains every listed strain
    """
    import os
    import numpy as np
    from collections import defaultdict
    from sf_miscellaneous import read_fasta, write_pickle, load_pickle, write_in_fa

    alnFilePath = '%s%s' % (path, 'geneCluster/')
    output_path = alnFilePath

    strain_list = load_pickle(path + 'strain_list.cpk')
    totalStrain = len(strain_list)
    sorted_geneList = load_sorted_clusters(path)

    ## the cutoff does not depend on the cluster: computed once
    strict_core = core_cutoff == 1.0
    if strict_core:
        strain_core_cutoff = totalStrain
    else:
        strain_core_cutoff = int(totalStrain * core_cutoff)

    core_strain_set = None
    if core_gene_strain_fpath != '':
        with open(core_gene_strain_fpath, 'rb') as core_gene_strain_file:
            core_strain_set = set(
                i.rstrip().replace('-', '_') for i in core_gene_strain_file)

    ## create core gene list
    corelist = []
    with open(output_path + 'core_geneList.txt', 'wb') as outfile:
        for clusterID, vg in sorted_geneList:
            ## vg: [count_strains, [members], count_genes]; require one gene
            ## per strain and enough strains
            if vg[0] != vg[2] or vg[0] < strain_core_cutoff:
                continue
            coreGeneName = '%s%s' % (clusterID, '_na_aln.fa')
            coreGeneName_path = alnFilePath + coreGeneName
            ## sequences might be discarded because of premature stops
            if not os.path.exists(coreGeneName_path):
                continue
            if len(read_fasta(coreGeneName_path)) < strain_core_cutoff:
                continue
            if core_strain_set is not None and \
                    core_strain_set - set(i.split('|')[0] for i in vg[1]):
                ## at least one required strain is missing in this cluster
                continue
            outfile.write(coreGeneName + '\n')
            corelist.append(coreGeneName)
    write_pickle(output_path + 'core_geneList.cpk', corelist)

    ## sorted strain names: row order of the soft-core arrays and record
    ## names of the output alignment
    refSeqList = sorted(strain_list)

    snp_pos_dt = defaultdict(list)
    snp_blocks = []  # SNP columns per core gene, concatenated once below
    for align_file in corelist:  ## core genes
        gene_seq_dt = read_fasta(alnFilePath + align_file)
        # build strain_seq_dt from gene_seq_dt
        strain_seq_dt = {}
        for gene, seq in gene_seq_dt.items():
            strain_seq_dt[gene.split('-')[0]] = seq  # strain-locus_tag-...

        if strict_core:
            ## one row per strain, ordered by strain name
            rows = [np.fromstring(strain_seq_dt[strain], dtype='S1')
                    for strain in sorted(strain_seq_dt)]
            ## stacked once instead of growing the array row by row
            nuc_array = np.vstack(rows)
            ## find SNP positions
            position_polymorphic = np.any(nuc_array != nuc_array[0, :],
                                          axis=0)
            position_has_gap = np.any(nuc_array == '-', axis=0)
            position_SNP = position_polymorphic & (~position_has_gap)
        else:
            # set sequences for missing gene (space*gene_length)
            missing_gene_seq = ' ' * len(list(gene_seq_dt.values())[0])
            rows = []
            for strain in refSeqList:
                if strain in strain_seq_dt:
                    rows.append(
                        np.fromstring(strain_seq_dt[strain], dtype='S1'))
                else:
                    print('Soft core gene: gene absent in strain %s on cluster %s'
                          % (strain, align_file))
                    rows.append(np.fromstring(missing_gene_seq, dtype='S1'))
            nuc_array = np.vstack(rows)
            ## find SNP positions
            ## mask missing genes -- determine rows that have ' ' in every column
            is_missing = np.all(nuc_array == ' ', axis=1)
            masked_non_missing_array = np.ma.masked_array(nuc_array,
                                                          nuc_array == ' ')
            ## NOTE(review): row 0 is the comparison reference; verify the
            ## result when the first strain is the one lacking the gene
            ## (row 0 fully masked)
            position_polymorphic = np.any(
                masked_non_missing_array != masked_non_missing_array[0, :],
                axis=0)
            position_has_gap = np.any(masked_non_missing_array == '-',
                                      axis=0)
            position_SNP = position_polymorphic & (~position_has_gap)
            ## add '-' for missing genes when dealing with soft core genes
            if is_missing.sum() > 0:  # with missing genes
                nuc_array[is_missing] = '-'

        snp_blocks.append(nuc_array[:, position_SNP])
        snp_pos_dt[align_file] = np.where(position_SNP)[0]

    if snp_blocks:
        snp_whole_matrix = np.hstack(snp_blocks)
    else:
        snp_whole_matrix = np.array([])

    write_pickle(output_path + 'snp_pos.cpk', snp_pos_dt)
    with open(output_path + 'SNP_whole_matrix.aln', 'wb') as outfile:
        for ind, isw in enumerate(snp_whole_matrix):
            write_in_fa(outfile, refSeqList[ind], isw.tostring())
def output_cutted_clusters(file_path, uncluster_filename, gene_list, cut_branch_threshold, treefile_used=None, cut_leftover=None):
    """
    write the new clusters a mis-clustered file has been split into
    (the old cluster file itself is not deleted here)
    params:
        file_path: folder prefix of the cluster files (joined by
            concatenation)
        uncluster_filename: file name of the nucleotide fasta (.fna) of the
            cluster to split; the amino acid file name is derived by
            replacing 'fna' with 'faa'
        gene_list: lists containing the genes in the new split clusters
        cut_branch_threshold, treefile_used: passed on to
            cutTree_outputCluster for leftover clusters
        cut_leftover: flag to indicate whether there are the leftover nodes
            after cutting long branches. Default: empty.
    For every entry of gene_list a .fna and a .faa file are written. With
    cut_leftover the new cluster (suffix _r + index) is handed to
    cutTree_outputCluster; otherwise (suffix _ + index) its nucleotide file
    is recorded in new_clusters_longSplit.txt and its statistics are
    pickled into update_long_branch_splits/.
    """
    clusterID = uncluster_filename.replace('.fna', '')
    origin_uncluster_nu_fa = uncluster_filename
    origin_uncluster_aa_fa = uncluster_filename.replace('fna', 'faa')
    new_fa_files = set()
    is_leftover = cut_leftover == True

    ## load origin cluster fa files
    origin_nu_fa_dt = read_fasta(file_path + origin_uncluster_nu_fa)
    origin_aa_fa_dt = read_fasta(file_path + origin_uncluster_aa_fa)

    ## split_gene_list has geneSeqID instead of geneID
    for sgs_index, split_gene_list in enumerate(gene_list, 1):
        if is_leftover:
            ## newClusterId for the rest genes (_r as identifier)
            newClusterId = "%s_r%s" % (clusterID, sgs_index)
        else:
            newClusterId = "%s_%s" % (clusterID, sgs_index)

        #=============================================
        ## write new divided/split cluster files
        gene_cluster_nu_filepath = file_path + "%s%s" % (newClusterId, '.fna')
        gene_cluster_aa_filepath = file_path + "%s%s" % (newClusterId, '.faa')
        ## context managers close both files even if a gene is missing
        with open(gene_cluster_nu_filepath, 'wb') as gene_cluster_nu_write:
            with open(gene_cluster_aa_filepath, 'wb') as gene_cluster_aa_write:
                for gene_memb in split_gene_list:
                    if "\\'" in gene_memb:
                        # Replace '\' in node name:
                        ## NC_018495|CM9_RS01675-1-guanosine-3',5'-... in fasta ID
                        ## 'NC_018495|CM9_RS01675-1-guanosine-3\',5\'-...' in nwk node name
                        ## Use origin_nu_fa_dt[gene_memb] will throw the KeyError:
                        ## "NC_018495|CM9_RS01675-1-guanosine-3\\',5\\'"
                        gene_memb = gene_memb.replace("\\'", "'")
                    write_in_fa(gene_cluster_nu_write, gene_memb,
                                origin_nu_fa_dt[gene_memb])
                    write_in_fa(gene_cluster_aa_write, gene_memb,
                                origin_aa_fa_dt[gene_memb])
        #=============================================

        if is_leftover:
            ## align the rest genes, build tree, cut long branches till nothing can be cutted.
            cutTree_outputCluster([gene_cluster_nu_filepath], file_path,
                                  cut_branch_threshold, treefile_used)
        else:
            ## record the misclusters to be deleted (already addressed in cutTree_outputCluster )
            ## add record in new_clusters_longSplit.txt, which is used for align new clusters
            new_fa_files.add(gene_cluster_nu_filepath)

            ## write cluster statistics in folder update_long_branch_splits
            ## layout: [num_strains, gene members, num_genes]
            num_strains = len(set(ig.split('|')[0] for ig in split_gene_list))
            num_genes = len(set(split_gene_list))
            gene_members = [ig.split('-')[0] for ig in split_gene_list]
            addin_geneCluster_dt = defaultdict(list)
            addin_geneCluster_dt[newClusterId] = [num_strains, gene_members,
                                                  num_genes]
            ## cPickle new cluster statistics
            write_pickle(''.join([file_path, 'update_long_branch_splits/',
                                  newClusterId, '.cpk']),
                         addin_geneCluster_dt)

    ## append the new nucleotide files to new_clusters_longSplit.txt
    with open(file_path + 'new_clusters_longSplit.txt', 'a') as refined_cluster_file:
        for i in new_fa_files:
            refined_cluster_file.write('%s\n' % i)
def estimate_core_gene_diversity(path, folders_dict, strain_list, parallel, core_cutoff, factor_core_diversity, species):
    """
    estimate core gene diversity before gene cluster alignment
    and cluster post-processing
    params:
        path: central data folder prefix (joined by concatenation)
        folders_dict: needs 'clustering_path', 'protein_path' and
            'nucleotide_path'
        strain_list: list of strains (only its length is used)
        parallel, species: passed on to multips / calculate_diversity
        core_cutoff: fraction of strains a cluster has to cover to count as
            core (1.0: all strains)
        factor_core_diversity: factor f in the cutoff formula below
    returns:
        (calculated_core_diversity, refined_core_diversity), where
        refined = round((0.1 + f*d) / (1 + f*d), 4) for the calculated
        diversity d
    The sequence dictionaries all_protein_seq.cpk and all_nucleotide_seq.cpk
    are loaded, not built, so they have to exist already. The working folder
    clustering_path + 'tmp_core/' is recreated and finally moved into path.
    """
    totalStrain = len(strain_list)

    ## load clusters
    clustering_path = folders_dict['clustering_path']
    geneCluster_dt = load_pickle(clustering_path + 'allclusters.cpk')
    protein_path = folders_dict['protein_path']
    nucleotide_path = folders_dict['nucleotide_path']
    protein_dict_path = '%s%s' % (protein_path, 'all_protein_seq.cpk')
    nucleotide_dict_path = '%s%s' % (nucleotide_path, 'all_nucleotide_seq.cpk')
    tmp_core_seq_path = '%s%s' % (clustering_path, 'tmp_core/')

    ## load geneID_to_geneSeqID geneSeqID cpk file
    geneID_to_geneSeqID_dict = load_pickle(path + 'geneID_to_geneSeqID.cpk')

    ## the cutoff is the same for every cluster: computed once
    if core_cutoff == 1.0:
        strain_core_cutoff = totalStrain
    else:
        strain_core_cutoff = int(totalStrain * core_cutoff)

    ## create core gene list
    core_geneCluster_dt = defaultdict()
    # geneCluster_dt: {clusterID:[ count_strains,[memb1,...],count_genes }
    for clusterID, cluster_stats in geneCluster_dt.items():
        ## check whether #genes == #strains and it's a core/soft-core gene
        if cluster_stats[0] == cluster_stats[2] and cluster_stats[0] >= strain_core_cutoff:
            core_geneCluster_dt[clusterID] = cluster_stats

    ## start from an empty working folder
    if os.path.exists(tmp_core_seq_path):
        os.system(''.join(['rm -rf ', tmp_core_seq_path]))
    os.system('mkdir %s' % tmp_core_seq_path)

    ## dicts storing all genes' translation and nucleotide sequence
    ## (the disabled 'if 0:' code that rebuilt them was removed)
    gene_aa_dict = load_pickle(protein_dict_path)
    gene_na_dict = load_pickle(nucleotide_dict_path)

    ## write nucleotide and amino-acid sequences for each gene cluster
    export_cluster_seq_tmp(tmp_core_seq_path, core_geneCluster_dt,
                           geneID_to_geneSeqID_dict, gene_na_dict,
                           gene_aa_dict)

    tmp_fa_files = glob.glob(tmp_core_seq_path + "*.fna")
    multips(calculate_diversity, parallel, tmp_fa_files, tmp_core_seq_path,
            species)

    calculated_core_diversity = tmp_average_core_diversity(tmp_core_seq_path)
    scaled_diversity = factor_core_diversity * calculated_core_diversity
    refined_core_diversity = round((0.1 + scaled_diversity) / (1 + scaled_diversity), 4)
    print('factor used: ' + str(factor_core_diversity))
    print('average core genome diversity: ' + str(calculated_core_diversity))
    print('defined core genome diversity cutoff for splitting long branches: ' + str(refined_core_diversity))

    ## move folder tmp_core to the central data folder
    new_clustering_path = '%stmp_core' % path
    if os.path.exists(new_clustering_path):
        os.system(''.join(['rm -r ', new_clustering_path]))
    os.system('mv %s %s' % (tmp_core_seq_path, path))
    return calculated_core_diversity, refined_core_diversity
def update_geneCluster_cpk(path, geneCluster_dt):
    """ pickle the gene cluster dict as the post-processed cluster file

    The dict is written unchanged to
    path + 'protein_faa/diamond_matches/allclusters_postprocessed.cpk'.
    """
    output_fpath = ''.join([path, 'protein_faa/diamond_matches/',
                            'allclusters_postprocessed.cpk'])
    write_pickle(output_fpath, geneCluster_dt)
def export_gain_loss(tree, path, merged_gain_loss_output):
    '''
    export the strain tree and the gain/loss patterns stored on its nodes

    Written to path + '/geneCluster/':
      strain_tree.nwk    tree.tree in newick format
      dt_geneEvents.cpk  {gene index: number of branches whose code is 1 or 2}
      dt_genePattern.cpk {node name: pattern string}, only if
                         merged_gain_loss_output is true
      clusterID + '_patterns.json' for each cluster of
                         load_sorted_clusters(path), otherwise; content is
                         {"patterns":"..."} with one code per non-root node
                         in preorder
    The code of a gene on a branch is int(ancestral)*2 + int(derived), taken
    from node.up.genepresence and node.genepresence; the root (node.up is
    None) has no code.
    params:
        tree: object whose .tree has find_clades and whose nodes carry
            .up, .name and .genepresence
        path: central data folder (a trailing '/' is ignored)
        merged_gain_loss_output: selects the output format, see above
    '''
    # write final tree with internal node names as assigned by treetime
    sep = '/'
    output_path = sep.join([path.rstrip(sep), 'geneCluster/'])
    ## output_path already ends with '/': plain concatenation avoids the
    ## doubled separator the former sep.join produced
    events_dict_path = output_path + 'dt_geneEvents.cpk'
    gene_pattern_dict_path = output_path + 'dt_genePattern.cpk'
    tree_fname = output_path + 'strain_tree.nwk'
    Phylo.write(tree.tree, tree_fname, 'newick')

    gene_gain_loss_dict = defaultdict(str)
    preorder_strain_list = []  #store the preorder nodes as strain list
    for node in tree.tree.find_clades(order='preorder'):  # order does not matter much here
        if node.up is None:
            continue
        gain_loss = [
            str(int(ancestral) * 2 + int(derived))
            for ancestral, derived in zip(node.up.genepresence,
                                          node.genepresence)
        ]
        gene_gain_loss_dict[node.name] = "".join(gain_loss)
        preorder_strain_list.append(node.name)

    gain_loss_array = np.array(
        [list(gain_loss_str) for gain_loss_str in gene_gain_loss_dict.values()],
        dtype=int)
    # 1 and 2 are codes for gain/loss events
    events_array = ((gain_loss_array == 1) | (gain_loss_array == 2)).sum(axis=0)
    events_dict = {index: event for index, event in enumerate(events_array)}
    write_pickle(events_dict_path, events_dict)

    if merged_gain_loss_output:
        ## one pickle with the pattern strings of all nodes
        write_pickle(gene_pattern_dict_path, gene_gain_loss_dict)
    else:
        ## one file per cluster: column ind of the node x gene code table
        sorted_genelist = load_sorted_clusters(path)
        presence_arr = np.array([
            np.array(gene_gain_loss_dict[k], 'c') for k in preorder_strain_list
        ])
        ## the former 'if pattern_json_flag' branch (flag fixed to False,
        ## so never executed) was removed together with its key map
        for ind, (clusterID, _) in enumerate(sorted_genelist):
            pattern_fname = '%s%s_patterns.json' % (output_path, clusterID)
            pattern = ''.join([str(patt) for patt in presence_arr[:, ind]])
            with open(pattern_fname, 'w') as write_pattern:
                write_pattern.write('{"patterns":"' + pattern + '"}')
def create_core_SNP_matrix(path, core_cutoff=1.0, core_gene_strain_fpath=''):
    """
    Create the SNP matrix from core gene alignments.

    input:  strain_list.cpk, <clusterID>_na_aln.fa files in geneCluster/
    output (all in <path>geneCluster/):
        core_geneList.txt / core_geneList.cpk  names of the core alignments
        snp_pos.cpk            {alignment file: SNP column indices}
        SNP_whole_matrix.aln   one FASTA record per strain (sorted order)

    core_cutoff: fraction of strains used to decide whether a gene is core
        default: 1.0 (strictly core gene, present in all strains)
        customized: e.g. 0.9 (soft core, present in 90% of strains)
    core_gene_strain_fpath: optional file with one strain per line ('-' is
        replaced by '_'); when given, a core gene must contain all of them.

    A column is a SNP when it is polymorphic among the strains that carry
    the gene and none of those strains has a gap ('-') there. For soft core
    genes, strains lacking the gene are written as '-' in the matrix.
    """
    import os
    import numpy as np
    from collections import defaultdict
    from sf_miscellaneous import read_fasta, write_pickle, load_pickle, write_in_fa

    alnFilePath = '%s%s' % (path, 'geneCluster/')
    output_path = alnFilePath
    strict_core = core_cutoff == 1.0

    # Loaded once: used for the cutoff, the soft-core row order and the
    # record names of the final matrix.
    strain_list = load_pickle(path + 'strain_list.cpk')
    totalStrain = len(strain_list)
    sorted_strains = sorted(strain_list)
    sorted_geneList = load_sorted_clusters(path)

    # The cutoff does not depend on the cluster, so compute it once.
    if strict_core:
        strain_core_cutoff = totalStrain
    else:
        strain_core_cutoff = int(totalStrain * core_cutoff)

    core_strain_set = set()
    if core_gene_strain_fpath != '':
        with open(core_gene_strain_fpath, 'rb') as core_gene_strain_file:
            core_strain_set = set(
                i.rstrip().replace('-', '_') for i in core_gene_strain_file)

    ## create core gene list
    corelist = []
    with open(output_path + 'core_geneList.txt', 'wb') as outfile:
        for clusterID, vg in sorted_geneList:
            # vg: [count_strains, [memb1, ...], count_genes]
            if vg[0] != vg[2] or vg[0] < strain_core_cutoff:
                continue
            coreGeneName = '%s%s' % (clusterID, '_na_aln.fa')
            coreGeneName_path = alnFilePath + coreGeneName
            if not os.path.exists(coreGeneName_path):
                continue
            ## sequences might be discarded because of premature stops
            if len(read_fasta(coreGeneName_path)) < strain_core_cutoff:
                continue
            # empty core_strain_set (no file given) never rejects a gene
            if core_strain_set - set(i.split('|')[0] for i in vg[1]):
                continue
            outfile.write(coreGeneName + '\n')
            corelist.append(coreGeneName)
    write_pickle(output_path + 'core_geneList.cpk', corelist)

    snp_pos_dt = defaultdict(list)
    snp_blocks = []  # per-gene SNP columns, concatenated once at the end
    for align_file in corelist:
        gene_seq_dt = read_fasta(alnFilePath + align_file)
        # sequence IDs look like strain-locus_tag-...: key sequences by strain
        strain_seq_dt = {}
        for gene, seq in gene_seq_dt.items():
            strain_seq_dt[gene.split('-')[0]] = seq

        rows = []
        row_is_missing = []
        if strict_core:
            for strain in sorted(strain_seq_dt):
                rows.append(np.fromstring(strain_seq_dt[strain], dtype='S1'))
                row_is_missing.append(False)
        else:
            # placeholder row (spaces) for strains lacking this soft-core gene
            aln_length = len(next(iter(gene_seq_dt.values())))
            missing_gene_seq = ' ' * aln_length
            for strain in sorted_strains:
                if strain in strain_seq_dt:
                    rows.append(
                        np.fromstring(strain_seq_dt[strain], dtype='S1'))
                    row_is_missing.append(False)
                else:
                    print('Soft core gene: gene absent in strain %s on cluster %s' % (
                        strain, align_file))
                    rows.append(np.fromstring(missing_gene_seq, dtype='S1'))
                    row_is_missing.append(True)

        # Stack once instead of growing the array row by row.
        nuc_array = np.vstack(rows)
        is_missing = np.array(row_is_missing, dtype=bool)

        ## find SNP positions using only the strains that carry the gene.
        ## Comparing against the first *present* row keeps SNPs detectable
        ## even when the alphabetically first strain lacks the gene.
        present_rows = nuc_array[~is_missing]
        position_polymorphic = np.any(
            present_rows != present_rows[0, :], axis=0)
        position_has_gap = np.any(present_rows == '-', axis=0)
        position_SNP = position_polymorphic & (~position_has_gap)

        # absent genes are exported as gaps
        nuc_array[is_missing] = '-'
        snp_blocks.append(nuc_array[:, position_SNP])
        snp_pos_dt[align_file] = np.where(position_SNP)[0]

    write_pickle(output_path + 'snp_pos.cpk', snp_pos_dt)

    if snp_blocks:
        snp_whole_matrix = np.hstack(snp_blocks)
    else:
        snp_whole_matrix = np.array([])
    with open(output_path + 'SNP_whole_matrix.aln', 'wb') as outfile:
        for ind, isw in enumerate(snp_whole_matrix):
            write_in_fa(outfile, sorted_strains[ind], isw.tostring())
def update_diversity_cpk(path):
    """Rebuild gene_diversity.cpk from the tab-separated gene_diversity.txt.

    Each line of <path>geneCluster/gene_diversity.txt contributes one entry:
    first column -> second column (both kept as strings).
    """
    output_path = path + 'geneCluster/'
    diversity_by_name = {}
    with open(output_path + 'gene_diversity.txt', 'rb') as infile:
        for line in infile:
            columns = line.rstrip().split('\t')
            diversity_by_name[columns[0]] = columns[1]
        write_pickle(output_path + 'gene_diversity.cpk', diversity_by_name)
def update_geneCluster_cpk(path, geneCluster_dt):
    """Pickle the gene cluster dictionary as allclusters_postprocessed.cpk.

    The file is written to <path>protein_faa/diamond_matches/.
    """
    pickle_fname = ''.join([
        path, 'protein_faa/diamond_matches/', 'allclusters_postprocessed.cpk'
    ])
    write_pickle(pickle_fname, geneCluster_dt)
def extract_sequences(path, strain_list, folders_dict, gbk_present, enable_RNA_clustering):
    '''
    go through all GenBank files and extract sequences and metadata for each one

    params:
        path: data folder; the ID/description pickles are written here
        strain_list: strain IDs; files are looked up as <strainID>.gbk/.faa/.fna
        folders_dict: must provide 'gbk_path', 'protein_path',
            'nucleotide_path' and 'RNA_path'
        gbk_present: if true, sequences are extracted from GenBank files via
            gbk_translation; otherwise existing .faa/.fna files are read
        enable_RNA_clustering: if true, the RNA pickles are written as well

    returns:
        (gene_aa_dict, gene_na_dict), both {strainID: {geneID: sequence}}

    Exits the program with status 1 when gbk_translation reports no CDS for
    at least one strain.
    '''
    import glob
    import sys

    gbk_path = folders_dict['gbk_path']
    protein_path = folders_dict['protein_path']
    nucleotide_path = folders_dict['nucleotide_path']
    RNA_path = folders_dict['RNA_path']

    geneID_to_geneSeqID_file = '%sgeneID_to_geneSeqID.cpk' % path
    geneID_to_description_file = '%sgeneID_to_description.cpk' % path
    RNAID_to_SeqID_file = '%sRNAID_to_SeqID.cpk' % path
    RNAID_to_description_file = '%sRNAID_to_description.cpk' % path

    protein_dict_path = '%s%s' % (protein_path, 'all_protein_seq.cpk')
    nucleotide_dict_path = '%s%s' % (nucleotide_path, 'all_nucleotide_seq.cpk')
    RNA_dict_path = '%s%s' % (RNA_path, 'all_RNA_seq.cpk')

    geneID_to_geneSeqID_dict = defaultdict()
    geneID_to_description_dict = defaultdict()
    RNAID_to_SeqID_dict = defaultdict()
    RNAID_to_description_dict = defaultdict()
    gene_aa_dict = defaultdict(dict)
    gene_na_dict = defaultdict(dict)
    RNA_dict = defaultdict(dict)

    if gbk_present:
        ## clean up folder when data from previous run exist.
        ## Done without a shell so that unusual characters in the folder
        ## names cannot change what gets deleted.
        for stale_pattern in (protein_path + '*.faa', nucleotide_path + '*.fna'):
            for stale_file in glob.glob(stale_pattern):
                if not (os.path.isfile(stale_file) or os.path.islink(stale_file)):
                    continue
                try:
                    os.remove(stale_file)
                except OSError as err:
                    # cleanup stays best-effort, but is no longer silent
                    print('Warning: could not remove %s (%s)' % (stale_file, err))

        missing_CDS_list = []  ## a list containing strains which have no CDS (if any)
        ## process gbk file
        for strainID in strain_list:
            gbk_fname = ''.join([gbk_path, strainID, '.gbk'])
            protein_fname = ''.join([protein_path, strainID, '.faa'])
            nucleotide_fname = ''.join([nucleotide_path, strainID, '.fna'])
            RNA_fname = ''.join([RNA_path, strainID, '.fna'])
            check_CDS_passed = gbk_translation(
                strainID, gbk_fname, protein_fname, nucleotide_fname, RNA_fname,
                geneID_to_geneSeqID_dict, geneID_to_description_dict,
                RNAID_to_SeqID_dict, RNAID_to_description_dict,
                gene_aa_dict, gene_na_dict, RNA_dict, enable_RNA_clustering)
            if not check_CDS_passed:
                missing_CDS_list.append(strainID)
        if missing_CDS_list:
            print('Warning: no CDS found in the following genome/genomes, please double-check\n%s'
                  % (missing_CDS_list,))
            # non-zero status: the run is aborted because of invalid input
            sys.exit(1)
    else:
        ## process fna/faa files if gbk files are not given.
        for strainID in strain_list:
            protein_fname = ''.join([protein_path, strainID, '.faa'])
            nucleotide_fname = ''.join([nucleotide_path, strainID, '.fna'])
            aa_sequence_dt = read_fasta(protein_fname)
            na_sequence_dt = read_fasta(nucleotide_fname)
            ## prepare geneSeqID and description; without GenBank
            ## annotation the gene name and annotation stay empty
            for geneID in aa_sequence_dt:
                geneID_to_geneSeqID_dict[geneID] = geneID
                geneID_to_description_dict[geneID] = {'geneName': '',
                                                      'annotation': ''}
                gene_aa_dict[strainID][geneID] = aa_sequence_dt[geneID]
                gene_na_dict[strainID][geneID] = na_sequence_dt[geneID]

    write_pickle(geneID_to_geneSeqID_file, geneID_to_geneSeqID_dict)
    write_pickle(geneID_to_description_file, geneID_to_description_dict)
    write_pickle(protein_dict_path, gene_aa_dict)
    write_pickle(nucleotide_dict_path, gene_na_dict)

    ## option: process RNA sequences for RNA_clustering
    if enable_RNA_clustering:
        write_pickle(RNA_dict_path, RNA_dict)
        write_pickle(RNAID_to_SeqID_file, RNAID_to_SeqID_dict)
        write_pickle(RNAID_to_description_file, RNAID_to_description_dict)

    return gene_aa_dict, gene_na_dict
def output_cutted_clusters(file_path, uncluster_filename, gene_list, cut_branch_threshold, treefile_used=None, cut_leftover=None):
    """
    write the FASTA files (and statistics) of the clusters split off a cluster

    params:
        file_path: folder holding the cluster FASTA files
        uncluster_filename: nucleotide FASTA (.fna) of the cluster being split
        gene_list: lists containing the genes (geneSeqIDs) in the new split
            clusters; one new cluster is written per inner list
        cut_branch_threshold, treefile_used: passed on to
            cutTree_outputCluster for leftover clusters
        cut_leftover: flag to indicate whether the genes are the leftover
            nodes after cutting long branches. Default: None (not leftover).

    New clusters are named <clusterID>_<n> (or <clusterID>_r<n> for
    leftovers), n counting from 1. For non-leftover clusters the nucleotide
    file path is appended to new_clusters_longSplit.txt and the statistics
    [num_strains, gene members, num_genes] are pickled to
    update_long_branch_splits/<newClusterId>.cpk.
    """
    clusterID = uncluster_filename.replace('.fna', '')
    ## load origin cluster fa files
    origin_nu_fa_dt = read_fasta(file_path + uncluster_filename)
    origin_aa_fa_dt = read_fasta(file_path + uncluster_filename.replace('fna', 'faa'))

    # equality (not truthiness) on purpose: only True/1 select the leftover path
    is_leftover = (cut_leftover == True)
    new_fa_files = set()

    ## split_gene_list has geneSeqID instead of geneID
    for sgs_index, split_gene_list in enumerate(gene_list, 1):
        if is_leftover:
            ## newClusterId for the rest genes (_r as identifier)
            newClusterId = "%s_r%s" % (clusterID, sgs_index)
        else:
            newClusterId = "%s_%s" % (clusterID, sgs_index)

        ## write new divided/split cluster files; the context manager closes
        ## both handles even when a gene is missing from the source cluster
        gene_cluster_nu_filepath = file_path + "%s%s" % (newClusterId, '.fna')
        gene_cluster_aa_filepath = file_path + "%s%s" % (newClusterId, '.faa')
        with open(gene_cluster_nu_filepath, 'wb') as gene_cluster_nu_write, \
                open(gene_cluster_aa_filepath, 'wb') as gene_cluster_aa_write:
            for gene_memb in split_gene_list:
                ## Newick node names escape the apostrophe (\') whereas the
                ## FASTA IDs do not, e.g. ...guanosine-3\',5\'-... vs
                ## ...guanosine-3',5'-...; undo the escaping before lookup.
                gene_memb = gene_memb.replace("\\'", "'")
                write_in_fa(gene_cluster_nu_write, gene_memb,
                            origin_nu_fa_dt[gene_memb])
                write_in_fa(gene_cluster_aa_write, gene_memb,
                            origin_aa_fa_dt[gene_memb])

        if is_leftover:
            ## align the rest genes, build tree, cut long branches till nothing can be cut.
            cutTree_outputCluster([gene_cluster_nu_filepath], file_path,
                                  cut_branch_threshold, treefile_used)
            continue

        # NOTE(review): statistics are recorded for non-leftover clusters
        # only; leftovers are presumably covered by the cutTree_outputCluster
        # call above -- confirm.
        ## add record in new_clusters_longSplit.txt, which is used for align new clusters
        new_fa_files.add(gene_cluster_nu_filepath)

        ## write cluster statistics in folder update_long_branch_splits
        num_strains = len(set(ig.split('|')[0] for ig in split_gene_list))
        num_genes = len(set(split_gene_list))
        gene_members = [ig.split('-')[0] for ig in split_gene_list]
        addin_geneCluster_dt = defaultdict(list)
        addin_geneCluster_dt[newClusterId] = [num_strains, gene_members, num_genes]
        write_pickle(
            ''.join([file_path, 'update_long_branch_splits/', newClusterId, '.cpk']),
            addin_geneCluster_dt)

    ## record the new cluster files that still have to be aligned
    with open(file_path + 'new_clusters_longSplit.txt', 'a') as refined_cluster_file:
        for new_fa_file in new_fa_files:
            refined_cluster_file.write('%s\n' % new_fa_file)
def estimate_core_gene_diversity(path, folders_dict, strain_list, parallel, core_cutoff, factor_core_diversity, species):
    """
    estimate core gene diversity before gene cluster alignment and cluster post-processing

    params:
        path: data folder; holds geneID_to_geneSeqID.cpk and receives tmp_core
        folders_dict: must provide 'clustering_path', 'protein_path' and
            'nucleotide_path'
        strain_list: strain IDs (only their number is used here)
        parallel, species: passed on to multips / calculate_diversity
        core_cutoff: fraction of strains a cluster must cover to count as
            core (1.0: all strains)
        factor_core_diversity: factor f applied to the average diversity d

    returns:
        (calculated_core_diversity, refined_core_diversity), the latter being
        round((0.1 + f*d) / (1 + f*d), 4)

    raises:
        OSError / shutil.Error if the temporary folder cannot be removed,
        created or moved.
    """
    import shutil

    totalStrain = len(strain_list)

    ## load clusters
    clustering_path = folders_dict['clustering_path']
    geneCluster_dt = load_pickle(clustering_path + 'allclusters.cpk')
    protein_path = folders_dict['protein_path']
    nucleotide_path = folders_dict['nucleotide_path']
    protein_dict_path = '%s%s' % (protein_path, 'all_protein_seq.cpk')
    nucleotide_dict_path = '%s%s' % (nucleotide_path, 'all_nucleotide_seq.cpk')
    tmp_core_seq_path = '%s%s' % (clustering_path, 'tmp_core/')

    ## load geneID_to_geneSeqID geneSeqID cpk file
    geneID_to_geneSeqID_dict = load_pickle(path + 'geneID_to_geneSeqID.cpk')

    ## create core gene list
    # the cutoff is the same for every cluster
    if core_cutoff == 1.0:
        strain_core_cutoff = totalStrain
    else:
        strain_core_cutoff = int(totalStrain * core_cutoff)

    core_geneCluster_dt = defaultdict()
    # geneCluster_dt: {clusterID:[ count_strains,[memb1,...],count_genes }
    for clusterID, cluster_stats in geneCluster_dt.items():
        ## check whether #genes == #strains and it's a core/soft-core gene
        if (cluster_stats[0] == cluster_stats[2]
                and cluster_stats[0] >= strain_core_cutoff):
            core_geneCluster_dt[clusterID] = cluster_stats

    ## start from an empty temporary folder
    if os.path.exists(tmp_core_seq_path):
        shutil.rmtree(tmp_core_seq_path)
    os.mkdir(tmp_core_seq_path)

    ## sequence dicts {strain: {geneID: sequence}} are read from the pickles
    gene_aa_dict = load_pickle(protein_dict_path)
    gene_na_dict = load_pickle(nucleotide_dict_path)

    ## write nucleotide and amino-acid sequences for each gene cluster
    export_cluster_seq_tmp(tmp_core_seq_path, core_geneCluster_dt,
                           geneID_to_geneSeqID_dict, gene_na_dict, gene_aa_dict)

    tmp_fa_files = glob.glob(tmp_core_seq_path + "*.fna")
    multips(calculate_diversity, parallel, tmp_fa_files, tmp_core_seq_path,
            species)

    calculated_core_diversity = tmp_average_core_diversity(tmp_core_seq_path)
    scaled_diversity = factor_core_diversity * calculated_core_diversity
    refined_core_diversity = round(
        (0.1 + scaled_diversity) / (1 + scaled_diversity), 4)
    print('factor used: ' + str(factor_core_diversity))
    print('average core genome diversity: ' + str(calculated_core_diversity))
    print(
        'defined core genome diversity cutoff for splitting long branches: ' +
        str(refined_core_diversity))

    ## move folder tmp_core to the central data folder, replacing an old copy
    new_clustering_path = '%stmp_core' % path
    if os.path.exists(new_clustering_path):
        shutil.rmtree(new_clustering_path)
    # moving into the existing folder `path` yields <path>/tmp_core
    shutil.move(tmp_core_seq_path.rstrip('/'), path)

    return calculated_core_diversity, refined_core_diversity