def make_genepresence_alignment(path, disable_gain_loss, merged_gain_loss_output):
    '''
    loop over all gene clusters and append 0/1 to strain specific
    string used as pseudo alignment of gene presence absence

    Parameters:
        path: folder prefix. File names are appended to it verbatim, so it
            has to end with a path separator.
        disable_gain_loss: when true, additionally write an all-zero gene
            event table and the per-strain presence patterns.
        merged_gain_loss_output: only read when disable_gain_loss is true.
            True  -> one merged geneGainLossEvent.json,
            False -> one <clusterID>_patterns.json per gene cluster.

    Files written below <path>geneCluster/:
        genePresence.aln, dt_genePresence.cpk   (always)
        dt_geneEvents.cpk and the pattern json(s) (disable_gain_loss only)

    Returns None.

    NOTE(review): a second definition with the same name appears later in
    this file and overrides this one at import time.
    NOTE(review): the nesting of the merged/per-cluster branch under
    `if disable_gain_loss:` was reconstructed from a whitespace-collapsed
    source -- confirm against the upstream file.
    '''
    output_path = '%s%s' % (path, 'geneCluster/')

    ## load strain list and prepare for gene presence/absence
    strain_list = load_pickle('%s%s' % (path, 'strain_list.cpk'))
    set_totalStrain = set(strain_list)
    totalStrain = len(set_totalStrain)
    dt_strainGene = defaultdict(str)

    sorted_genelist = load_sorted_clusters(path)
    ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    for clusterID, gene in sorted_genelist:
        ## append 0/1 to each strain
        create_genePresence(dt_strainGene, totalStrain, set_totalStrain, gene[1])

    ## file mode 'wb' kept as in the original; whether write_in_fa emits
    ## bytes or text is not visible here -- TODO confirm before a py3 port
    with open('%s%s' % (output_path, 'genePresence.aln'), 'wb') as presence_outfile:
        for istkey in dt_strainGene:
            write_in_fa(presence_outfile, istkey, dt_strainGene[istkey])
    write_pickle('%s%s' % (output_path, 'dt_genePresence.cpk'), dt_strainGene)

    if disable_gain_loss:
        ## no gain/loss inference: every cluster index gets 0 events
        geneEvents_dt = {i: 0 for i in range(len(sorted_genelist))}
        write_pickle('%s%s' % (output_path, 'dt_geneEvents.cpk'), geneEvents_dt)

        if merged_gain_loss_output:
            gene_loss_fname = '%s%s' % (output_path, 'geneGainLossEvent.json')
            write_json(dt_strainGene, gene_loss_fname, indent=1)
        else:
            ## strainID as key, presence pattern as value.
            ## sorted() instead of keys()+.sort(): the latter only works on
            ## Python 2, where keys() returns a list.
            keylist = sorted(dt_strainGene)
            ## '1' is recoded to '3'. In export_gain_loss the code is
            ## ancestral*2+derived, so '3' would mean "present in parent and
            ## child"; the original comment here read "0: present, 3: absent",
            ## which looks inverted -- TODO confirm.
            ## Plain string recoding replaces the former numpy 'c'-dtype
            ## array, which holds bytes on Python 3 and would never match '1'.
            recoded = {k: dt_strainGene[k].replace('1', '3') for k in keylist}
            for ind, (clusterID, gene) in enumerate(sorted_genelist):
                ## column `ind` of every strain's string = this cluster
                pattern_dt = {k: recoded[k][ind] for k in keylist}
                pattern_fname = '%s%s_patterns.json' % (output_path, clusterID)
                write_json(pattern_dt, pattern_fname, indent=1)
def make_genepresence_alignment(path, disable_gain_loss, merged_gain_loss_output):
    '''
    loop over all gene clusters and append 0/1 to strain specific
    string used as pseudo alignment of gene presence absence

    Parameters:
        path: folder prefix. File names are appended to it verbatim, so it
            has to end with a path separator.
        disable_gain_loss: when true, additionally write an all-zero gene
            event table and the per-strain presence patterns.
        merged_gain_loss_output: only read when disable_gain_loss is true.
            True  -> one merged geneGainLossEvent.json,
            False -> one <clusterID>_patterns.json per gene cluster.

    Files written below <path>geneCluster/:
        genePresence.aln, dt_genePresence.cpk   (always)
        dt_geneEvents.cpk and the pattern json(s) (disable_gain_loss only)

    Returns None.

    NOTE(review): this name is also defined earlier in this file; being the
    later definition, this is the one in effect after import.
    NOTE(review): the nesting of the merged/per-cluster branch under
    `if disable_gain_loss:` was reconstructed from a whitespace-collapsed
    source -- confirm against the upstream file.
    '''
    output_path = '%s%s' % (path, 'geneCluster/')

    ## load strain list and prepare for gene presence/absence
    strain_list = load_pickle('%s%s' % (path, 'strain_list.cpk'))
    set_totalStrain = set(strain_list)
    totalStrain = len(set_totalStrain)
    dt_strainGene = defaultdict(str)

    sorted_genelist = load_sorted_clusters(path)
    ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    for clusterID, gene in sorted_genelist:
        ## append 0/1 to each strain
        create_genePresence(dt_strainGene, totalStrain, set_totalStrain, gene[1])

    ## file mode 'wb' kept as in the original; whether write_in_fa emits
    ## bytes or text is not visible here -- TODO confirm before a py3 port
    with open('%s%s' % (output_path, 'genePresence.aln'), 'wb') as presence_outfile:
        for istkey in dt_strainGene:
            write_in_fa(presence_outfile, istkey, dt_strainGene[istkey])
    write_pickle('%s%s' % (output_path, 'dt_genePresence.cpk'), dt_strainGene)

    if not disable_gain_loss:
        return

    ## no gain/loss inference: every cluster index gets 0 events
    geneEvents_dt = {i: 0 for i in range(len(sorted_genelist))}
    write_pickle('%s%s' % (output_path, 'dt_geneEvents.cpk'), geneEvents_dt)

    if merged_gain_loss_output:
        gene_loss_fname = '%s%s' % (output_path, 'geneGainLossEvent.json')
        write_json(dt_strainGene, gene_loss_fname, indent=1)
        return

    ## strainID as key, presence pattern as value.
    ## sorted() instead of keys()+.sort(): the latter only works on
    ## Python 2, where keys() returns a list.
    keylist = sorted(dt_strainGene)
    ## '1' is recoded to '3'. In export_gain_loss the code is
    ## ancestral*2+derived, so '3' would mean "present in parent and child";
    ## the original comment here read "0: present, 3: absent", which looks
    ## inverted -- TODO confirm.
    ## Plain string recoding replaces the former numpy 'c'-dtype array,
    ## which holds bytes on Python 3 and would never compare equal to '1'.
    recoded = {k: dt_strainGene[k].replace('1', '3') for k in keylist}
    for ind, (clusterID, gene) in enumerate(sorted_genelist):
        ## column `ind` of every strain's string = this cluster
        pattern_dt = {k: recoded[k][ind] for k in keylist}
        pattern_fname = '%s%s_patterns.json' % (output_path, clusterID)
        write_json(pattern_dt, pattern_fname, indent=1)
def export(self, path = '', extra_attr = ['aa_muts','annotation','branch_length','name','accession'], RNA_specific=False):
    '''
    Write the tree and the alignments of this gene cluster to disk.

    Parameters:
        path: prefix prepended verbatim to every output file name
            (path + clusterID + suffix).
        extra_attr: node attributes forwarded to tree_to_json. The default
            list is shared between calls; it is not modified in this method.
        RNA_specific: forwarded to reduce_alignments; when it equals False
            the amino-acid alignments are written as well.

    Files written (all prefixed with path + self.clusterID):
        .nwk, _tree.json, _na_aln.fa, _na_aln_reduced.fa and, unless
        RNA_specific, _aa_aln.fa, _aa_aln_reduced.fa.

    Side effects: terminal nodes get accession/name/annotation set from
    node.ann, branch lengths below 1e-6 are raised to 1e-6, and the first
    '|' in every alignment record id is replaced by '-'.
    '''
    prefix = path + self.clusterID

    ## write tree (before branch lengths are floored below)
    Phylo.write(self.tree, prefix + '.nwk', 'newick')

    ## processing node name
    ## e.g. NZ_CP008870|HV97_RS21955-1-fabG_3-ketoacyl-ACP_reductase
    for node in self.tree.get_terminals():
        node.accession = node.ann.split('|')[0]
        node.name = node.ann.split('-')[0]
        annotation = node.ann.split('-', 2)
        if len(annotation) == 3:
            node.annotation = annotation[2]
        else:
            node.annotation = annotation[0]

    ## write tree json
    for n in self.tree.root.find_clades():
        ## explicit None check: on Python 2 `None < 1e-6` was True, so a
        ## missing length was floored as well; on Python 3 the bare
        ## comparison raises TypeError.
        if n.branch_length is None or n.branch_length < 1e-6:
            n.branch_length = 1e-6
    timetree_fname = prefix + '_tree.json'
    tree_json = tree_to_json(self.tree.root, extra_attr=extra_attr)
    write_json(tree_json, timetree_fname, indent=None)

    self.reduce_alignments(RNA_specific)

    ## msa compatible
    for record in self.aln:
        record.id = record.id.replace('|', '-', 1)
    for record in self.aln_reduced:
        record.id = record.id.replace('|', '-', 1)

    AlignIO.write(self.aln, prefix + '_na_aln.fa', 'fasta')
    AlignIO.write(self.aln_reduced, prefix + '_na_aln_reduced.fa', 'fasta')

    ## comparison form kept on purpose: `== False` and `not` differ for None
    if RNA_specific == False:
        for record in self.aa_aln:
            record.id = record.id.replace('|', '-', 1)
        for record in self.aa_aln_reduced:
            record.id = record.id.replace('|', '-', 1)

        AlignIO.write(self.aa_aln, prefix + '_aa_aln.fa', 'fasta')
        AlignIO.write(self.aa_aln_reduced, prefix + '_aa_aln_reduced.fa', 'fasta')

    ## write seq json (switched off by the hard-coded flag below)
    write_seq_json = 0
    if write_seq_json:
        root_seq = self.tree.root.sequence.tostring()
        elems = {}
        for node in self.tree.find_clades():
            if hasattr(node, "sequence"):
                if not hasattr(node, "longName"):
                    node.longName = node.name
                ## keep only the positions that differ from the root sequence
                nuc_dt = {pos: state
                          for pos, (state, ancstate)
                          in enumerate(zip(node.sequence.tostring(), root_seq))
                          if state != ancstate}
                elems[node.longName] = {'nuc': nuc_dt}

        elems['root'] = {'nuc': root_seq}

        self.sequences_fname = prefix + '_seq.json'
        write_json(elems, self.sequences_fname, indent=None)
def export_gain_loss(tree, path, merged_gain_loss_output):
    '''
    Write the strain tree and the per-branch gene gain/loss patterns.

    For every non-root node a string is built with one character per gene
    cluster, coded as ancestral*2 + derived from the parent's and the
    node's `genepresence` (so the codes are 0..3; 1 and 2 mark a change
    along the branch, i.e. a gain/loss event).

    Parameters:
        tree: object whose `.tree` is the phylogeny; nodes need `.up`,
            `.name` and `.genepresence`.
        path: base folder; output goes to <path>/geneCluster/.
        merged_gain_loss_output: True  -> pickle the whole node->pattern
            dict to dt_genePattern.cpk;
            False -> write one <clusterID>_patterns.json per gene cluster.

    Always written: strain_tree.nwk and dt_geneEvents.cpk (cluster index ->
    number of branches with code 1 or 2).

    Returns None.
    '''
    sep = '/'
    output_path = sep.join([path.rstrip(sep), 'geneCluster/'])
    ## output_path already ends with '/', so these paths contain '//';
    ## left as in the original to keep the file names byte-identical.
    events_dict_path = sep.join([output_path, 'dt_geneEvents.cpk'])
    gene_pattern_dict_path = sep.join([output_path, 'dt_genePattern.cpk'])

    # write final tree with internal node names as assigned by treetime
    tree_fname = sep.join([output_path, 'strain_tree.nwk'])
    Phylo.write(tree.tree, tree_fname, 'newick')

    gene_gain_loss_dict = defaultdict(str)
    preorder_strain_list = []  # store the preorder nodes as strain list
    for node in tree.tree.find_clades(order='preorder'):  # order does not matter much here
        if node.up is None:
            ## the root has no parent to compare against
            continue
        gain_loss = [str(int(ancestral) * 2 + int(derived))
                     for ancestral, derived
                     in zip(node.up.genepresence, node.genepresence)]
        gene_gain_loss_dict[node.name] = "".join(gain_loss)
        preorder_strain_list.append(node.name)

    gain_loss_array = np.array(
        [list(gain_loss_str) for gain_loss_str in gene_gain_loss_dict.values()],
        dtype=int)
    # 1 and 2 are codes for gain/loss events
    events_array = ((gain_loss_array == 1) | (gain_loss_array == 2)).sum(axis=0)
    events_dict = {index: event for index, event in enumerate(events_array)}
    write_pickle(events_dict_path, events_dict)

    if merged_gain_loss_output:
        ## export the complete node -> pattern dict in one pickle
        write_pickle(gene_pattern_dict_path, gene_gain_loss_dict)
        return

    ## one file per gene cluster, nodes in preorder
    sorted_genelist = load_sorted_clusters(path)
    ## Columns are read straight from the pattern strings. The former numpy
    ## 'c'-dtype array holds bytes on Python 3, where str() of an element
    ## gives "b'0'" rather than "0".
    patterns = [gene_gain_loss_dict[name] for name in preorder_strain_list]

    ## if true, write pattern dict instead of pattern string in a json file
    pattern_json_flag = False
    for ind, (clusterID, gene) in enumerate(sorted_genelist):
        pattern_fname = '%s%s_patterns.json' % (output_path, clusterID)
        if pattern_json_flag:
            pattern_dt = {name: patt[ind]
                          for name, patt in zip(preorder_strain_list, patterns)}
            write_json(pattern_dt, pattern_fname, indent=1)
        else:
            ## previously this write also ran after the dict branch and
            ## overwrote its file; the two are now alternatives
            with open(pattern_fname, 'w') as write_pattern:
                write_pattern.write(
                    '{"patterns":"'
                    + ''.join([patt[ind] for patt in patterns])
                    + '"}')