def make_genepresence_alignment(path):
    '''
    Build the gene presence/absence pseudo alignment.

    Loops over all gene clusters and lets create_genePresence() extend one
    per-strain record for every cluster. The per-strain records are then
    joined into strings and written as fasta, and the resulting dictionary
    is pickled.

    params:
        - path: data folder (used as a string prefix, so it is expected to
          end with '/'). 'strain_list.cpk' is read from it and the results
          go to its 'geneCluster/' sub-folder.
    output:
        - geneCluster/genePresence.aln: one fasta record per strain
        - geneCluster/dt_genePresence.cpk: {strain: joined presence string}
    '''
    output_path = '%s%s' % (path, 'geneCluster/')

    ## load strain list and prepare for gene presence/absence
    strain_list = load_pickle(path + 'strain_list.cpk')
    set_totalStrain = set(strain_list)
    totalStrain = len(set_totalStrain)
    dt_strainGene = defaultdict(list)

    sorted_genelist = load_sorted_clusters(path)
    ## sorted_genelist: [(clusterID, [count_strains, [memb1,...], count_genes]), ...]
    for _clusterID, gene in sorted_genelist:
        gene_list = gene[1]
        ## append 0/1 to each strain
        dt_strainGene = create_genePresence(dt_strainGene, totalStrain,
                                            set_totalStrain, gene_list)

    with open(output_path + 'genePresence.aln', 'wb') as presence_outfile:
        ## only values of existing keys are replaced, so iterating the
        ## dictionary while assigning is safe
        for istkey in dt_strainGene:
            dt_strainGene[istkey] = ''.join(dt_strainGene[istkey])
            write_in_fa(presence_outfile, istkey, dt_strainGene[istkey])

    write_pickle(output_path + 'dt_genePresence.cpk', dt_strainGene)
def create_split_cluster_files(file_path, fname, gene_list1, gene_list2, diamond_geneCluster_dt):
    """
    delete the old cluster and create two new clusters

    params:
        file_path: folder prefix of the cluster fasta files (concatenated
            with file names, so it is expected to end with '/')
        fname: path of the '.nwk' file of the cluster that is split; only
            its last '/'-separated component is used
        gene_list1/2: lists containing the genes (geneSeqIDs) in the new
            split clusters
        diamond_geneCluster_dt: cluster dictionary to be updated in place
            {clusterID: [count_strains, [memb1,...], count_genes]}
    returns:
        set with the paths of the two new nucleotide ('.fna') cluster files
    """
    orgin_nwk_name = fname.split('/')[-1]
    clusterID = orgin_nwk_name.replace('.nwk', '')
    origin_cluster_nu_fa = orgin_nwk_name.replace('nwk', 'fna')
    origin_cluster_aa_fa = orgin_nwk_name.replace('nwk', 'faa')
    split_fa_files_set = set()

    try:
        print('deleting:', orgin_nwk_name, gene_list1, gene_list2, clusterID)
        del diamond_geneCluster_dt[clusterID]
    except KeyError:
        ## the cluster is not in the dictionary; keep going and still
        ## create the split clusters
        print("can't delete", orgin_nwk_name, gene_list1, gene_list2, clusterID)

    ## write new cluster fa files
    origin_nu_fa_dt = read_fasta(file_path + origin_cluster_nu_fa)
    origin_aa_fa_dt = read_fasta(file_path + origin_cluster_aa_fa)

    ## split_gene_list has geneSeqID instead of geneID
    split_gene_lists = (list(gene_list1), list(gene_list2))
    for sgs_index, split_gene_list in enumerate(split_gene_lists, 1):
        newClusterId = "%s_%s" % (clusterID, sgs_index)
        gene_cluster_nu_filename = "%s%s" % (newClusterId, '.fna')
        gene_cluster_aa_filename = "%s%s" % (newClusterId, '.faa')
        split_fa_files_set.add(file_path + gene_cluster_nu_filename)

        ## write new split cluster files; 'with' closes both handles even
        ## when a member is missing from the original fasta dictionaries
        with open(file_path + gene_cluster_nu_filename, 'wb') as gene_cluster_nu_write:
            with open(file_path + gene_cluster_aa_filename, 'wb') as gene_cluster_aa_write:
                for gene_memb in split_gene_list:
                    write_in_fa(gene_cluster_nu_write, gene_memb, origin_nu_fa_dt[gene_memb])
                    write_in_fa(gene_cluster_aa_write, gene_memb, origin_aa_fa_dt[gene_memb])

        diamond_geneCluster_dt[newClusterId] = [
            ## num_strains: distinct prefixes before the first '|'
            len(set(ig.split('|')[0] for ig in split_gene_list)),
            ## gene members: geneSeqID truncated at the first '-'
            [ig.split('-')[0] for ig in split_gene_list],
            ## num_genes: distinct geneSeqIDs
            len(set(split_gene_list)),
        ]
    return split_fa_files_set
def create_geneCluster_fa():
    """
    Write one nucleotide ('.fna') and one amino acid ('.faa') fasta file
    per gene cluster into a freshly created 'geneCluster/' folder.

    input (all below `path`):
        protein_faa/*.faa
        nucleotide_fna/<strain>_gene_nuc_dict.cpk for every strain listed
            in strain_list.cpk
        protein_faa/diamond_matches/orthamcl-allclusters.cpk
        geneID_to_geneSeqID.cpk
    output:
        geneCluster/<clusterID>.fna and geneCluster/<clusterID>.faa, whose
        records are named by geneSeqID

    NOTE(review): `path` is not a parameter of this function; it is read
    from the enclosing (module) scope -- confirm it is defined there.
    """
    import shutil

    fasta_path = path + 'geneCluster/'
    ## make sure the geneCluster folder is empty
    if os.path.isdir(fasta_path):
        print('%s %s' % ('remove previous folder: ', fasta_path))
        ## no shell involved, so paths with spaces/metacharacters are safe
        shutil.rmtree(fasta_path)

    faa_path = path + 'protein_faa/'
    ## dict storing all genes' translation
    gene_aa_dict = defaultdict(list)
    for ifaa in glob.glob(faa_path + "*.faa"):
        gene_aa_dict.update(read_fasta(ifaa))

    ## dict storing nucleotide Id/Seq from '_gene_nuc_dict.cpk' files
    strain_list = load_pickle(path + 'strain_list.cpk')
    nucleotide_dict_path = '%s%s' % (path, 'nucleotide_fna/')
    istrain_cpk = {}
    for istrain in strain_list:
        istrain_cpk[istrain] = load_pickle(
            nucleotide_dict_path + istrain + '_gene_nuc_dict.cpk')

    ## load gene cluster cpk file
    geneCluster_path = faa_path + 'diamond_matches/'
    diamond_geneCluster_dt = load_pickle(geneCluster_path + 'orthamcl-allclusters.cpk')

    ## load geneID_to_geneSeqID geneSeqID cpk file
    geneID_to_geneSeqID_dict = load_pickle(path + 'geneID_to_geneSeqID.cpk')

    ## create cluster-genes fasta files
    if not os.path.isdir(fasta_path):
        os.makedirs(fasta_path)

    ## diamond_geneCluster_dt: {clusterID: [count_strains, [memb1,...], count_genes]}
    for clusterID, gene in diamond_geneCluster_dt.items():
        ## geneCluster file name
        gene_cluster_nu_filename = "%s%s" % (clusterID, '.fna')
        gene_cluster_aa_filename = "%s%s" % (clusterID, '.faa')
        with open(fasta_path + gene_cluster_nu_filename, 'wb') as gene_cluster_nu_write:
            with open(fasta_path + gene_cluster_aa_filename, 'wb') as gene_cluster_aa_write:
                ## write nucleotide/amino_acid sequences into geneCluster files
                for gene_memb in gene[1]:
                    ## gene_name format: strain_1|locusTag
                    strain_name = gene_memb.split('|')[0]
                    gene_memb_seq = str(istrain_cpk[strain_name][gene_memb])
                    geneSeqID = geneID_to_geneSeqID_dict[gene_memb]
                    write_in_fa(gene_cluster_nu_write, geneSeqID, gene_memb_seq)
                    write_in_fa(gene_cluster_aa_write, geneSeqID, gene_aa_dict[gene_memb])
def create_RNACluster_fa(path):
    """
    Write one nucleotide fasta file per RNA cluster and return the RNA
    cluster dictionary.

    input (all below `path`):
        nucleotide_fna/<strain>_RNA_nuc_dict.cpk for every strain listed in
            strain_list.cpk
        RNA_fna/orthamcl-allclusters.cpk
        RNAID_to_SeqID.cpk
    output:
        geneCluster/<clusterID>.fna (RNA clusters are put in the
        geneCluster folder; no folder is removed or created here)
    returns:
        the loaded RNA cluster dictionary
        {clusterID: [count_strains, [memb1,...], count_RNAs]}
    """
    ## dict storing nucleotide Id/Seq from '_RNA_nuc_dict.cpk' files
    strain_list = load_pickle(path + 'strain_list.cpk')
    nucleotide_dict_path = '%s%s' % (path, 'nucleotide_fna/')
    istrain_cpk = {}
    for istrain in strain_list:
        istrain_cpk[istrain] = load_pickle(
            nucleotide_dict_path + istrain + '_RNA_nuc_dict.cpk')

    ## load RNA cluster cpk file
    RNACluster_path = path + 'RNA_fna/'
    diamond_RNACluster_dt = load_pickle(RNACluster_path + 'orthamcl-allclusters.cpk')

    ## load RNAID_to_RNASeqID RNASeqID cpk file
    RNAID_to_RNASeqID_dict = load_pickle(path + 'RNAID_to_SeqID.cpk')

    ## create cluster-RNAs fasta files (By default: put RNAs in geneCluster folder)
    fasta_path = path + 'geneCluster/'

    ## diamond_RNACluster_dt: {clusterID: [count_strains, [memb1,...], count_RNAs]}
    for clusterID, RNA in diamond_RNACluster_dt.items():
        ## RNACluster file name
        RNA_cluster_nu_filename = "%s%s" % (clusterID, '.fna')
        ## 'with' closes the file even if a member lookup fails
        with open(fasta_path + RNA_cluster_nu_filename, 'wb') as RNA_cluster_nu_write:
            ## write nucleotide sequences into RNACluster files
            for RNA_memb in RNA[1]:
                ## RNA_name format: strain_1|locusTag
                strain_name = RNA_memb.split('|')[0]
                RNA_memb_seq = str(istrain_cpk[strain_name][RNA_memb])
                RNASeqID = RNAID_to_RNASeqID_dict[RNA_memb]
                write_in_fa(RNA_cluster_nu_write, RNASeqID, RNA_memb_seq)
    return diamond_RNACluster_dt
def align_and_makeTree(thread, alignFile_path, fa_files_list):
    """
    Align every gene cluster in fa_files_list, build its tree and append
    its diversity to '<alignFile_path>gene_diversity.txt'.

    Clusters with a single sequence are not aligned: their nucleotide and
    amino acid sequences are copied to '<cluster>_na.aln' and
    '<cluster>_aa.aln' and a diversity of 0.0 is recorded.

    params:
        - thread: not used inside this function
        - alignFile_path: folder prefix for 'gene_diversity.txt' and for
          the mpm_tree export
        - fa_files_list: paths of the nucleotide cluster files ('.fna');
          the matching '.faa' file is expected next to each of them

    A failure in one cluster is reported on stdout and does not stop the
    remaining clusters.
    """
    for gene_cluster_nu_filename in fa_files_list:
        try:
            # extract GC_00002 from path/GC_00002.fna
            clusterID = gene_cluster_nu_filename.split('/')[-1].split('.')[0]
            # append mode + 'with': every cluster adds its own line and the
            # handle is closed (and flushed) before the next cluster starts
            with open(alignFile_path + 'gene_diversity.txt', 'a') as geneDiversity_file:
                nu_sequences = read_fasta(gene_cluster_nu_filename)
                if len(nu_sequences) == 1:  # nothing to do for singletons
                    ## na.aln
                    gene_cluster_nu_aln_filename = gene_cluster_nu_filename.replace('.fna', '_na.aln')
                    ## geneSeqID separator '|' is replaced by '-' for msa viewer compatibility
                    with open(gene_cluster_nu_aln_filename, 'wb') as write_file:
                        for SeqID, Sequence in nu_sequences.items():
                            write_in_fa(write_file, SeqID.replace('|', '-'), Sequence)

                    ## aa.aln
                    gene_cluster_aa_filename = gene_cluster_nu_filename.replace('.fna', '.faa')
                    gene_cluster_aa_aln_filename = gene_cluster_nu_filename.replace('.fna', '_aa.aln')
                    ## geneSeqID separator '|' is replaced by '-' for msa viewer compatibility
                    with open(gene_cluster_aa_aln_filename, 'wb') as write_file:
                        for SeqID, Sequence in read_fasta(gene_cluster_aa_filename).items():
                            write_in_fa(write_file, SeqID.replace('|', '-'), Sequence)

                    geneDiversity_file.write('%s\t%s\n' % (clusterID, '0.0'))
                else:  # align and build tree
                    print(gene_cluster_nu_filename)
                    myTree = mpm_tree(gene_cluster_nu_filename)
                    myTree.codon_align()
                    myTree.translate()
                    myTree.build(raxml=False)
                    myTree.ancestral(translate_tree=True)
                    myTree.refine()
                    myTree.export(path=alignFile_path)
                    myTree.diversity_statistics()
                    diversity = myTree.diversity
                    gene_diversity_values = '{0:.3f}'.format(diversity)
                    geneDiversity_file.write('%s\t%s\n' % (clusterID, gene_diversity_values))
        except Exception as err:
            # best effort per cluster, but let KeyboardInterrupt/SystemExit
            # through and report what actually went wrong
            print("Aligning and tree building of %s failed (%s: %s)"
                  % (gene_cluster_nu_filename, type(err).__name__, err))
def create_split_unclustered_files(file_path, fname, gene_list, diamond_geneCluster_dt, merged_clusters_dict):
    """
    delete the unclustered file and create new clusters

    params:
        file_path: folder prefix of the cluster fasta files (concatenated
            with file names, so it is expected to end with '/')
        fname: path of the merged '.fna' file; only its last '/'-separated
            component is used
        gene_list: lists containing the genes (geneSeqIDs) in the new split
            clusters, one list per new cluster
        diamond_geneCluster_dt: cluster dictionary to be updated in place
            {clusterID: [count_strains, [memb1,...], count_genes]}
        merged_clusters_dict: dictionary of merged clusters (key) with
            original clusterIDs (value), which is used to delete the old
            unclustered items in diamond_geneCluster_dt
    returns:
        set with the paths of the new nucleotide ('.fna') cluster files
    """
    origin_uncluster_nwk_name = fname.split('/')[-1]
    clusterID = origin_uncluster_nwk_name.replace('.fna', '')
    origin_uncluster_nu_fa = origin_uncluster_nwk_name
    origin_uncluster_aa_fa = origin_uncluster_nwk_name.replace('fna', 'faa')
    split_fa_files_set = set()

    try:
        ## delete under-clustered clusters
        for cluster_needed_deletion in merged_clusters_dict[origin_uncluster_nwk_name]:
            if cluster_needed_deletion in diamond_geneCluster_dt:
                del diamond_geneCluster_dt[cluster_needed_deletion]
                print('deleting:', cluster_needed_deletion, ' gathered in ',
                      origin_uncluster_nwk_name)
    except KeyError:
        ## the merged file is unknown to merged_clusters_dict; keep going
        ## and still create the split clusters
        print("can't delete", " under_clusterd genes gathered in ",
              origin_uncluster_nwk_name)

    ## write new cluster fa files
    origin_nu_fa_dt = read_fasta(file_path + origin_uncluster_nu_fa)
    origin_aa_fa_dt = read_fasta(file_path + origin_uncluster_aa_fa)

    ## split_gene_list has geneSeqID instead of geneID
    for sgs_index, split_gene_list in enumerate(gene_list, 1):
        newClusterId = "%s_%s" % (clusterID, sgs_index)
        gene_cluster_nu_filename = "%s%s" % (newClusterId, '.fna')
        gene_cluster_aa_filename = "%s%s" % (newClusterId, '.faa')
        split_fa_files_set.add(file_path + gene_cluster_nu_filename)

        ## write new split cluster files; 'with' closes both handles even
        ## when a member is missing from the original fasta dictionaries
        with open(file_path + gene_cluster_nu_filename, 'wb') as gene_cluster_nu_write:
            with open(file_path + gene_cluster_aa_filename, 'wb') as gene_cluster_aa_write:
                for gene_memb in split_gene_list:
                    write_in_fa(gene_cluster_nu_write, gene_memb, origin_nu_fa_dt[gene_memb])
                    write_in_fa(gene_cluster_aa_write, gene_memb, origin_aa_fa_dt[gene_memb])

        diamond_geneCluster_dt[newClusterId] = [
            ## num_strains: distinct prefixes before the first '|'
            len(set(ig.split('|')[0] for ig in split_gene_list)),
            ## gene members: geneSeqID truncated at the first '-'
            [ig.split('-')[0] for ig in split_gene_list],
            ## num_genes: distinct geneSeqIDs
            len(set(split_gene_list)),
        ]
    return split_fa_files_set
def create_core_SNP_matrix(path):
    """
    create SNP matrix using core gene SNPs

    A cluster counts as core gene when its strain count and its gene count
    both equal the number of strains and its '<clusterID>_na.aln' file
    exists with exactly one sequence per strain. For every core gene the
    alignment columns that are polymorphic and contain no gap ('-') are
    collected and concatenated into one matrix with one row per strain.

    input: strain_list.cpk, sorted gene clusters (load_sorted_clusters),
           geneCluster/<clusterID>_na.aln
    output (all in geneCluster/):
        core_geneList.txt, core_geneList.cpk: names of core gene alignments
        snp_pos.cpk: {alignment file name: array of SNP column indices}
        SNP_whole_matrix.aln: fasta with one row of SNP columns per strain
    """
    import os
    import numpy as np
    from collections import defaultdict
    from SF00_miscellaneous import read_fasta, write_pickle, load_pickle, write_in_fa

    alnFilePath = '%s%s' % (path, 'geneCluster/')
    output_path = alnFilePath

    strain_list = load_pickle(path + 'strain_list.cpk')
    totalStrain = len(strain_list)

    ## create core gene list
    corelist = []
    sorted_geneList = load_sorted_clusters(path)
    with open(output_path + 'core_geneList.txt', 'wb') as outfile:
        for clusterID, vg in sorted_geneList:
            if vg[0] == totalStrain and vg[2] == totalStrain:
                coreGeneName = '%s%s' % (clusterID, '_na.aln')
                ## sequences might be discarded because of premature stops
                coreGeneName_path = alnFilePath + coreGeneName
                if os.path.exists(coreGeneName_path) and \
                        len(read_fasta(coreGeneName_path)) == totalStrain:
                    outfile.write(coreGeneName + '\n')
                    corelist.append(coreGeneName)
                else:
                    print('%s%s%s' % ('warning: ', coreGeneName_path,
                                      ' is not a core gene'))
    write_pickle(output_path + 'core_geneList.cpk', corelist)

    ## row labels of the final matrix
    ## NOTE(review): rows are ordered by the sequence-ID prefix before '|',
    ## labels by sorted strain name -- presumably the same order; confirm
    ## against the IDs actually stored in the '_na.aln' files.
    refSeqList = sorted(strain_list)

    snp_pos_dt = defaultdict(list)
    snp_blocks = []
    for align_file in corelist:  ## all core genes
        fa_dt = read_fasta(alnFilePath + align_file)
        fa_sorted_lst = sorted(fa_dt.items(), key=lambda x: x[0].split('|')[0])
        ## one row per sequence, one single-byte column per alignment
        ## position; a single vstack always yields a 2-D array, also for
        ## an alignment with only one sequence
        nuc_array = np.vstack([np.frombuffer(seq, dtype='S1')
                               for _seq_id, seq in fa_sorted_lst])

        position_polymorphic = np.where(
            ~np.all(nuc_array == nuc_array[0, :], axis=0))[0]
        position_has_gap = np.where(np.any(nuc_array == b'-', axis=0))[0]
        position_SNP = np.setdiff1d(position_polymorphic, position_has_gap)

        snp_pos_dt[align_file] = position_SNP
        snp_blocks.append(nuc_array[:, position_SNP])

    ## concatenate the SNP columns of all core genes side by side
    if snp_blocks:
        snp_whole_matrix = np.hstack(snp_blocks)
    else:
        snp_whole_matrix = np.array([])

    write_pickle(output_path + 'snp_pos.cpk', snp_pos_dt)

    with open(output_path + 'SNP_whole_matrix.aln', 'wb') as outfile:
        for ind, isw in enumerate(snp_whole_matrix):
            write_in_fa(outfile, refSeqList[ind], isw.tobytes())
def _strip_locus_tag_prefixes(locus_tag, strainName):
    ''' remove 'PROKKA_' and the '<strainName>_' prefix from a locus tag '''
    if "PROKKA" in locus_tag:
        locus_tag = locus_tag.replace('PROKKA_', '')
    strain_prefix = '%s_' % strainName
    if strain_prefix in locus_tag:
        ## keeps the text that follows the first occurrence of the prefix
        locus_tag = locus_tag.split(strain_prefix)[1]
    return locus_tag


def gbk_translation(each_gbk_path, nucleotide_dict_path, gb_file,
                    output_filename, output_filename2,
                    geneID_to_geneSeqID_dict, geneID_to_description_dict,
                    RNAID_to_SeqID_dict, RNAID_to_description_dict,
                    disable_RNA_clustering):
    '''
    extract sequences and meta informations of all genes in one reference genbank file
    params:
        - each_gbk_path: path to the set of reference sequences used to
          construct the core genome
        - nucleotide_dict_path: path to the cPickled dicts of all nucleotide
          sequences for each genome
        - gb_file: name of the reference to be analyzed
        - output_filename: file into which all amino acid sequences are
          written in fasta format. needed as input for diamond
        - output_filename2: RNA nucleotide_sequences are written in fasta
          format. Needed as RNA_blast_input
        - geneID_to_geneSeqID_dict: dictionary linking geneID to gene
          sequence ID modified in place (key: geneID; value: geneSeqID )
        - geneID_to_description_dict: dictionary linking geneID to
          description info modified in place (key: geneID; value: a dict
          with the keys 'geneName', 'contig' and 'annotation')
        - RNAID_to_SeqID_dict: dictionary linking RNAID to RNA sequence ID
          modified in place (key: RNAID; value: SeqID )
        - RNAID_to_description_dict: dictionary linking RNAID to description
          info modified in place (key: RNAID; value: a dict with the keys
          'geneName', 'contig' and 'annotation')
        - disable_RNA_clustering: not cluster rRNA and tRNA
          (default: 0 -> cluster RNAs)
    output:
        - <nucleotide_dict_path><strain>_gene_nuc_dict.cpk: {geneID: CDS
          nucleotide sequence}
        - <nucleotide_dict_path><strain>_RNA_nuc_dict.cpk: {RNAID: RNA
          nucleotide sequence}, only when RNAs are clustered
    Only CDS features having both a 'product' and a 'translation' qualifier
    and rRNA/tRNA features having a 'product' qualifier are used.
    '''
    reference_gb = '%s%s' % (each_gbk_path, gb_file)
    strainName = gb_file.split('.gbk')[0]
    gene_nuc_seq_dict = '%s%s_gene_nuc_dict.cpk' % (nucleotide_dict_path, strainName)
    gene_nucleotide_sequences = defaultdict()

    ## one flag for both the RNA set-up and the RNA branch below, so they
    ## can never disagree
    cluster_RNAs = not disable_RNA_clustering
    if cluster_RNAs:
        RNA_nuc_seq_dict = '%s%s_RNA_nuc_dict.cpk' % (nucleotide_dict_path, strainName)
        RNA_nucleotide_sequences = defaultdict()

    aa_sequence_file = open(output_filename, 'wb')
    RNA_sequence_file = None
    try:
        if cluster_RNAs:
            RNA_sequence_file = open(output_filename2, 'wb')

        for contig_index, contig in enumerate(SeqIO.parse(reference_gb, 'genbank'), 1):
            for feature in contig.features:
                if feature.type == 'CDS':
                    if 'product' in feature.qualifiers and 'translation' in feature.qualifiers:
                        if 'gene' in feature.qualifiers:
                            geneName = '%s' % (feature.qualifiers['gene'][0]).replace(' ', '_')
                        else:
                            geneName = ''
                        product = feature.qualifiers['product'][0]
                        annotation = '_'.join(product.split(' '))
                        trans_seq = feature.qualifiers['translation'][0]
                        locus_tag = _strip_locus_tag_prefixes(
                            feature.qualifiers['locus_tag'][0], strainName)
                        ## geneID is composed of strain_name and locus_tag
                        ## Keeping '|' separator is important, which is used later in orthAgogue.
                        geneID = '%s|%s' % (strainName, locus_tag)
                        write_in_fa(aa_sequence_file, geneID, trans_seq)
                        geneID_to_description_dict[geneID] = {
                            'geneName': geneName,
                            'contig': contig_index,
                            'annotation': annotation
                        }
                        ## in the geneSeqID a gene name (if any) precedes
                        ## the annotation, separated by '_'
                        if geneName != '':
                            geneName = '%s_' % geneName
                        geneID_to_geneSeqID_dict[geneID] = '%s|%s-%d-%s%s' % (
                            strainName, locus_tag, contig_index, geneName, annotation)
                        gene_nucleotide_sequences[geneID] = feature.extract(contig.seq)
                elif cluster_RNAs and (feature.type == 'rRNA' or feature.type == 'tRNA'):
                    if 'product' in feature.qualifiers:
                        geneName = ''
                        product = feature.qualifiers['product'][0]
                        annotation = '_'.join(product.split(' '))
                        locus_tag = _strip_locus_tag_prefixes(
                            feature.qualifiers['locus_tag'][0], strainName)
                        ## RNAID is composed of strain_name and locus_tag
                        ## Keeping '|' separator is important, which is used later in orthAgogue.
                        RNAID = '%s|%s' % (strainName, locus_tag)
                        RNA_seq = str(feature.extract(contig.seq))
                        write_in_fa(RNA_sequence_file, RNAID, RNA_seq)
                        ## RNAs carry no gene name
                        RNAID_to_description_dict[RNAID] = {
                            'geneName': '',
                            'contig': contig_index,
                            'annotation': annotation
                        }
                        RNAID_to_SeqID_dict[RNAID] = '%s|%s-%d-%s%s' % (
                            strainName, locus_tag, contig_index, geneName, annotation)
                        RNA_nucleotide_sequences[RNAID] = RNA_seq
    finally:
        ## close (and flush) both fasta files, also when parsing fails
        aa_sequence_file.close()
        if RNA_sequence_file is not None:
            RNA_sequence_file.close()

    write_pickle(gene_nuc_seq_dict, gene_nucleotide_sequences)
    if cluster_RNAs:
        write_pickle(RNA_nuc_seq_dict, RNA_nucleotide_sequences)