def build_representative_cluster(clustering_path, threads, input_prefix):
    """ build representative cluster """
    start = time.time()
    cluster_file = ''.join([clustering_path, input_prefix, '_cluster.output'])
    representative_outputfile = ''.join([clustering_path, input_prefix, '_representative', '.faa'])
    subproblem_seqs_path = '%ssubproblem_cluster_seqs/' % clustering_path
    subproblem_merged_faa = ''.join([clustering_path, input_prefix, '.faa'])
    subproblem_faa_dict = read_fasta(subproblem_merged_faa)
    subproblem_geneCluster_dt = {}
    subproblem_run_number = input_prefix.split('subproblem_')[1]
    ## open the representative file once instead of re-opening it for every cluster
    with open(cluster_file, 'rb') as cluster_input, \
         open(representative_outputfile, 'a') as representative_output:
        for gid, iline in enumerate(cluster_input):
            ## add a timestamp suffix to avoid clusterID conflicts
            clusterID = "GCs%s_%07d%s" % (subproblem_run_number, gid,
                                          time.strftime('%M%S', time.gmtime()))
            gene_ids = iline.rstrip().split('\t')
            subproblem_geneCluster_dt[clusterID] = gene_ids
            ## write the first member as the representative sequence
            representative_seq = subproblem_faa_dict[gene_ids[0]]
            write_in_fa(representative_output, clusterID, representative_seq)
    ## store subproblem_geneCluster_dt
    write_pickle(''.join([clustering_path, input_prefix, '_dicts.cpk']), subproblem_geneCluster_dt)
    print 'build representative clusters for', input_prefix, ': ', times(start), '\n'
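A minimal usage sketch (hypothetical paths; assumes the clustering directory already contains subproblem_1_cluster.output and the merged subproblem_1.faa, as the function expects):

## hypothetical example, not part of the pipeline itself
clustering_path = './data/protein_faa/diamond_matches/'
build_representative_cluster(clustering_path, threads=1, input_prefix='subproblem_1')
## the representative sequences can then be inspected with read_fasta
representatives = read_fasta(clustering_path + 'subproblem_1_representative.faa')
print(len(representatives))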
Example #2
def infer_presence_absence_associations(path, total_strains_count,
    min_strain_fraction_association, max_strain_fraction_association):
    from sf_geneCluster_align_makeTree import load_sorted_clusters
    from sf_coreTree_json import metadata_load
    metaFile = '%s%s' % (path, 'metainfo.tsv')
    data_description = '%s%s' % (path, 'meta_tidy.tsv')
    association_dict = defaultdict(dict)
    metadata = Metadata(metaFile, data_description)
    metadata_dict = metadata.to_dict()
    min_strains_association = total_strains_count*min_strain_fraction_association
    max_strains_association = total_strains_count*max_strain_fraction_association
    sorted_genelist = load_sorted_clusters(path)
    ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    # TODO fix vis
    tree = Phylo.read("%sgeneCluster/strain_tree.nwk"%(path), 'newick')
    assoc = PresenceAbsenceAssociation(tree, metadata_dict)
    for clusterID, gene in sorted_genelist:
        if min_strains_association < gene[-1] < max_strains_association:
            print(clusterID)
            gl = load_gain_loss(path, clusterID)
            for col, d in metadata.data_description.iterrows():
                if d['associate'] == 'yes':
                    if 'log_scale' in d and d['log_scale'] == 'yes':
                        t = lambda x: np.log(x)
                    else:
                        t = lambda x: x
                    assoc.set_gain_loss(gl)
                    score = assoc.calc_association_simple(d["meta_category"], transform=t)
                    if np.isinf(score):
                        association_dict[clusterID][d["meta_category"]] = 0.0
                    else:
                        association_dict[clusterID][d["meta_category"]] = np.abs(score)

    write_pickle("%s/presence_absence_association.cpk" % path, association_dict)
Example #4
def infer_branch_associations(path, total_strains_count, strain_fraction_branch_association):
    from sf_geneCluster_align_makeTree import load_sorted_clusters
    from sf_coreTree_json import metadata_load
    metaFile = '%s%s' % (path, 'metainfo.tsv')
    data_description = '%s%s' % (path, 'meta_tidy.tsv')
    association_dict = defaultdict(dict)
    metadata = Metadata(metaFile, data_description)
    metadata_dict = metadata.to_dict()

    sorted_genelist = load_sorted_clusters(path)
    ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    for clusterID, gene in sorted_genelist:
        if gene[-1] >= total_strains_count * strain_fraction_branch_association:
            print(clusterID)
            tree = Phylo.read("%s/geneCluster/%s.nwk" % (path, clusterID), 'newick')
            assoc = BranchAssociation(tree, metadata_dict)
            for col, d in metadata.data_description.iterrows():
                if d['associate'] == 'yes':
                    if 'log_scale' in d and d['log_scale'] == 'yes':
                        t = lambda x: np.log(x)
                    else:
                        t = lambda x: x
                    assoc.calc_up_down_averages(d["meta_category"], transform=t)
                    max_assoc = assoc.calc_significance()
                    association_dict[clusterID][d["meta_category"]] = max_assoc

    write_pickle("%s/branch_association.cpk" % path, association_dict)
def update_gene_cluster_with_RNA(path, diamond_RNACluster_dt, diamond_geneCluster_dt):
    ## update the pickled gene cluster file with RNA clusters
    cluster_path = path + 'protein_faa/diamond_matches/'
    diamond_geneCluster_dt.update(diamond_RNACluster_dt)
    write_pickle(cluster_path + 'allclusters_postprocessed.cpk', diamond_geneCluster_dt)
def update_diversity_cpk(path):
    ## write gene_diversity_Dt cpk file
    output_path = path + 'geneCluster/'
    with open(output_path + 'gene_diversity.txt', 'rb') as infile:
        diversity_dict = {}
        for iline in infile:  # avoid splitting each line twice
            clusterID, diversity = iline.rstrip().split('\t')[:2]
            diversity_dict[clusterID] = diversity
    write_pickle(output_path + 'gene_diversity.cpk', diversity_dict)
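For illustration, gene_diversity.txt is expected to hold clusterID<TAB>diversity per line (hypothetical values):

## hypothetical gene_diversity.txt:
## GC00000001    0.0021
## GC00000002    0.0154
update_diversity_cpk('./data/')  # writes geneCluster/gene_diversity.cpk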
def make_genepresence_alignment(path, disable_gain_loss,
                                merged_gain_loss_output):
    '''
    loop over all gene clusters and append 0/1 to strain specific
    string used as pseudo alignment of gene presence absence
    '''
    geneClusterPath = '%s%s' % (path, 'protein_fna/diamond_matches/')
    output_path = '%s%s' % (path, 'geneCluster/')

    ## load strain list and prepare for gene presence/absence
    strain_list = load_pickle('%s%s' % (path, 'strain_list.cpk'))
    set_totalStrain = set([istrain for istrain in strain_list])
    totalStrain = len(set_totalStrain)
    dt_strainGene = defaultdict(str)

    sorted_genelist = load_sorted_clusters(path)
    ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    for clusterID, gene in sorted_genelist:
        ## append 0/1 to each strain
        create_genePresence(dt_strainGene, totalStrain, set_totalStrain,
                            gene[1])

    with open('%s%s' % (output_path, 'genePresence.aln'),
              'wb') as presence_outfile:
        for istkey in dt_strainGene:
            write_in_fa(presence_outfile, istkey, dt_strainGene[istkey])
    write_pickle('%s%s' % (output_path, 'dt_genePresence.cpk'), dt_strainGene)

    if disable_gain_loss:
        geneEvents_dt = {i: 0 for i in range(len(sorted_genelist))}
        write_pickle('%s%s' % (output_path, 'dt_geneEvents.cpk'),
                     geneEvents_dt)
        if merged_gain_loss_output:
            gene_loss_fname = '%s%s' % (output_path, 'geneGainLossEvent.json')
            write_json(dt_strainGene, gene_loss_fname, indent=1)
        else:
            ## strainID as key, presence pattern as value (converted into np.array)
            keylist = dt_strainGene.keys()
            keylist.sort()
            strainID_keymap = {ind: k for ind, k in enumerate(keylist)}
            presence_arr = np.array([
                np.array(dt_strainGene[k], 'c') for k in keylist
            ])  # 0: present, 3: absent
            presence_arr[presence_arr == '1'] = '3'
            for ind, (clusterID, gene) in enumerate(sorted_genelist):
                pattern_dt = {
                    strainID_keymap[strain_ind]: str(patt)
                    for strain_ind, patt in enumerate(presence_arr[:, ind])
                }
                pattern_fname = '%s%s_patterns.json' % (output_path, clusterID)
                write_json(pattern_dt, pattern_fname, indent=1)
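The per-cluster pattern files recode each strain's 0/1 presence string via numpy character arrays, replacing '1' by '3'; a minimal sketch of that recoding (assuming the Python 2 numpy 'c' dtype behavior relied on above):

import numpy as np
## toy 0/1 presence strings, one per strain
patterns = ['0110', '0010']
arr = np.array([np.array(p, 'c') for p in patterns])
arr[arr == '1'] = '3'
print(arr[:, 1])  # recoded pattern column of the second cluster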
Example #9
def parse_RNACluster(path, inputfile):
    """ store RNA clusters as dictionary in cpk file """
    inputfile = "%s%s" % (path, inputfile)
    with open(inputfile, 'rb') as infile:
        RNACluster_dt = defaultdict(list)
        for gid, iline in enumerate(infile, 1):  ## format: NC_022226|1-1956082:1956435
            col = iline.rstrip().split('\t')
            clusterID = "RC%05d" % gid
            num_strains = len(set(ivg.split('|')[0] for ivg in col))
            num_RNAs = len(set(col))
            RNA_mem = list(col)
            RNACluster_dt[clusterID] = [num_strains, RNA_mem, num_RNAs]
    write_pickle(path + 'allclusters.cpk', RNACluster_dt)
def parse_geneCluster(input_fpath, output_fpath, cluster_log=False):
    """ store gene clusters as dictionary in cpk file """
    with open(input_fpath, 'rb') as infile:
        geneCluster_dt = defaultdict(list)
        for gid, iline in enumerate(infile, 1):  ## format: NC_022226|1-1956082:1956435
            col = iline.rstrip().split('\t')
            clusterID = "GC%08d" % gid
            num_strains = len(set(ivg.split('|')[0] for ivg in col))
            num_genes = len(set(col))
            gene_mem = list(col)
            geneCluster_dt[clusterID] = [num_strains, gene_mem, num_genes]
    write_pickle(output_fpath, geneCluster_dt)
    return geneCluster_dt
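A usage sketch for the two parsers (hypothetical file names; each input line is a tab-separated list of strain|locus identifiers, as the format comment notes):

## hypothetical example
geneCluster_dt = parse_geneCluster('./data/allclusters.tsv',
                                   './data/allclusters.cpk')
## each value is [num_strains, member_list, num_genes]
print(geneCluster_dt['GC00000001'])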
Example #11
def make_strain_list(self):
    """ make strainID list and harmonize input filenames """
    path = self.path
    folders_dict = self.folders_dict
    ## load input strains from all gbk or fasta files in self.path
    if self.gbk_present == 1:
        glob_item = '.gbk'
        gbk_path = folders_dict['gbk_path']
        glob_list = glob.glob('%s*%s' % (path, glob_item))
        if len(glob_list) != 0:
            harmonize_filename(path, glob_list)
            strain_list = [i.split('/')[-1].split(glob_item)[0]
                           for i in glob.iglob('%s*%s' % (path, glob_item))]
            ## move gbk files into folder input_GenBank
            command_organize_gbk_input = ' '.join(['mv', path + '*gbk', gbk_path])
            os.system(command_organize_gbk_input)
        else:
            gbk_glob = glob.iglob('%s*%s' % (gbk_path, glob_item))
            strain_list = [i.split('/')[-1].split(glob_item)[0] for i in gbk_glob]
    else:
        glob_item = '.faa'
        glob_list = glob.glob('%s*%s' % (path, glob_item))
        if len(glob_list) != 0:
            harmonize_filename(path, glob_list)
            strain_list = [i.split('/')[-1].split(glob_item)[0]
                           for i in glob.iglob('%s*%s' % (path, glob_item))]
        else:
            protein_glob = glob.iglob('%s*%s' % (folders_dict['protein_path'], glob_item))
            strain_list = [i.split('/')[-1].split(glob_item)[0] for i in protein_glob]
        command_organize_aa_input = 'mv %s*.faa %s' % (path, folders_dict['protein_path'])
        command_organize_nuc_input = 'mv %s*.fna %s' % (path, folders_dict['nucleotide_path'])
        os.system(command_organize_nuc_input)
        os.system(command_organize_aa_input)
    ## write the list of strains to a pickle file and store the list in self
    write_pickle(self.fpaths_dict['strain_cpk'], strain_list)
    self.strain_list = strain_list
    self.nstrains = len(strain_list)
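make_strain_list is a method (note the self argument); in context it would be called on the pipeline object after the input folders are prepared, roughly as sketched here (hypothetical object and class names):

## hypothetical sketch
## myPangenome = Pangenome(path='./data/', gbk_present=1, ...)
## myPangenome.make_strain_list()
## print(myPangenome.nstrains)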
def update_geneCluster_cpk(path, geneCluster_dt):
    ## update gene cluster pickled file
    cluster_path = path + 'protein_faa/diamond_matches/'
    write_pickle(cluster_path + 'allclusters_postprocessed.cpk', geneCluster_dt)
Example #20
def export_gain_loss(tree, path, merged_gain_loss_output):
    '''
    export per-branch gene gain/loss patterns and per-cluster event counts
    inferred on the strain tree
    '''
    # write final tree with internal node names as assigned by treetime
    sep = '/'
    output_path = sep.join([path.rstrip(sep), 'geneCluster/'])
    events_dict_path = sep.join([output_path, 'dt_geneEvents.cpk'])
    gene_pattern_dict_path = sep.join([output_path, 'dt_genePattern.cpk'])

    tree_fname = sep.join([output_path, 'strain_tree.nwk'])
    Phylo.write(tree.tree, tree_fname, 'newick')

    gene_gain_loss_dict = defaultdict(str)
    preorder_strain_list = []  #store the preorder nodes as strain list
    for node in tree.tree.find_clades(
            order='preorder'):  # order does not matter much here
        if node.up is None: continue
        #print(node.name ,len(node.geneevents),node.geneevents)
        gain_loss = [
            str(int(ancestral) * 2 + int(derived)) for ancestral, derived in
            zip(node.up.genepresence, node.genepresence)
        ]
        gene_gain_loss_dict[node.name] = "".join(gain_loss)
        preorder_strain_list.append(node.name)

    gain_loss_array = np.array(
        [[i for i in gain_loss_str]
         for gain_loss_str in gene_gain_loss_dict.values()],
        dtype=int)
    # 1 and 2 are codes for gain/loss events
    events_array = ((gain_loss_array == 1) |
                    (gain_loss_array == 2)).sum(axis=0)
    events_dict = {index: event for index, event in enumerate(events_array)}

    write_pickle(events_dict_path, events_dict)

    if merged_gain_loss_output:
        ## export gene loss dict to json for visualization
        #gene_loss_fname = sep.join([ output_path, 'geneGainLossEvent.json'])
        #write_json(gene_gain_loss_dict, gene_loss_fname, indent=1)
        write_pickle(gene_pattern_dict_path, gene_gain_loss_dict)
    else:
        ## strainID as key, presence pattern as value (converted into np.array)
        sorted_genelist = load_sorted_clusters(path)
        strainID_keymap = {
            ind: k
            for ind, k in enumerate(preorder_strain_list)
        }
        #presence_arr= np.array([ np.fromstring(gene_gain_loss_dict[k], np.int8)-48 for k in preorder_strain_list])
        presence_arr = np.array([
            np.array(gene_gain_loss_dict[k], 'c') for k in preorder_strain_list
        ])
        ## if true, write pattern dict instead of pattern string in a json file
        pattern_json_flag = False
        for ind, (clusterID, gene) in enumerate(sorted_genelist):
            pattern_fname = '%s%s_patterns.json' % (output_path, clusterID)
            if pattern_json_flag:
                pattern_dt = {
                    strainID_keymap[strain_ind]: str(patt)
                    for strain_ind, patt in enumerate(presence_arr[:, ind])
                }
                write_json(pattern_dt, pattern_fname, indent=1)
            #print(preorder_strain_list,clusterID)
            #print(''.join([ str(patt) for patt in presence_arr[:, ind]]))
            with open(pattern_fname, 'w') as write_pattern:
                write_pattern.write(
                    '{"patterns":"' +
                    ''.join([str(patt)
                             for patt in presence_arr[:, ind]]) + '"}')
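The per-branch code packs ancestral and derived presence into one digit (ancestral*2 + derived): 0 means absent on both ends of the branch, 1 a gain, 2 a loss, and 3 present on both ends; a minimal check of the encoding:

## encoding used above: ancestral*2 + derived
for ancestral, derived in [(0, 0), (0, 1), (1, 0), (1, 1)]:
    print(int(ancestral) * 2 + int(derived))  # 0: none, 1: gain, 2: loss, 3: kept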
def create_core_SNP_matrix(path,
                           core_cutoff=1.0,
                           core_gene_strain_fpath=''):  #1.0
    """ create SNP matrix using core gene SNPs
        input: strain_list.cpk, core_geneList.cpk
        output: SNP_whole_matrix.aln
        core_cutoff: percentage of strains used to decide whether a gene is core
            default: 1.0 (strictly core gene, which is present in all strains)
            customized: 0.9 ( soft core, considered as core if present in 90% of strains)
    """
    import os, sys, operator
    import numpy as np
    import numpy.ma as ma
    from collections import defaultdict
    from sf_miscellaneous import read_fasta, write_pickle, load_pickle, write_in_fa

    alnFilePath = '%s%s' % (path, 'geneCluster/')
    output_path = alnFilePath

    ## create core gene list
    corelist = []
    strain_list = load_pickle(path + 'strain_list.cpk')
    totalStrain = len(strain_list)
    sorted_geneList = load_sorted_clusters(path)
    if core_gene_strain_fpath != '':
        with open(core_gene_strain_fpath, 'rb') as core_gene_strain_file:
            core_strain_set = set(
                [i.rstrip().replace('-', '_') for i in core_gene_strain_file])
    with open(output_path + 'core_geneList.txt', 'wb') as outfile:
        for clusterID, vg in sorted_geneList:
            if core_cutoff == 1.0:
                strain_core_cutoff = totalStrain
            else:
                strain_core_cutoff = int(totalStrain * core_cutoff)
            if vg[0] == vg[2] and vg[0] >= strain_core_cutoff:
                coreGeneName = '%s%s' % (clusterID, '_na_aln.fa')
                ## sequences might be discarded because of premature stops
                coreGeneName_path = alnFilePath + coreGeneName
                if os.path.exists(coreGeneName_path) and len(
                        read_fasta(coreGeneName_path)) >= strain_core_cutoff:
                    if core_gene_strain_fpath != '' and len(
                            core_strain_set -
                            set([i.split('|')[0] for i in vg[1]])) != 0:
                        continue
                    outfile.write(coreGeneName + '\n')
                    corelist.append(coreGeneName)
                else:
                    #print '%s%s%s'%('warning: ',coreGeneName_path,' is not a core gene')
                    pass

        write_pickle(output_path + 'core_geneList.cpk', corelist)

    refSeqList = load_pickle(path + 'strain_list.cpk')
    refSeqList.sort()

    snp_fre_lst = []
    snp_wh_matrix_flag = 0
    snp_pos_dt = defaultdict(list)
    snp_whole_matrix = np.array([])
    snps_by_gene = []
    for align_file in corelist:  ## core genes
        nuc_array = np.array([])  # array to store nucleotides for each gene
        gene_seq_dt = read_fasta(alnFilePath + align_file)
        if core_cutoff != 1.0:
            # set sequences for missing gene (space*gene_length)
            missing_gene_seq = ' ' * len(gene_seq_dt.values()[0])
            totalStrain_sorted_lst = sorted(strain_list)
        # build strain_seq_dt from gene_seq_dt
        strain_seq_dt = defaultdict()
        for gene, seq in gene_seq_dt.iteritems():
            strain_seq_dt[gene.split('-')[0]] = seq  # strain-locus_tag-...
        strain_seq_sorted_lst = sorted(strain_seq_dt.items(),
                                       key=lambda x: x[0])

        start_flag = 0
        if core_cutoff == 1.0:
            for ka, va in strain_seq_sorted_lst:
                if start_flag == 0:
                    nuc_array = np.array(np.fromstring(va, dtype='S1'))
                    start_flag = 1
                else:
                    nuc_array = np.vstack(
                        (nuc_array, np.fromstring(va, dtype='S1')))
            ## find SNP positions
            position_polymorphic = np.any(nuc_array != nuc_array[0, :], axis=0)
            position_has_gap = np.any(nuc_array == '-', axis=0)
            position_SNP = position_polymorphic & (~position_has_gap)
            snp_columns = nuc_array[:, position_SNP]
            snp_pos_dt[align_file] = np.where(position_SNP)[0]
        else:
            ## add '-' for missing genes when dealing with soft core genes
            core_gene_strain = [gene for gene in strain_seq_dt.keys()]
            for strain in totalStrain_sorted_lst:
                if start_flag == 0:
                    if strain in core_gene_strain:
                        nuc_array = np.array(
                            np.fromstring(strain_seq_dt[strain], dtype='S1'))
                    else:
                        print 'Soft core gene: gene absent in strain %s on cluster %s' % (
                            strain, align_file)
                        nuc_array = np.array(
                            np.fromstring(missing_gene_seq, dtype='S1'))
                    start_flag = 1
                else:
                    if strain in core_gene_strain:
                        nuc_array = np.vstack(
                            (nuc_array,
                             np.fromstring(strain_seq_dt[strain], dtype='S1')))
                    else:
                        print 'Soft core gene: gene absent in strain %s on cluster %s' % (
                            strain, align_file)
                        nuc_array = np.vstack((nuc_array,
                                               np.fromstring(missing_gene_seq,
                                                             dtype='S1')))
            ## find SNP positions
            ## mask missing genes -- determine rows that have ' ' in every column
            is_missing = np.all(nuc_array == ' ', axis=1)
            masked_non_missing_array = np.ma.masked_array(
                nuc_array, nuc_array == ' ')
            position_polymorphic = np.any(
                masked_non_missing_array != masked_non_missing_array[0, :],
                axis=0)
            position_has_gap = np.any(masked_non_missing_array == '-', axis=0)
            position_SNP = position_polymorphic & (~position_has_gap)
            ## replace rows of missing genes (all ' ') by gap characters before extracting SNP columns
            if is_missing.sum() > 0:  # with missing genes
                nuc_array[is_missing] = '-'
            snp_columns = nuc_array[:, position_SNP]
            snp_pos_dt[align_file] = np.where(position_SNP)[0]
            #print snp_columns

        if snp_wh_matrix_flag == 0:
            snp_whole_matrix = snp_columns
            snp_wh_matrix_flag = 1
        else:
            snp_whole_matrix = np.hstack((snp_whole_matrix, snp_columns))
    write_pickle(output_path + 'snp_pos.cpk', snp_pos_dt)

    with open(output_path + 'SNP_whole_matrix.aln', 'wb') as outfile:
        for ind, isw in enumerate(snp_whole_matrix):
            write_in_fa(outfile, refSeqList[ind], isw.tostring())
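SNP columns are the alignment positions that vary between strains and contain no gap; a minimal sketch of the masking logic on a toy alignment:

import numpy as np
## toy alignment: 3 strains x 4 positions
toy = np.array([list('ACGT'), list('ACGA'), list('AC-T')])
position_polymorphic = np.any(toy != toy[0, :], axis=0)
position_has_gap = np.any(toy == '-', axis=0)
position_SNP = position_polymorphic & (~position_has_gap)
print(toy[:, position_SNP])  # only the last column qualifies as a SNP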
def extract_sequences(path, strain_list, folders_dict, gbk_present, enable_RNA_clustering):
    '''
        go through all GenBank files and extract sequences and metadata for each one
    '''
    gbk_path= folders_dict['gbk_path']
    protein_path= folders_dict['protein_path']
    nucleotide_path= folders_dict['nucleotide_path']
    RNA_path= folders_dict['RNA_path']

    geneID_to_geneSeqID_file= '%sgeneID_to_geneSeqID.cpk'%path
    geneID_to_description_file= '%sgeneID_to_description.cpk'%path
    RNAID_to_SeqID_file= '%sRNAID_to_SeqID.cpk'%path
    RNAID_to_description_file= '%sRNAID_to_description.cpk'%path

    protein_dict_path= '%s%s'%(protein_path,'all_protein_seq.cpk')
    nucleotide_dict_path= '%s%s'%(nucleotide_path,'all_nucleotide_seq.cpk')
    RNA_dict_path= '%s%s'%(RNA_path,'all_RNA_seq.cpk')

    geneID_to_geneSeqID_dict= defaultdict()
    geneID_to_description_dict= defaultdict()
    RNAID_to_SeqID_dict= defaultdict()
    RNAID_to_description_dict= defaultdict()
    gene_aa_dict= defaultdict(dict)
    gene_na_dict= defaultdict(dict)
    RNA_dict= defaultdict(dict)

    if gbk_present:
        ## clean up folder when data from previous run exist.
        os.system('rm -rf '+protein_path+'*.faa')
        os.system('rm -rf '+nucleotide_path+'*.fna')
        missing_CDS_list=[] ## a list containing strains which have no CDS (if any)
        ## process gbk file
        for strainID in strain_list:
            gbk_fname= ''.join([gbk_path,strainID,'.gbk'])
            protein_fname= ''.join([protein_path,strainID,'.faa'])
            nucleotide_fname= ''.join([nucleotide_path,strainID,'.fna'])
            RNA_fname= ''.join([RNA_path,strainID,'.fna'])
            check_CDS_passed= gbk_translation(strainID, gbk_fname, protein_fname, nucleotide_fname, RNA_fname,
                geneID_to_geneSeqID_dict,geneID_to_description_dict,
                RNAID_to_SeqID_dict, RNAID_to_description_dict,
                gene_aa_dict, gene_na_dict, RNA_dict, enable_RNA_clustering)
            if not check_CDS_passed:
                missing_CDS_list.append(strainID)
        if len(missing_CDS_list)!=0:
            print 'Warning: no CDS found in the following genome/genomes, please double-check\n', missing_CDS_list
            exit()
    else:
        ## process fna/faa files if gbk files are not given.
        for strainID in strain_list:
            ## amino acid sequences
            protein_fname=''.join([protein_path,strainID,'.faa'])
            nucleotide_fname=''.join([nucleotide_path,strainID,'.fna'])
            aa_sequence_dt=read_fasta(protein_fname)
            na_sequence_dt=read_fasta(nucleotide_fname)
            ## prepare geneSeqID and description
            for geneID in aa_sequence_dt.keys():
                geneName, annotation= '',''
                geneID_to_geneSeqID_dict[geneID]=geneID
                geneID_to_description_dict[geneID]={'geneName': geneName,
                                                    'annotation': annotation}
                gene_aa_dict[strainID][geneID]=aa_sequence_dt[geneID]
                gene_na_dict[strainID][geneID]=na_sequence_dt[geneID]
    write_pickle(geneID_to_geneSeqID_file, geneID_to_geneSeqID_dict)
    write_pickle(geneID_to_description_file, geneID_to_description_dict)
    write_pickle(protein_dict_path,gene_aa_dict)
    write_pickle(nucleotide_dict_path,gene_na_dict)
    ## option: process RNA sequences for RNA_clustering
    if enable_RNA_clustering:
        write_pickle(RNA_dict_path,RNA_dict)
        write_pickle(RNAID_to_SeqID_file, RNAID_to_SeqID_dict)
        write_pickle(RNAID_to_description_file, RNAID_to_description_dict)
    return gene_aa_dict, gene_na_dict
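A call sketch (hypothetical folders_dict; the folder layout is normally created by the pipeline setup step):

## hypothetical example
folders_dict = {'gbk_path': './data/input_GenBank/',
                'protein_path': './data/protein_faa/',
                'nucleotide_path': './data/nucleotide_fna/',
                'RNA_path': './data/RNA_fna/'}
gene_aa_dict, gene_na_dict = extract_sequences('./data/', ['NC_022226'],
    folders_dict, gbk_present=1, enable_RNA_clustering=False)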
def output_cutted_clusters(file_path,
                           uncluster_filename,
                           gene_list,
                           cut_branch_threshold,
                           treefile_used=None,
                           cut_leftover=None):
    """
    delete the unclustered file and create new clusters
    params:
        gene_list: lists containing the genes in the new split clusters
        geneCluster_dt: cluster dictionary to be updated
        cut_leftover: flag to indicate whether there are the leftover nodes
            after cutting long branches. Default: empty.
    """
    clusterID = uncluster_filename.replace('.fna', '')
    origin_uncluster_nu_fa = uncluster_filename
    origin_uncluster_aa_fa = uncluster_filename.replace('fna', 'faa')

    new_fa_files = set()

    ## load origin cluster fa files
    origin_nu_fa_dt = read_fasta(file_path + origin_uncluster_nu_fa)
    origin_aa_fa_dt = read_fasta(file_path + origin_uncluster_aa_fa)

    ## split_gene_list has geneSeqID instead of geneID
    for sgs_index, split_gene_list in enumerate(gene_list, 1):
        if cut_leftover == True:
            ## newClusterId for the rest genes (_r as identifier)
            newClusterId = "%s_r%s" % (clusterID, sgs_index)
        else:
            newClusterId = "%s_%s" % (clusterID, sgs_index)

        #=============================================
        ## write new divided/split cluster files
        gene_cluster_nu_filename = "%s%s" % (newClusterId, '.fna')
        gene_cluster_nu_filepath = file_path + gene_cluster_nu_filename
        gene_cluster_nu_write = open(gene_cluster_nu_filepath, 'wb')

        gene_cluster_aa_filename = "%s%s" % (newClusterId, '.faa')
        gene_cluster_aa_filepath = file_path + gene_cluster_aa_filename
        gene_cluster_aa_write = open(file_path + gene_cluster_aa_filename,
                                     'wb')

        for gene_memb in split_gene_list:
            if "\\'" in gene_memb:  # Replace '\' in node name:
                ## NC_018495|CM9_RS01675-1-guanosine-3',5'-... in fasta ID
                ## 'NC_018495|CM9_RS01675-1-guanosine-3\',5\'-...' in nwk node name
                ## Use origin_nu_fa_dt[gene_memb] will throw the KeyError:
                ## "NC_018495|CM9_RS01675-1-guanosine-3\\',5\\'"
                gene_memb = gene_memb.replace("\\'", "'")

            write_in_fa(gene_cluster_nu_write, gene_memb,
                        origin_nu_fa_dt[gene_memb])
            write_in_fa(gene_cluster_aa_write, gene_memb,
                        origin_aa_fa_dt[gene_memb])
        gene_cluster_nu_write.close()
        gene_cluster_aa_write.close()
        #=============================================

        if cut_leftover == True:
            ## align the leftover genes, build a tree, and cut long branches until nothing more can be cut
            cutTree_outputCluster([gene_cluster_nu_filepath], file_path,
                                  cut_branch_threshold, treefile_used)
        else:
            ## record the misclusters to be deleted (already addressed in cutTree_outputCluster )
            ## it will output the same cluster several times
            #with open(file_path+'old_clusters_longSplit.txt', 'a') as delete_cluster_file:
            #    delete_cluster_file.write('%s\n'%uncluster_filename)

            ## add record in new_clusters_longSplit.txt, which is used for align new clusters
            new_fa_files.add(gene_cluster_nu_filepath)

            ## write cluster statistics in folder update_long_branch_splits
            addin_geneCluster_dt = defaultdict(list)
            addin_geneCluster_dt[newClusterId] = [0, [], 0]
            ## num_strains
            addin_geneCluster_dt[newClusterId][0] = len(
                dict(Counter([ig.split('|')[0]
                              for ig in split_gene_list])).keys())
            ## num_genes
            addin_geneCluster_dt[newClusterId][2] = len(
                dict(Counter([ig for ig in split_gene_list])).keys())
            ## gene members
            addin_geneCluster_dt[newClusterId][1] = [
                ig.split('-')[0] for ig in split_gene_list
            ]
            ## cPickle new cluster statistics
            write_pickle(
                ''.join([
                    file_path, 'update_long_branch_splits/', newClusterId,
                    '.cpk'
                ]), addin_geneCluster_dt)

    ## write records in gene_diversity file
    with open(file_path + 'new_clusters_longSplit.txt',
              'a') as refined_cluster_file:
        for i in new_fa_files:
            refined_cluster_file.write('%s\n' % i)
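Split clusters inherit the parent clusterID plus an index, with _r marking leftover groups; a minimal illustration of the naming scheme:

## naming scheme used above (hypothetical parent cluster)
clusterID = 'GC00001136'
for sgs_index in [1, 2]:
    print("%s_%s" % (clusterID, sgs_index))   # regular split: GC00001136_1, ...
    print("%s_r%s" % (clusterID, sgs_index))  # leftover split: GC00001136_r1, ...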
def estimate_core_gene_diversity(path, folders_dict, strain_list, parallel,
                                 core_cutoff, factor_core_diversity, species):
    """
    estimate core gene diversity before gene cluster alignment
    and cluster post-processing
    """
    totalStrain = len(strain_list)

    ## load clusters
    clustering_path = folders_dict['clustering_path']
    geneCluster_dt = load_pickle(clustering_path + 'allclusters.cpk')
    protein_path = folders_dict['protein_path']
    nucleotide_path = folders_dict['nucleotide_path']
    protein_dict_path = '%s%s' % (protein_path, 'all_protein_seq.cpk')
    nucleotide_dict_path = '%s%s' % (nucleotide_path, 'all_nucleotide_seq.cpk')
    tmp_core_seq_path = '%s%s' % (clustering_path, 'tmp_core/')
    ## load geneID_to_geneSeqID geneSeqID cpk file
    geneID_to_geneSeqID_dict = load_pickle(path + 'geneID_to_geneSeqID.cpk')

    ## create core gene list
    core_geneCluster_dt = defaultdict()
    # geneCluster_dt: {clusterID: [count_strains, [memb1,...], count_genes]}
    for clusterID, cluster_stats in geneCluster_dt.iteritems():
        if core_cutoff == 1.0:
            strain_core_cutoff = totalStrain
        else:
            strain_core_cutoff = int(totalStrain * core_cutoff)
        ## check whether #genes == #strains, i.e. a core/soft-core gene
        if cluster_stats[0] == cluster_stats[2] and cluster_stats[0] >= strain_core_cutoff:
            core_geneCluster_dt[clusterID] = cluster_stats
    if os.path.exists(tmp_core_seq_path):
        os.system(''.join(['rm -rf ', tmp_core_seq_path]))
    os.system('mkdir %s' % tmp_core_seq_path)

    ## disabled block: rebuild the dicts storing each gene's translation and nucleotide sequence
    if 0:
        gene_aa_dict = defaultdict(dict)
        for accession_id in strain_list:
            gene_aa_dict[accession_id] = read_fasta(''.join(
                [protein_path, accession_id, '.faa']))
        write_pickle(protein_dict_path, gene_aa_dict)

        ## create dict for all gene's nucleotide sequence
        gene_na_dict = defaultdict(dict)
        for accession_id in strain_list:
            gene_na_dict[accession_id] = read_fasta(''.join(
                [nucleotide_path, accession_id, '.fna']))
        write_pickle(nucleotide_dict_path, gene_na_dict)

    gene_aa_dict = load_pickle(protein_dict_path)
    gene_na_dict = load_pickle(nucleotide_dict_path)

    ## write nucleotide and amino-acid sequences for each gene cluster
    export_cluster_seq_tmp(tmp_core_seq_path, core_geneCluster_dt,
                           geneID_to_geneSeqID_dict, gene_na_dict,
                           gene_aa_dict)

    tmp_fa_files = glob.glob(tmp_core_seq_path + "*.fna")
    multips(calculate_diversity, parallel, tmp_fa_files, tmp_core_seq_path,
            species)

    calculated_core_diversity = tmp_average_core_diversity(tmp_core_seq_path)
    refined_core_diversity = round(
        (0.1 + factor_core_diversity * calculated_core_diversity) /
        (1 + factor_core_diversity * calculated_core_diversity), 4)
    print('factor used: ' + str(factor_core_diversity))
    print('average core genome diversity: ' + str(calculated_core_diversity))
    print(
        'defined core genome diversity cutoff for splitting long branches: ' +
        str(refined_core_diversity))

    ## move folder tmp_core to the central data folder
    new_clustering_path = '%stmp_core' % path
    if os.path.exists(new_clustering_path):
        os.system(''.join(['rm -r ', new_clustering_path]))
    os.system('mv %s %s' % (tmp_core_seq_path, path))
    return calculated_core_diversity, refined_core_diversity
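The refined cutoff follows (0.1 + f*d) / (1 + f*d), which equals 0.1 for zero diversity and saturates toward 1 as f*d grows; a quick numeric illustration with a hypothetical factor:

## refined_core_diversity = (0.1 + f*d) / (1 + f*d)
f = 2.0  # hypothetical factor_core_diversity
for d in [0.0, 0.01, 0.05]:
    print(round((0.1 + f * d) / (1 + f * d), 4))  # 0.1, 0.1176, 0.1818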