def create_RNACluster_fa(path, folders_dict):
    """
        input: '.fna', '_RNA_nuc_dict.cpk', 'allclusters.cpk'
        output: '.aln', 'tree.json', etc
    """
    RNA_path = folders_dict['RNA_path']
    RNA_dict = load_pickle('%s%s' % (RNA_path, 'all_RNA_seq.cpk'))

    ## load RNA cluster cpk file
    diamond_RNACluster_dt = load_pickle(RNA_path + 'allclusters.cpk')

    ## load RNAID_to_RNASeqID RNASeqID cpk file
    RNAID_to_RNASeqID_dict = load_pickle(path + 'RNAID_to_SeqID.cpk')

    ## create cluster-RNAs fasta files (By default: put RNAs in geneCluster folder)
    fasta_path = path + 'geneCluster/'
    ## diamond_RNACluster_dt: {clusterID: [count_strains, [memb1,...], count_RNAs]}
    for clusterID, RNA in diamond_RNACluster_dt.iteritems():
        ## RNACluster file name
        RNA_cluster_nu_filename = "%s%s" % (clusterID, '.fna')
        RNA_cluster_nu_write = open(fasta_path + RNA_cluster_nu_filename, 'wb')
        ## write nucleotide sequences into RNACluster files
        for RNA_memb in RNA[1]:
            ## RNA_name format: strain_1|locusTag
            strain_name = RNA_memb.split('|')[0]
            RNA_memb_seq = str(RNA_dict[strain_name][RNA_memb])
            RNASeqID = RNAID_to_RNASeqID_dict[RNA_memb]
            write_in_fa(RNA_cluster_nu_write, RNASeqID, RNA_memb_seq)
        RNA_cluster_nu_write.close()
    return diamond_RNACluster_dt
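## The helper write_in_fa comes from panX's sf_miscellaneous module; a minimal
## sketch of what it is assumed to do (two-line fasta records), inferred only
## from how it is called above -- the real helper may differ in details:
def write_in_fa_sketch(write_handle, seq_id, sequence):
    ## one fasta record: '>ID' header line followed by the sequence line
    write_handle.write('>%s\n%s\n' % (seq_id, sequence))

## Toy value showing the expected cluster-dict shape consumed above,
## {clusterID: [count_strains, [member1, ...], count_RNAs]}:
example_RNACluster_dt = {'GC_RC00001': [2, ['strain_1|rRNA_01', 'strain_2|rRNA_07'], 2]}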
def create_geneCluster_fa(path,folders_dict):
    """ dict storing amino_acid Id/Seq from '.faa' files
        input: '.faa', '_gene_nuc_dict.cpk', 'allclusters.cpk'
        output:
    """
    ## make sure the geneCluster folder is empty
    os.system('rm -rf %s'%(path+'geneCluster/'))

    clustering_path= folders_dict['clustering_path']
    geneCluster_dt= load_pickle(clustering_path+'allclusters.cpk')
    protein_path= folders_dict['protein_path']
    nucleotide_path= folders_dict['nucleotide_path']

    geneID_to_geneSeqID_dict=load_pickle(path+'geneID_to_geneSeqID.cpk')
    gene_aa_dict= load_pickle('%s%s'%(protein_path,'all_protein_seq.cpk'))
    gene_na_dict= load_pickle('%s%s'%(nucleotide_path,'all_nucleotide_seq.cpk'))

    ## create cluster-genes fasta files
    cluster_seqs_path=path+'geneCluster/'
    os.system('mkdir '+cluster_seqs_path)

    ## write nuc/aa sequences for each cluster
    for clusterID, gene in geneCluster_dt.iteritems():
        ## geneCluster file name
        gene_cluster_nu_filename="%s%s"%(clusterID,'.fna')
        gene_cluster_aa_filename="%s%s"%(clusterID,'.faa')
        with open( cluster_seqs_path+gene_cluster_nu_filename, 'wb') as gene_cluster_nu_write, \
            open( cluster_seqs_path+gene_cluster_aa_filename, 'wb') as gene_cluster_aa_write:
            ## write nucleotide/amino_acid sequences into geneCluster files
            for gene_memb in gene[1]:
                ## gene_name format: strain_1|locusTag
                strain_name= gene_memb.split('|')[0]
                geneSeqID=geneID_to_geneSeqID_dict[gene_memb]
                write_in_fa(gene_cluster_nu_write, geneSeqID, gene_na_dict[strain_name][gene_memb] )
                write_in_fa(gene_cluster_aa_write, geneSeqID, gene_aa_dict[strain_name][gene_memb])
def RNAclusters_align_makeTree(path, folders_dict, parallel, simple_tree):
    """
    create RNA clusters as nucleotide fasta files
    and build individual RNA trees based on fna files
    """

    diamond_RNACluster_dt = create_RNACluster_fa(path, folders_dict)

    ## align, build_tree, make_RNATree_json
    fasta_path = path + 'geneCluster/'
    fa_files = glob.glob(fasta_path + "*RC*.fna")
    multips(single_RNACluster_align_and_makeTree, parallel, fa_files,
            fasta_path, simple_tree)
    ## add RNA clusters to diamond_geneCluster_dt
    ### load gene cluster
    geneClusterPath = '%s%s' % (path, 'protein_faa/diamond_matches/')
    os.system(
        'cp %sallclusters_postprocessed.cpk %sallclusters_postprocessed.cpk.bk'
        % (geneClusterPath, geneClusterPath))
    diamond_geneCluster_dt = load_pickle(geneClusterPath +
                                         'allclusters_postprocessed.cpk')
    ### update gene cluster with RNA cluster
    update_gene_cluster_with_RNA(path, diamond_RNACluster_dt,
                                 diamond_geneCluster_dt)
    ### update diversity file
    update_diversity_cpk(path)
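## multips (another sf_miscellaneous helper) fans a worker function out over a
## list of files. A minimal sketch of that pattern, assuming only that the
## worker accepts one file plus the trailing arguments; the real panX helper
## may partition the file list differently:
import multiprocessing

def multips_sketch(worker, n_procs, file_list, *extra_args):
    ## run worker(ifile, *extra_args) for every file, n_procs at a time
    pool = multiprocessing.Pool(processes=n_procs)
    for ifile in file_list:
        pool.apply_async(worker, (ifile,) + extra_args)
    pool.close()
    pool.join()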
def postprocess_paralogs_iterative(parallel,
                                   path,
                                   nstrains,
                                   simple_tree,
                                   paralog_branch_cutoff,
                                   disable_long_branch_splitting,
                                   paralog_frac_cutoff=0.3,
                                   plot=0):

    cluster_path = path + 'protein_faa/diamond_matches/'
    clusters_need_split = 'allclusters_postprocessed.cpk' if not disable_long_branch_splitting else 'allclusters.cpk'
    geneCluster_dt = load_pickle(cluster_path + clusters_need_split)
    ## folder that contains old split clusters in paralog splitting step
    geneClusters_fpath = path + 'geneCluster/'
    os.system('mkdir ' + geneClusters_fpath + 'paralog_splits/')
    if os.path.exists(''.join(
        [geneClusters_fpath, 'old_clusters_paralogSplit.txt'])):
        os.system(''.join(
            ['rm ', geneClusters_fpath, 'old_clusters_paralogSplit.txt']))

    split_result = postprocess_paralogs(
        parallel,
        path,
        nstrains,
        simple_tree,
        geneCluster_dt,
        set(),
        paralog_branch_cutoff=paralog_branch_cutoff,
        paralog_frac_cutoff=paralog_frac_cutoff,
        plot=0)
    n_split_clusters, new_fa_files_set = split_result
    iteration = 0
    while (n_split_clusters):
        print '---- split a total of', n_split_clusters, 'clusters in iteration', iteration
        split_result = postprocess_paralogs(
            parallel,
            path,
            nstrains,
            simple_tree,
            geneCluster_dt,
            new_fa_files_set,
            paralog_branch_cutoff=paralog_branch_cutoff,
            paralog_frac_cutoff=paralog_frac_cutoff,
            plot=plot)
        n_split_clusters, new_fa_files_set = split_result
        iteration += 1

    ## write gene_diversity_Dt cpk file
    update_diversity_cpk(path)

    ## remove old gene cluster and create new split cluster
    update_geneCluster_cpk(path, geneCluster_dt)

    if os.path.exists(''.join(
        [geneClusters_fpath, 'old_clusters_paralogSplit.txt'])):
        with open(geneClusters_fpath + 'old_clusters_paralogSplit.txt',
                  'r') as delete_cluster_file:
            deleted_file_count = len([clus for clus in delete_cluster_file])
            print '#clusters split during paralogy checking:', deleted_file_count
def update_geneCluster_dt(path,geneCluster_dt):
    """
    add new cluster statistics in folder update_long_branch_splits
    geneCluster_dt: geneCluster dict to be updated
    """
    update_long_branch_splits=''.join([path,'geneCluster/update_long_branch_splits/'])
    for ifile in glob.iglob(update_long_branch_splits+'*.cpk'):
        for k,v in load_pickle(ifile).iteritems():
            #print('adding newly split clusters %s'%k)
            geneCluster_dt[k] = v
def load_sorted_clusters(path):
    '''
    load gene clusters and sort first by strain count (descending),
    then by gene count (ascending)
    '''
    geneClusterPath='%s%s'%(path,'protein_faa/diamond_matches/')
    geneCluster_dt=load_pickle(geneClusterPath+'allclusters_postprocessed.cpk')
    from operator import itemgetter
    # sort by decreasing strain count (-v[0], minus to achieve decreasing)
    # followed by increasing gene count (v[2])
    return sorted(geneCluster_dt.iteritems(),
                key=lambda (k,v): (-itemgetter(0)(v),itemgetter(2)(v)), reverse=False)
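## A toy run of the sort key above: clusters are ordered by decreasing strain
## count (v[0]); ties are broken by increasing gene count (v[2]):
toy_clusters = {'GC1': [3, ['m1', 'm2', 'm3', 'm4', 'm5'], 5],
                'GC2': [3, ['m1', 'm2', 'm3'], 3],
                'GC3': [5, ['m1', 'm2', 'm3', 'm4', 'm5'], 5]}
sorted_toy = sorted(toy_clusters.items(), key=lambda kv: (-kv[1][0], kv[1][2]))
print [k for k, v in sorted_toy]  # ['GC3', 'GC2', 'GC1']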
def make_genepresence_alignment(path, disable_gain_loss,
                                merged_gain_loss_output):
    '''
    loop over all gene clusters and append 0/1 to strain specific
    string used as pseudo alignment of gene presence absence
    '''
    geneClusterPath = '%s%s' % (path, 'protein_faa/diamond_matches/')  ## not used below
    output_path = '%s%s' % (path, 'geneCluster/')

    ## load strain list and prepare for gene presence/absence
    strain_list = load_pickle('%s%s' % (path, 'strain_list.cpk'))
    set_totalStrain = set([istrain for istrain in strain_list])
    totalStrain = len(set_totalStrain)
    dt_strainGene = defaultdict(str)

    sorted_genelist = load_sorted_clusters(path)
    ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    for clusterID, gene in sorted_genelist:
        ## append 0/1 to each strain
        create_genePresence(dt_strainGene, totalStrain, set_totalStrain,
                            gene[1])

    with open('%s%s' % (output_path, 'genePresence.aln'),
              'wb') as presence_outfile:
        for istkey in dt_strainGene:
            write_in_fa(presence_outfile, istkey, dt_strainGene[istkey])
    write_pickle('%s%s' % (output_path, 'dt_genePresence.cpk'), dt_strainGene)

    if disable_gain_loss:
        geneEvents_dt = {i: 0 for i in range(len(sorted_genelist))}
        write_pickle('%s%s' % (output_path, 'dt_geneEvents.cpk'),
                     geneEvents_dt)
        if merged_gain_loss_output:
            gene_loss_fname = '%s%s' % (output_path, 'geneGainLossEvent.json')
            write_json(dt_strainGene, gene_loss_fname, indent=1)
        else:
            ## strainID as key, presence pattern as value (converted into np.array)
            keylist = dt_strainGene.keys()
            keylist.sort()
            strainID_keymap = {ind: k
                               for ind, k in enumerate(keylist)
                               }  # map row index -> strainID
            presence_arr = np.array([
                np.array(dt_strainGene[k], 'c') for k in keylist
            ])  # after the remapping below: '0' absent, '3' present
            presence_arr[presence_arr == '1'] = '3'
            for ind, (clusterID, gene) in enumerate(sorted_genelist):
                pattern_dt = {
                    strainID_keymap[strain_ind]: str(patt)
                    for strain_ind, patt in enumerate(presence_arr[:, ind])
                }
                pattern_fname = '%s%s_patterns.json' % (output_path, clusterID)
                write_json(pattern_dt, pattern_fname, indent=1)
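## create_genePresence is assumed to append one '1'/'0' character per cluster
## to each strain's growing presence string; a self-contained sketch of that
## pseudo-alignment step on toy data (the helper's exact signature is an
## assumption inferred from the call above):
from collections import defaultdict

def create_genePresence_sketch(dt_strainGene, set_totalStrain, gene_members):
    strains_with_gene = set(memb.split('|')[0] for memb in gene_members)
    for istrain in set_totalStrain:
        dt_strainGene[istrain] += '1' if istrain in strains_with_gene else '0'

toy_dt = defaultdict(str)
toy_strains = set(['strain_1', 'strain_2'])
create_genePresence_sketch(toy_dt, toy_strains, ['strain_1|tag_01'])
create_genePresence_sketch(toy_dt, toy_strains, ['strain_1|tag_02', 'strain_2|tag_09'])
print dict(toy_dt)  # {'strain_1': '11', 'strain_2': '01'}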
def integrate_clusters(clustering_path, cluster_fpath):
    """ integrate all clusters """
    ## representative ID as key, original gene IDs as value
    representative_to_origin_dict=defaultdict()
    for idict in glob.iglob(clustering_path+"*_dicts.cpk"):
        subproblem_run_number=idict.split('/')[-1].split('subproblem_')[1].split('_')[0]
        representative_to_origin_dict[subproblem_run_number]=load_pickle(idict)
    with open('%s%s'%(clustering_path,'subproblem_finalRound_cluster.output')) as finalRound_cluster,\
        open(cluster_fpath,'wb') as integrated_cluster:
        for iline in finalRound_cluster:
            ## expand each representative ID on the line back to its original gene IDs
            geneIDs=[geneID
                for representativeID in iline.rstrip().split('\t')
                for geneID in representative_to_origin_dict[
                    representativeID.split('GCs')[1].split('_')[0]][representativeID]]
            integrated_cluster.write('%s\n'%'\t'.join(geneIDs))
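## Toy illustration of the representative-ID bookkeeping above: an ID such as
## 'GCs001_00042' encodes the subproblem run number ('001'), which selects the
## per-subproblem dict mapping that representative back to its original genes
## (the exact ID layout is inferred from the string parsing in the function):
representativeID = 'GCs001_00042'
print representativeID.split('GCs')[1].split('_')[0]  # '001'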
    def __init__(self, **kwargs):
        for k, v in kwargs.iteritems():
            setattr(self, k, v)
            # TODO: self.params_dict[k]=v
        self.folders_dict=defaultdict( str,
            gbk_path='input_GenBank/',
            nucleotide_path='nucleotide_fna/',
            protein_path='protein_faa/',
            clustering_path='protein_faa/diamond_matches/',
            RNA_path='RNA_fna/',
            cluster_seq_path='geneCluster/',
            tmp_core_seq_path='tmp_core/',
            vis_json_path='vis/',
            vis_cluster_path='vis/geneCluster/',
            log_path='log/')

        # set up folder structure and files names
        self.organize_folders()
        self.specify_filepath()
        if os.path.exists(self.fpaths_dict['strain_cpk']):
            self.strain_list=load_pickle(self.fpaths_dict['strain_cpk'])
            self.nstrains=len(self.strain_list)
def create_core_SNP_matrix(path,
                           core_cutoff=1.0,
                           core_gene_strain_fpath=''):
    """ create SNP matrix using core gene SNPs
        input: strain_list.cpk, core_geneList.cpk
        output: SNP_whole_matrix.aln
        core_cutoff: percentage of strains used to decide whether a gene is core
            default: 1.0 (strictly core gene, which is present in all strains)
            customized: 0.9 ( soft core, considered as core if present in 90% of strains)
    """
    import os, sys, operator
    import numpy as np
    import numpy.ma as ma
    from collections import defaultdict
    from sf_miscellaneous import read_fasta, write_pickle, load_pickle, write_in_fa

    alnFilePath = '%s%s' % (path, 'geneCluster/')
    output_path = alnFilePath

    ## create core gene list
    corelist = []
    strain_list = load_pickle(path + 'strain_list.cpk')
    totalStrain = len(strain_list)
    sorted_geneList = load_sorted_clusters(path)
    if core_gene_strain_fpath != '':
        with open(core_gene_strain_fpath, 'rb') as core_gene_strain_file:
            core_strain_set = set(
                [i.rstrip().replace('-', '_') for i in core_gene_strain_file])
    with open(output_path + 'core_geneList.txt', 'wb') as outfile:
        for clusterID, vg in sorted_geneList:
            if core_cutoff == 1.0:
                strain_core_cutoff = totalStrain
            else:
                strain_core_cutoff = int(totalStrain * core_cutoff)
            if vg[0] == vg[2] and vg[0] >= strain_core_cutoff:
                coreGeneName = '%s%s' % (clusterID, '_na_aln.fa')
                ## sequences might be discarded because of premature stops
                coreGeneName_path = alnFilePath + coreGeneName
                if os.path.exists(coreGeneName_path) and len(
                        read_fasta(coreGeneName_path)) >= strain_core_cutoff:
                    if core_gene_strain_fpath != '' and len(
                            core_strain_set -
                            set([i.split('|')[0] for i in vg[1]])) != 0:
                        continue
                    outfile.write(coreGeneName + '\n')
                    corelist.append(coreGeneName)
                else:
                    #print '%s%s%s'%('warning: ',coreGeneName_path,' is not a core gene')
                    pass

        write_pickle(output_path + 'core_geneList.cpk', corelist)

    refSeqList = load_pickle(path + 'strain_list.cpk')
    refSeqList.sort()

    snp_fre_lst = []
    snp_wh_matrix_flag = 0
    snp_pos_dt = defaultdict(list)
    snp_whole_matrix = np.array([])
    snps_by_gene = []
    for align_file in corelist:  ## core genes
        nuc_array = np.array([])  # array to store nucleotides for each gene
        gene_seq_dt = read_fasta(alnFilePath + align_file)
        if core_cutoff != 1.0:
            # set sequences for missing gene (space*gene_length)
            missing_gene_seq = ' ' * len(gene_seq_dt.values()[0])
            totalStrain_sorted_lst = sorted(strain_list)
        # build strain_seq_dt from gene_seq_dt
        strain_seq_dt = defaultdict()
        for gene, seq in gene_seq_dt.iteritems():
            strain_seq_dt[gene.split('-')[0]] = seq  # strain-locus_tag-...
        strain_seq_sorted_lst = sorted(strain_seq_dt.items(),
                                       key=lambda x: x[0])

        start_flag = 0
        if core_cutoff == 1.0:
            for ka, va in strain_seq_sorted_lst:
                if start_flag == 0:
                    nuc_array = np.array(np.fromstring(va, dtype='S1'))
                    start_flag = 1
                else:
                    nuc_array = np.vstack(
                        (nuc_array, np.fromstring(va, dtype='S1')))
            ## find SNP positions
            position_polymorphic = np.any(nuc_array != nuc_array[0, :], axis=0)
            position_has_gap = np.any(nuc_array == '-', axis=0)
            position_SNP = position_polymorphic & (~position_has_gap)
            snp_columns = nuc_array[:, position_SNP]
            snp_pos_dt[align_file] = np.where(position_SNP)[0]
        else:
            ## add '-' for missing genes when dealing with soft core genes
            core_gene_strain = [gene for gene in strain_seq_dt.keys()]
            for strain in totalStrain_sorted_lst:
                if start_flag == 0:
                    if strain in core_gene_strain:
                        nuc_array = np.array(
                            np.fromstring(strain_seq_dt[strain], dtype='S1'))
                    else:
                        print 'Soft core gene: gene absent in strain %s on cluster %s' % (
                            strain, align_file)
                        nuc_array = np.array(
                            np.fromstring(missing_gene_seq, dtype='S1'))
                    start_flag = 1
                else:
                    if strain in core_gene_strain:
                        nuc_array = np.vstack(
                            (nuc_array,
                             np.fromstring(strain_seq_dt[strain], dtype='S1')))
                    else:
                        print 'Soft core gene: gene absent in strain %s on cluster %s' % (
                            strain, align_file)
                        nuc_array = np.vstack((nuc_array,
                                               np.fromstring(missing_gene_seq,
                                                             dtype='S1')))
            ## find SNP positions
            ## mask missing genes -- determine rows that have ' ' in every column
            is_missing = np.all(nuc_array == ' ', axis=1)
            masked_non_missing_array = np.ma.masked_array(
                nuc_array, nuc_array == ' ')
            position_polymorphic = np.any(
                masked_non_missing_array != masked_non_missing_array[0, :],
                axis=0)
            position_has_gap = np.any(masked_non_missing_array == '-', axis=0)
            position_SNP = position_polymorphic & (~position_has_gap)
            ## fill rows of entirely-missing genes with '-' so that absent
            ## strains appear as gaps in the SNP matrix (SNP positions were
            ## already determined on the masked array above)
            if is_missing.sum() > 0:  # with missing genes
                nuc_array[is_missing] = '-'
            snp_columns = nuc_array[:, position_SNP]
            snp_pos_dt[align_file] = np.where(position_SNP)[0]
            #print snp_columns

        if snp_wh_matrix_flag == 0:
            snp_whole_matrix = snp_columns
            snp_wh_matrix_flag = 1
        else:
            snp_whole_matrix = np.hstack((snp_whole_matrix, snp_columns))
    write_pickle(output_path + 'snp_pos.cpk', snp_pos_dt)

    with open(output_path + 'SNP_whole_matrix.aln', 'wb') as outfile:
        for ind, isw in enumerate(snp_whole_matrix):
            write_in_fa(outfile, refSeqList[ind], isw.tostring())
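## Self-contained toy of the SNP-position test above: a column counts as a SNP
## if it is polymorphic across strains and contains no alignment gap:
import numpy as np

toy_aln = np.array([list('ATG-A'),
                    list('ATC-A'),
                    list('ATC-T')])
toy_polymorphic = np.any(toy_aln != toy_aln[0, :], axis=0)
toy_has_gap = np.any(toy_aln == '-', axis=0)
print np.where(toy_polymorphic & (~toy_has_gap))[0]  # [2 4]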
def clustering_protein(path, folders_dict, threads,
    blast_fpath, roary_fpath, orthofinder_fpath, other_tool_fpath,
    diamond_evalue, diamond_max_target_seqs, diamond_identity,
    diamond_query_cover, diamond_subject_cover, diamond_path, mcl_inflation):
    '''
    Procedure: all-against-all protein comparison + hits filtering + mcl clustering
    By default: DIAMOND -> BS -> MCL
    Alternatives:
    1. Blastp output (user-provided) -> BS -> MCL
    2. Roary
    3. OrthoFinder
    4. Other tools.
    params:
        path:                    path to directory including data and output
        threads:                 number of parallel threads used to run diamond
        blast_fpath:             gene clusters from a user-provided all-vs-all
                                 blastp comparison
        roary_fpath:             gene clusters computed by roary
        diamond_max_target_seqs: diamond setting: the maximum number of target
                                 sequences per query for which alignments are
                                 kept. Default: 600
                                 (e.g. #strains * #max_duplications = 50*10 = 500)
    '''
    threads=str(threads)
    protein_path= folders_dict['protein_path']
    clustering_path= folders_dict['clustering_path']
    cluster_fpath= '%s%s'%(clustering_path,'allclusters.tsv')
    cluster_dt_cpk_fpath='%s%s'%(clustering_path,'allclusters.cpk')
    if any( i!='none' for i in [blast_fpath, roary_fpath, orthofinder_fpath, other_tool_fpath]):
        geneID_to_geneSeqID_dict= load_pickle('%sgeneID_to_geneSeqID.cpk'%path)
        locus_tag_to_geneID_dict= defaultdict(list)
        for geneID in geneID_to_geneSeqID_dict.keys():
            locus_tag=geneID.split('|')[1]
            locus_tag_to_geneID_dict[locus_tag]=geneID
    ## using standard pipeline (roary_fpath=='none')
    if all( i=='none' for i in [blast_fpath, roary_fpath, orthofinder_fpath, other_tool_fpath]):
        dmd_ref_file='reference.faa'
        ## prepare dmd_query_file (dmd_query_file is dmd_ref_file)
        os.system(''.join(['cat ',protein_path,'*faa > ',clustering_path,dmd_ref_file]))
        ## run diamond
        diamond_run(clustering_path, dmd_ref_file, threads, diamond_evalue,
            diamond_max_target_seqs, diamond_identity, diamond_query_cover, diamond_subject_cover, diamond_path )
        ## filtering hits via BS score
        filter_hits_single(clustering_path, threads)
        ## running mcl
        mcl_run(clustering_path, threads, mcl_inflation)
        ## clean up diamond_query_file
        os.system(''.join(['rm ',clustering_path,'*faa']))
    elif blast_fpath!='none': ## using user-given cluster file based on blast
        os.system(''.join(['cp ',blast_fpath,' ',clustering_path,'blastp.m8']))
        ## filtering hits via BS score
        filter_hits_single(clustering_path, threads, input_prefix='blastp')
        ## running mcl
        mcl_run(clustering_path, threads, mcl_inflation, input_prefix='blastp')
    elif roary_fpath!='none': ## using cluster files from roary
        roary_cluster_process(locus_tag_to_geneID_dict, roary_fpath, cluster_fpath)
        # with open(roary_fpath, 'rb') as cluster_external_file:
        #     with open(cluster_fpath, 'wb') as cluster_final_file:
        #         for cluster_line in cluster_external_file:
        #              cluster_final_file.write( '%s\n'%'\t'.join([ gene_tag.replace('_','|') if '|' not in gene_tag else gene_tag for gene_tag in cluster_line.rstrip().split(': ')[1].split('\t')]) )
    elif orthofinder_fpath!='none':
        process_orthofinder(orthofinder_fpath,cluster_fpath)
    elif other_tool_fpath!='none':
        os.system('cp %s %s'%(other_tool_fpath,cluster_fpath))
    cleanup_clustering(clustering_path)
    return parse_geneCluster(cluster_fpath, cluster_dt_cpk_fpath)
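## parse_geneCluster is assumed to turn allclusters.tsv -- one cluster per
## line, tab-separated 'strain|locusTag' gene IDs -- into the dict shape used
## throughout this module: {clusterID: [count_strains, [members], count_genes]}.
## A minimal sketch under those assumptions (the clusterID naming scheme here
## is invented for illustration):
def parse_geneCluster_sketch(cluster_fpath):
    geneCluster_dt = {}
    with open(cluster_fpath) as cluster_file:
        for index, iline in enumerate(cluster_file):
            members = iline.rstrip().split('\t')
            strains = set(memb.split('|')[0] for memb in members)
            geneCluster_dt['GC_%08d' % index] = [len(strains), members, len(members)]
    return geneCluster_dt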
def estimate_core_gene_diversity(path, folders_dict, strain_list, parallel,
                                 core_cutoff, factor_core_diversity, species):
    """
    estimate core gene diversity before gene cluster alignment
    and cluster post-processing
    """
    totalStrain = len(strain_list)

    ## load clusters
    clustering_path = folders_dict['clustering_path']
    geneCluster_dt = load_pickle(clustering_path + 'allclusters.cpk')
    protein_path = folders_dict['protein_path']
    nucleotide_path = folders_dict['nucleotide_path']
    protein_dict_path = '%s%s' % (protein_path, 'all_protein_seq.cpk')
    nucleotide_dict_path = '%s%s' % (nucleotide_path, 'all_nucleotide_seq.cpk')
    tmp_core_seq_path = '%s%s' % (clustering_path, 'tmp_core/')
    ## load geneID_to_geneSeqID geneSeqID cpk file
    geneID_to_geneSeqID_dict = load_pickle(path + 'geneID_to_geneSeqID.cpk')

    ## create core gene list
    core_geneCluster_dt = defaultdict()
    # geneCluster_dt: {clusterID: [count_strains, [memb1,...], count_genes]}
    for clusterID, cluster_stats in geneCluster_dt.iteritems():
        if core_cutoff == 1.0:
            strain_core_cutoff = totalStrain
        else:
            strain_core_cutoff = int(totalStrain * core_cutoff)
        ## check whether #genes == #strains and it's a core/soft-core gene
        if cluster_stats[0] == cluster_stats[
                2] and cluster_stats[0] >= strain_core_cutoff:
            core_geneCluster_dt[clusterID] = cluster_stats
    if os.path.exists(tmp_core_seq_path):
        os.system(''.join(['rm -rf ', tmp_core_seq_path]))
    os.system('mkdir %s' % tmp_core_seq_path)

    ## create dict storing all genes' translation
    if 0:  ## disabled: the dicts below are precomputed and loaded from cpk files
        gene_aa_dict = defaultdict(dict)
        for accession_id in strain_list:
            gene_aa_dict[accession_id] = read_fasta(''.join(
                [protein_path, accession_id, '.faa']))
        write_pickle(protein_dict_path, gene_aa_dict)

        ## create dict for all gene's nucleotide sequence
        gene_na_dict = defaultdict(dict)
        for accession_id in strain_list:
            gene_na_dict[accession_id] = read_fasta(''.join(
                [nucleotide_path, accession_id, '.fna']))
        write_pickle(nucleotide_dict_path, gene_na_dict)

    gene_aa_dict = load_pickle(protein_dict_path)
    gene_na_dict = load_pickle(nucleotide_dict_path)

    ## write nucleotide and amino-acid sequences for each gene cluster
    export_cluster_seq_tmp(tmp_core_seq_path, core_geneCluster_dt,
                           geneID_to_geneSeqID_dict, gene_na_dict,
                           gene_aa_dict)

    tmp_fa_files = glob.glob(tmp_core_seq_path + "*.fna")
    multips(calculate_diversity, parallel, tmp_fa_files, tmp_core_seq_path,
            species)

    calculated_core_diversity = tmp_average_core_diversity(tmp_core_seq_path)
    refined_core_diversity = round(
        (0.1 + factor_core_diversity * calculated_core_diversity) /
        (1 + factor_core_diversity * calculated_core_diversity), 4)
    print('factor used: ' + str(factor_core_diversity))
    print('average core genome diversity: ' + str(calculated_core_diversity))
    print(
        'defined core genome diversity cutoff for splitting long branches: ' +
        str(refined_core_diversity))

    ## move folder tmp_core to the central data folder
    new_clustering_path = '%stmp_core' % path
    if os.path.exists(new_clustering_path):
        os.system(''.join(['rm -r ', new_clustering_path]))
    os.system('mv %s %s' % (tmp_core_seq_path, path))
    return calculated_core_diversity, refined_core_diversity
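## Worked example of the core-diversity refinement above,
## refined = round((0.1 + f*d) / (1 + f*d), 4):
factor_core_diversity = 2.0       # f
calculated_core_diversity = 0.05  # d
print round((0.1 + factor_core_diversity * calculated_core_diversity) /
            (1 + factor_core_diversity * calculated_core_diversity), 4)  # 0.1818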
def postprocess_split_long_branch(parallel,
                                  path,
                                  simple_tree,
                                  cut_branch_threshold=0.3):
    """
    Split trees by breaking up long branches.
    Remote homology leads to over-clustering, which yields trees with long branches.
    """

    file_path = ''.join([path, 'geneCluster/'])
    new_split_folder = ''.join([file_path, 'update_long_branch_splits/'])
    if os.path.exists(new_split_folder):
        ## remove the folder from previous run
        os.system(''.join(['rm -r ', new_split_folder]))
    os.system(''.join(['mkdir ', new_split_folder]))
    deleted_clusters_folder = ''.join(
        [file_path, 'deleted_clusters_longSplit/'])
    if os.path.exists(deleted_clusters_folder):
        os.system(''.join(['rm -r ', deleted_clusters_folder]))
    os.system(''.join(['mkdir ', deleted_clusters_folder]))

    ## load clusters
    cluster_path = '%s%s' % (path, 'protein_faa/diamond_matches/')
    geneCluster_dt = load_pickle(cluster_path + 'allclusters.cpk')

    ## gather all trees generated before postprocessing
    tree_path = file_path
    tree_fname_list = glob.glob(tree_path + '*nwk')

    ## ensure that writing to new_clusters_longSplit starts at the beginning (for re-running)
    if os.path.exists(''.join([file_path, 'new_clusters_longSplit.txt'])):
        os.system(''.join(['rm ', file_path, 'new_clusters_longSplit.txt']))
    if os.path.exists(''.join([file_path, 'old_clusters_longSplit.txt'])):
        os.system(''.join(['rm ', file_path, 'old_clusters_longSplit.txt']))

    # =============================================
    # parallelization:
    # "post-clustering workflow for splitting trees on over-clustered records"
    treefile_used = True
    multips(cutTree_outputCluster, parallel, tree_fname_list, file_path,
            cut_branch_threshold, treefile_used)

    ## If new_clusters_longSplit.txt (over_split records) exists,
    ## then gather new clusters from new_clusters_longSplit.txt
    if os.path.exists(''.join([file_path, 'new_clusters_longSplit.txt'])):
        with open(file_path + 'new_clusters_longSplit.txt',
                  'rb') as new_clusters_longSplit:
            new_fa_files_list = [
                clus.rstrip() for clus in new_clusters_longSplit
            ]
            print '#times of splitting long branches:', len(
                new_fa_files_list) - 1
        with open(file_path + 'old_clusters_longSplit.txt',
                  'rb') as delete_cluster_file:
            deleted_file_count = len([clus for clus in delete_cluster_file])
            print '#clusters split during the checking of long branches:', deleted_file_count

        ## parallelization of "align and make tree on new cluster"
        multips(align_and_makeTree, parallel, new_fa_files_list, file_path,
                simple_tree)
        # =============================================

        ## delete original clusters which are split
        delete_original_clusters(file_path, geneCluster_dt)
        ## add newly split clusters
        update_geneCluster_dt(path, geneCluster_dt)
        ## write updated gene clusters in cpk file
        update_geneCluster_cpk(path, geneCluster_dt)
        ## write gene_diversity_Dt cpk file
        update_diversity_cpk(path)

        os.system(' '.join([
            'mv ', file_path + 'new_clusters_longSplit.txt',
            file_path + 'added_clusters_split_long.txt'
        ]))
        os.system(' '.join([
            'mv ', file_path + 'old_clusters_longSplit.txt',
            file_path + 'deleted_clusters_split_long.txt'
        ]))
    else:  # no clusters postprocessed
        os.system(' '.join([
            'cp', cluster_path + 'allclusters.cpk',
            cluster_path + 'allclusters_postprocessed.cpk'
        ]))
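## Toy illustration of the long-branch criterion behind the split above:
## branches longer than cut_branch_threshold flag a cluster for splitting
## (cutTree_outputCluster itself is a panX helper; this only shows the test):
from cStringIO import StringIO
from Bio import Phylo

toy_tree = Phylo.read(StringIO('((A:0.02,B:0.01):0.01,(C:0.45,D:0.03):0.02);'), 'newick')
long_branches = [clade for clade in toy_tree.find_clades()
                 if clade.branch_length and clade.branch_length > 0.3]
print len(long_branches)  # 1 -- the branch leading to C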
def geneCluster_to_json(path, enable_RNA_clustering, store_locus_tag,
                        raw_locus_tag, optional_table_column):
    """
    create json file for gene cluster table visualization
    input:  path to genecluster output
    output: geneCluster.json
    """
    # define path and make output directory
    geneCluster_path = '%s%s' % (path, 'geneCluster/')
    output_path = '%s%s' % (path, 'vis/')

    # open files
    geneClusterJSON_outfile = open(output_path + 'geneCluster.json', 'wb')
    ##store locus_tags in a separate file for large dataset
    if store_locus_tag:
        locus_tag_outfile = open(path + 'search_locus_tag.tsv', 'wb')

    ### load precomputed annotations, diversity, associations etc
    # load geneID_to_descriptions
    geneID_to_descriptions = load_pickle(path + 'geneID_to_description.cpk')

    if enable_RNA_clustering:
        # load RNAID_to_description_file
        geneID_to_descriptions.update(
            load_pickle(path + 'RNAID_to_description.cpk'))

    gene_diversity_Dt = load_pickle(geneCluster_path + 'gene_diversity.cpk')
    ## load gain/loss event count dictionary
    dt_geneEvents = load_pickle(geneCluster_path + 'dt_geneEvents.cpk')
    ## load association
    branch_associations_path = path + 'branch_association.cpk'
    if os.path.isfile(branch_associations_path):
        branch_associations = load_pickle(branch_associations_path)
    else:
        branch_associations = {}
    presence_absence_associations_path = path + 'presence_absence_association.cpk'
    if os.path.isfile(presence_absence_associations_path):
        presence_absence_associations = load_pickle(
            presence_absence_associations_path)
    else:
        presence_absence_associations = {}

    ## load list of clustered sorted by strain count
    sorted_genelist = load_sorted_clusters(path)

    geneClusterJSON_outfile.write('[')
    ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    for gid, (clusterID, gene) in enumerate(sorted_genelist):
        strain_count, gene_list, gene_count = gene
        #print strain_count, gene_count
        if gid != 0:  ## write separator between json records
            geneClusterJSON_outfile.write(',\n')

        ## annotation majority
        allAnn, majority_annotation = consolidate_annotation(
            path, gene_list, geneID_to_descriptions)

        ## geneName majority
        all_geneName, majority_geneName = consolidate_geneName(
            path, gene_list, geneID_to_descriptions)

        ## extract gain/loss event count
        gene_event = dt_geneEvents[gid]

        ## average length
        seqs = read_fasta(geneCluster_path + '%s%s' %
                          (clusterID, '.fna')).values()
        geneClusterLength = int(np.mean([len(igene) for igene in seqs]))

        ## msa
        #geneCluster_aln='%s%s'%(clusterID,'_aa.aln')
        geneCluster_aln = clusterID

        ## check for duplicates
        if gene_count > strain_count:
            duplicated_state = 'yes'
            dup_list = [ig.split('|')[0] for ig in gene_list]
            # "#" to delimit (gene/gene_count)key/value ; "@" to seperate genes
            # Counter({'g1': 2, 'g2': 1})
            dup_detail = ''.join([
                '%s#%s@' % (kd, vd)
                for kd, vd in Counter(dup_list).iteritems() if vd > 1
            ])[:-1]
        else:
            duplicated_state = 'no'
            dup_detail = ''

        ## locus_tag
        if raw_locus_tag:  # make a string of all locus tags [1] in igl.split('|')
            all_locus_tags = ' '.join([igl.split('|')[1] for igl in gene_list])
        else:  # in addition to locus tag, keep strain name (but replace '|')
            all_locus_tags = ' '.join(
                [igl.replace('|', '_') for igl in gene_list])

        ## optionally store locus tags to file, remove from geneClusterJSON
        if store_locus_tag:
            locus_tag_outfile.write('%s\t%s\n' % (clusterID, all_locus_tags))
            all_locus_tags = ''

        ## default cluster json fields
        cluster_json_line = [
            '"geneId":' + str(gid + 1), '"geneLen":' + str(geneClusterLength),
            '"count":' + str(strain_count), '"dupli":"' + duplicated_state +
            '"', '"dup_detail":"' + dup_detail + '"',
            '"ann":"' + majority_annotation + '"',
            '"msa":"' + geneCluster_aln + '"',
            '"divers":"' + gene_diversity_Dt[clusterID] + '"',
            '"event":"' + str(gene_event) + '"', '"allAnn":"' + allAnn + '"',
            '"GName":"' + majority_geneName + '"',
            '"allGName":"' + all_geneName + '"',
            '"locus":"' + all_locus_tags + '"'
        ]

        if optional_table_column:
            cluster_json_line.extend(
                optional_geneCluster_properties(gene_list,
                                                optional_table_column))
        if clusterID in branch_associations:
            cluster_json_line.extend(
                geneCluster_associations(branch_associations[clusterID],
                                         suffix='BA'))
        if clusterID in presence_absence_associations:
            cluster_json_line.extend(
                geneCluster_associations(
                    presence_absence_associations[clusterID], suffix='PA'))

        #write file
        cluster_json_line = ','.join(cluster_json_line)
        geneClusterJSON_outfile.write('{' + cluster_json_line + '}')

    # close files
    geneClusterJSON_outfile.write(']')
    geneClusterJSON_outfile.close()
    if store_locus_tag: locus_tag_outfile.close()
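## Toy run of the dup_detail encoding above: '#' joins a strain to its gene
## count, '@' separates strains, and only true duplicates (count > 1) are kept:
from collections import Counter

toy_dup_list = ['strain_1', 'strain_1', 'strain_2']
toy_dup_detail = ''.join(['%s#%s@' % (kd, vd)
                          for kd, vd in Counter(toy_dup_list).iteritems()
                          if vd > 1])[:-1]
print toy_dup_detail  # strain_1#2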
def clustering_protein(path, folders_dict, threads, blast_fpath, roary_fpath,
                       orthofinder_fpath, other_tool_fpath, diamond_evalue,
                       diamond_max_target_seqs, diamond_identity,
                       diamond_query_cover, diamond_subject_cover,
                       diamond_path, mcl_inflation):
    '''
    Procedure: all-against-all protein comparison + hits filtering + mcl clustering
    By default: DIAMOND -> BS -> MCL
    Alternatives:
    1. Blastp output (user-provided) -> BS -> MCL
    2. Roary
    3. OrthoFinder
    4. Other tools.
    params:
        path:                    path to directory including data and output
        threads:                 number of parallel threads used to run diamond
        blast_fpath: gene clusters by all-vs-all blast
                                 comparison and other clusterings methods
        roary_fpath: gene clusters by roary
        diamond_max_target_seqs: Diamond setting: the maximum number of target sequences
                                  per query to keep alignments for. Defalut:600
                                  #strain * #max_duplication= 50*10= 500
    '''
    threads = str(threads)
    protein_path = folders_dict['protein_path']
    clustering_path = folders_dict['clustering_path']
    cluster_fpath = '%s%s' % (clustering_path, 'allclusters.tsv')
    cluster_dt_cpk_fpath = '%s%s' % (clustering_path, 'allclusters.cpk')
    if any(i != 'none' for i in
           [blast_fpath, roary_fpath, orthofinder_fpath, other_tool_fpath]):
        geneID_to_geneSeqID_dict = load_pickle('%sgeneID_to_geneSeqID.cpk' %
                                               path)
        locus_tag_to_geneID_dict = defaultdict(list)
        for geneID in geneID_to_geneSeqID_dict.keys():
            locus_tag = geneID.split('|')[1]
            locus_tag_to_geneID_dict[locus_tag] = geneID
    ## using standard pipeline (roary_fpath=='none')
    if all(i == 'none' for i in
           [blast_fpath, roary_fpath, orthofinder_fpath, other_tool_fpath]):
        dmd_ref_file = 'reference.faa'
        ## prepare dmd_query_file (dmd_query_file is dmd_ref_file)
        os.system(''.join(
            ['cat ', protein_path, '*faa > ', clustering_path, dmd_ref_file]))
        ## run diamond
        diamond_run(clustering_path, dmd_ref_file, threads, diamond_evalue,
                    diamond_max_target_seqs, diamond_identity,
                    diamond_query_cover, diamond_subject_cover, diamond_path)
        ## filtering hits via BS score
        filter_hits_single(clustering_path, threads)
        ## running mcl
        mcl_run(clustering_path, threads, mcl_inflation)
        ## clean up diamond_query_file
        os.system(''.join(['rm ', clustering_path, '*faa']))
    elif blast_fpath != 'none':  ## using user-given cluster file based on blast
        os.system(''.join(
            ['cp ', blast_fpath, ' ', clustering_path, 'blastp.m8']))
        ## filtering hits via BS score
        filter_hits_single(clustering_path, threads, input_prefix='blastp')
        ## running mcl
        mcl_run(clustering_path, threads, mcl_inflation, input_prefix='blastp')
    elif roary_fpath != 'none':  ## using cluster files from roary
        roary_cluster_process(locus_tag_to_geneID_dict, roary_fpath,
                              cluster_fpath)
        ## legacy inline implementation, superseded by roary_cluster_process:
        # with open(roary_fpath, 'rb') as cluster_external_file:
        #     with open(cluster_fpath, 'wb') as cluster_final_file:
        #         for cluster_line in cluster_external_file:
        #             cluster_final_file.write('%s\n' % '\t'.join(
        #                 [gene_tag.replace('_', '|') if '|' not in gene_tag else gene_tag
        #                  for gene_tag in cluster_line.rstrip().split(': ')[1].split('\t')]))
    elif orthofinder_fpath != 'none':
        process_orthofinder(orthofinder_fpath, cluster_fpath)
    elif other_tool_fpath != 'none':
        os.system('cp %s %s' % (other_tool_fpath, cluster_fpath))
    cleanup_clustering(clustering_path)
    return parse_geneCluster(cluster_fpath, cluster_dt_cpk_fpath)
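## A rough sketch of the shell commands the standard branch above wraps via
## diamond_run/filter_hits_single/mcl_run. The diamond and mcl flags shown are
## standard options of those tools; the intermediate filenames
## ('query_matches.m8', 'filtered_hits.abc') are placeholders here, and the
## pipeline's actual option set may differ:
import os

def sketch_diamond_mcl(clustering_path, threads, evalue, max_target_seqs,
                       inflation):
    ## all-against-all protein comparison
    os.system('diamond makedb --in %sreference.faa -d %sreference'
              % (clustering_path, clustering_path))
    os.system(('diamond blastp -q %sreference.faa -d %sreference '
               '-o %squery_matches.m8 -e %s -k %s -p %s')
              % ((clustering_path,)*3 + (evalue, max_target_seqs, threads)))
    ## hits are then filtered by bitscore into an abc-format graph, which MCL
    ## clusters; the inflation parameter controls cluster granularity
    os.system('mcl %sfiltered_hits.abc --abc -te %s -I %s -o %sallclusters.tsv'
              % (clustering_path, threads, inflation, clustering_path))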
def postprocess_unclustered_genes(parallel,
                                  path,
                                  nstrains,
                                  simple_tree,
                                  split_long_branch_cutoff,
                                  window_size_smoothed=5,
                                  strain_proportion=0.3,
                                  sigma_scale=3):
    """
        1) detect suspicious peaks in the distribution of average length of genes in gene cluster (aa count)
           np.bincount([1,2,3,34,3]) -> how often each entry is found
           np.convolve([1,1,1,1,1], gene_length_count)
          ->  unclustered genes will contribute many small clusters (size 1)
              that result in peaks in the distribution
        2) for each peak detected, align the sequences of all genes in clusters in peak
        3) to cluster the aligned genes, build a tree. To ensure long branches
          ->  between unalignable sub-alignments, gaps could be filled with random
              sequence (skipped, not tested); importantly, the same random fill
              must be applied to every sequence (row) of the alignment.
                  - random sequence a la rseq = [alpha[ii] for ii in np.random.randint(len(alpha), size=aln.get_alignment_length())]
                  - for seq in aln: seq[seq=='-'] = rseq[seq=='-']
        4) make the tree and split it at branches longer than split_long_branch_cutoff
        5) for each subtree (ideally only one big tree), define new gene cluster and run
           maketree_align from standard step 6
    """

    geneCluster_fasta_path = ''.join([path, 'geneCluster/'])
    new_split_folder = ''.join(
        [geneCluster_fasta_path, 'update_long_branch_splits/'])
    if os.path.exists(new_split_folder):
        ## remove the folder from previous run
        os.system(''.join(['rm -r ', new_split_folder]))
    os.system(''.join(['mkdir ', new_split_folder]))

    deleted_clusters_folder = ''.join(
        [geneCluster_fasta_path, 'deleted_clusters_peaks_splits/'])
    if os.path.exists(deleted_clusters_folder):
        os.system(''.join(['rm -r ', deleted_clusters_folder]))
    os.system(''.join(['mkdir ', deleted_clusters_folder]))

    ## load clusters
    ClusterPath = '%s%s' % (path, 'protein_faa/diamond_matches/')
    geneCluster_dt = load_pickle(ClusterPath + 'allclusters_postprocessed.cpk')

    ## merge unclustered genes belonging to the detected peaks
    merged_clusters_dict = find_and_merge_unclustered_genes(
        path, nstrains, window_size_smoothed, strain_proportion, sigma_scale)

    if len(merged_clusters_dict) != 0:
        ## there are merged clusters corresponding to the cluster peaks

        ## ensure that writing to new_clusters_longSplit starts at the beginning (for re-running)
        if os.path.exists(''.join(
            [geneCluster_fasta_path, 'new_clusters_longSplit.txt'])):
            os.system(''.join(
                ['rm ', geneCluster_fasta_path, 'new_clusters_longSplit.txt']))
        if os.path.exists(''.join(
            [geneCluster_fasta_path, 'old_clusters_longSplit.txt'])):
            os.system(''.join(
                ['rm ', geneCluster_fasta_path, 'old_clusters_longSplit.txt']))

        cut_branch_threshold = split_long_branch_cutoff  ## default: 0.3
        ## cut tree and make new clusters
        cut_all_trees_from_merged_clusters(parallel, path,
                                           cut_branch_threshold, simple_tree)

        ## update clusters in allclusters_final.cpk
        #os.system('cp %sallclusters_final.cpk %s/allclusters_final.cpk.bk '%(ClusterPath,ClusterPath))

        ## delete old clusters
        delete_old_merged_clusters(geneCluster_fasta_path, geneCluster_dt,
                                   merged_clusters_dict)
        ## add newly split clusters
        update_geneCluster_dt(path, geneCluster_dt)
        ## write updated gene clusters in cpk file
        update_geneCluster_cpk(path, geneCluster_dt)
        ## write gene_diversity_Dt cpk file
        update_diversity_cpk(path)
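## A minimal sketch of the peak detection outlined in step 1 of the docstring
## above (find_and_merge_unclustered_genes itself does more bookkeeping; the
## thresholding rule here is an assumption): histogram the per-cluster average
## gene lengths with np.bincount, smooth with a moving-average kernel via
## np.convolve, and flag bins that stick out of the smoothed background.
import numpy as np

def detect_length_peaks(cluster_lengths, window_size_smoothed=5, sigma_scale=3):
    """cluster_lengths: average gene length (aa count) of each cluster, as ints"""
    length_count = np.bincount(cluster_lengths)  # how often each length occurs
    kernel = np.ones(window_size_smoothed) / float(window_size_smoothed)
    smoothed = np.convolve(length_count, kernel, mode='same')
    residual = length_count - smoothed
    ## many unclustered singleton clusters of similar length give a sharp spike
    return np.where(residual > sigma_scale * residual.std())[0]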
def postprocess_split_long_branch(parallel, path, simple_tree, cut_branch_threshold=0.3):
    """
    Split trees by breaking up long branches.
    Remote homology leads to over-clustering, which yields trees with long branches.
    """

    file_path = ''.join([path,'geneCluster/'])
    new_split_folder= ''.join([file_path,'update_long_branch_splits/'])
    if os.path.exists(new_split_folder):
        ## remove the folder from previous run
        os.system(''.join(['rm -r ',new_split_folder]))
    os.system(''.join(['mkdir ',new_split_folder]))
    deleted_clusters_folder=''.join([file_path,'deleted_clusters_longSplit/'])
    if os.path.exists(deleted_clusters_folder):
        os.system(''.join(['rm -r ',deleted_clusters_folder]))
    os.system(''.join(['mkdir ',deleted_clusters_folder]))

    ## load clusters
    cluster_path='%s%s'%(path,'protein_faa/diamond_matches/')
    geneCluster_dt=load_pickle(cluster_path+'allclusters.cpk')

    ## gather all trees generated before postprocessing
    tree_fname_list = glob.glob(file_path+'*nwk')

    ## ensure that writing to new_clusters_longSplit starts at the beginning (for re-running)
    if os.path.exists(''.join([file_path,'new_clusters_longSplit.txt'])):
        os.system(''.join(['rm ',file_path,'new_clusters_longSplit.txt']))
    if os.path.exists(''.join([file_path,'old_clusters_longSplit.txt'])):
        os.system(''.join(['rm ',file_path,'old_clusters_longSplit.txt']))

    # =============================================
    # parallelization:
    # "post-clustering workflow for splitting trees on over-clustered records"
    treefile_used=True
    multips(cutTree_outputCluster, parallel, tree_fname_list, file_path, cut_branch_threshold, treefile_used)

    ## If new_clusters_longSplit.txt (over_split records) exists,
    ## then gather new clusters from new_clusters_longSplit.txt
    if os.path.exists(''.join([file_path,'new_clusters_longSplit.txt'])):
        with open(file_path+'new_clusters_longSplit.txt', 'rb') as new_clusters_longSplit:
            new_fa_files_list = [clus.rstrip() for clus in new_clusters_longSplit]
            print 'number of times long branches were split:', len(new_fa_files_list)-1
        with open(file_path+'old_clusters_longSplit.txt', 'rb') as delete_cluster_file:
            deleted_file_count = sum(1 for clus in delete_cluster_file)
            print 'number of clusters split while checking long branches:', deleted_file_count

        ## parallelization of "align and make tree on new cluster"
        multips(align_and_makeTree, parallel, new_fa_files_list, file_path, simple_tree)
        # =============================================

        ## delete original clusters which are split
        delete_original_clusters(file_path, geneCluster_dt)
        ## add newly split clusters
        update_geneCluster_dt(path,geneCluster_dt)
        ## write updated gene clusters in cpk file
        update_geneCluster_cpk(path, geneCluster_dt)
        ## write gene_diversity_Dt cpk file
        update_diversity_cpk(path)

        os.system(' '.join(['mv', file_path+'new_clusters_longSplit.txt', file_path+'added_clusters_split_long.txt']))
        os.system(' '.join(['mv', file_path+'old_clusters_longSplit.txt', file_path+'deleted_clusters_split_long.txt']))
    else: # no clusters postprocessed
        os.system(' '.join(['cp',cluster_path+'allclusters.cpk',cluster_path+'allclusters_postprocessed.cpk']))
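## A minimal sketch of the splitting criterion that cutTree_outputCluster
## applies to each tree (assumed behavior; the real function also writes the
## new_/old_clusters_longSplit.txt bookkeeping files): detach every clade whose
## branch exceeds the cutoff and report the leaf sets as candidate subclusters.
from Bio import Phylo

def split_tree_on_long_branches(tree_fname, cut_branch_threshold):
    tree = Phylo.read(tree_fname, 'newick')
    long_clades = [c for c in tree.find_clades()
                   if c.branch_length and c.branch_length > cut_branch_threshold]
    clusters, assigned = [], set()
    ## process deeper clades first so nested splits yield disjoint leaf sets
    for clade in sorted(long_clades, key=lambda c: -len(tree.get_path(c))):
        leaves = set(leaf.name for leaf in clade.get_terminals()) - assigned
        if leaves:
            clusters.append(sorted(leaves))
            assigned |= leaves
    remainder = [leaf.name for leaf in tree.get_terminals()
                 if leaf.name not in assigned]
    if remainder:
        clusters.append(remainder)
    return clusters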
def estimate_core_gene_diversity(path, folders_dict, strain_list, parallel, core_cutoff, factor_core_diversity, species):
    """
    estimate core gene diversity before gene cluster alignment
    and cluster post-processing
    """
    totalStrain= len(strain_list)

    ## load clusters
    clustering_path= folders_dict['clustering_path']
    geneCluster_dt= load_pickle(clustering_path+'allclusters.cpk')
    protein_path= folders_dict['protein_path']
    nucleotide_path= folders_dict['nucleotide_path']
    protein_dict_path= '%s%s'%(protein_path,'all_protein_seq.cpk')
    nucleotide_dict_path= '%s%s'%(nucleotide_path,'all_nucleotide_seq.cpk')
    tmp_core_seq_path= '%s%s'%(clustering_path,'tmp_core/')
    ## load geneID_to_geneSeqID geneSeqID cpk file
    geneID_to_geneSeqID_dict= load_pickle(path+'geneID_to_geneSeqID.cpk')

    ## create core gene list
    core_geneCluster_dt = {}
    ## geneCluster_dt: {clusterID: [count_strains, [memb1,...], count_genes]}
    for clusterID, cluster_stats in geneCluster_dt.iteritems():
        if core_cutoff==1.0:
            strain_core_cutoff=totalStrain
        else:
            strain_core_cutoff=int(totalStrain*core_cutoff)
        ## check whether #genes == #strains and it's a core/soft-core gene
        if cluster_stats[0]==cluster_stats[2] and cluster_stats[0]>=strain_core_cutoff:
            core_geneCluster_dt[clusterID]=cluster_stats
    if os.path.exists(tmp_core_seq_path):
        os.system(''.join(['rm -rf ',tmp_core_seq_path]))
    os.system('mkdir %s'%tmp_core_seq_path)

    ## disabled block: the sequence dictionaries below are precomputed
    ## elsewhere and loaded from their cpk files further down
    if 0:
        ## create dict storing all genes' translations
        gene_aa_dict= defaultdict(dict)
        for accession_id in strain_list:
            gene_aa_dict[accession_id]= read_fasta(''.join([protein_path,accession_id,'.faa']))
        write_pickle(protein_dict_path, gene_aa_dict)

        ## create dict for all gene's nucleotide sequence
        gene_na_dict= defaultdict(dict)
        for accession_id in strain_list:
            gene_na_dict[accession_id]=read_fasta(''.join([nucleotide_path,accession_id,'.fna']))
        write_pickle(nucleotide_dict_path, gene_na_dict)

    gene_aa_dict= load_pickle(protein_dict_path)
    gene_na_dict= load_pickle(nucleotide_dict_path)

    ## write nucleotide and amino-acid sequences for each gene cluster
    export_cluster_seq_tmp(tmp_core_seq_path, core_geneCluster_dt, geneID_to_geneSeqID_dict,
        gene_na_dict, gene_aa_dict)

    tmp_fa_files=glob.glob(tmp_core_seq_path+"*.fna")
    multips(calculate_diversity, parallel, tmp_fa_files, tmp_core_seq_path, species)

    calculated_core_diversity=tmp_average_core_diversity(tmp_core_seq_path)
    refined_core_diversity = round(
        (0.1+factor_core_diversity*calculated_core_diversity)/
        (1+factor_core_diversity*calculated_core_diversity), 4)
    print('factor used: '+str(factor_core_diversity))
    print('average core genome diversity: '+str(calculated_core_diversity))
    print('defined core genome diversity cutoff for splitting long branches: '+str(refined_core_diversity))

    ## move folder tmp_core to the central data folder
    new_clustering_path= '%stmp_core'%path
    if os.path.exists(new_clustering_path):
        os.system(''.join(['rm -r ',new_clustering_path]))
    os.system('mv %s %s'%(tmp_core_seq_path, path))
    return calculated_core_diversity, refined_core_diversity
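## The refinement above maps the measured diversity d through
## (0.1 + f*d)/(1 + f*d): it returns 0.1 for d = 0 and saturates towards 1 as
## f*d grows, keeping the long-branch cutoff in a usable range. A quick check
## (standalone, not part of the pipeline):
def refine_core_diversity(d, f):
    return round((0.1 + f*d)/(1 + f*d), 4)

## e.g. refine_core_diversity(0.02, 2.0) -> 0.1346
##      refine_core_diversity(0.2,  2.0) -> 0.3571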