Exemplos de load_pickle em Python, exemplos de SF00_miscellaneous.load_pickle em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: SF06_3_clusterRNA.py Projeto: thackl/pan-genome-analysis

def RNAclusters_align_makeTree(path, parallel):
    """
    Write the RNA cluster fasta files, align them / build their trees,
    and merge the RNA clusters into the gene cluster bookkeeping.

    path:     base folder of the run; sub-folders are appended by plain
              string concatenation (presumably it ends with '/' -- confirm)
    parallel: passed through unchanged to multips
    """
    rna_cluster_dt = create_RNACluster_fa(path)

    ## align, build_tree, make_RNATree_json for every *RNA*.fna file
    cluster_dir = path + 'geneCluster/'
    rna_fna_files = glob.glob(cluster_dir + "*RNA*.fna")
    multips(single_RNACluster_align_and_makeTree, cluster_dir, parallel,
            rna_fna_files)

    ## keep a '.bk' copy of the gene cluster pickle before it is updated
    geneClusterPath = path + 'protein_faa/diamond_matches/'
    backup_command = (
        'cp %sorthamcl-allclusters_final.cpk %s/orthamcl-allclusters_final.cpk.bk '
        % (geneClusterPath, geneClusterPath))
    os.system(backup_command)

    ## load the gene clusters and add the RNA clusters to them
    gene_cluster_dt = load_pickle(geneClusterPath +
                                  'orthamcl-allclusters_final.cpk')
    update_gene_cluster_with_RNA(path, rna_cluster_dt, gene_cluster_dt)

    ## refresh the diversity cpk file
    update_diversity_cpk_file(path)

Exemplo n.º 2

0

Exibir arquivo

Arquivo: SF06_geneCluster_align_makeTree.py Projeto: thackl/pan-genome-analysis

def postprocess_paralogs_iterative(parallel, path, nstrains,
                         branch_length_cutoff=500, paralog_cutoff=0.5, plot=False):
    """
    Call postprocess_paralogs repeatedly until it reports that no cluster
    was split, then refresh the diversity file and the gene cluster file.

    The first round is given an empty set; each later round is given the
    set returned by the previous round. All other arguments are forwarded
    unchanged to postprocess_paralogs.
    """
    cluster_path = path + 'protein_faa/diamond_matches/'
    diamond_geneCluster_dt = load_pickle(cluster_path + 'orthamcl-allclusters.cpk')

    new_fa_files_set = set()
    iteration = 0
    while True:
        n_split_clusters, new_fa_files_set = postprocess_paralogs(
            parallel, path, nstrains,
            diamond_geneCluster_dt,
            new_fa_files_set,
            branch_length_cutoff=branch_length_cutoff,
            paralog_cutoff=paralog_cutoff,
            plot=plot)
        if not n_split_clusters:
            ## nothing was split in this round: done
            break
        print('---- split a total of ',n_split_clusters, 'in iteration', iteration)
        iteration += 1

    ## write gene_diversity_Dt cpk file
    update_diversity_cpk_file(path)

    ## remove old gene cluster and create new split cluster
    update_gene_cluster(path, diamond_geneCluster_dt)

Exemplo n.º 3

0

Exibir arquivo

def make_genepresence_alignment(path):
    '''
    Build the gene presence/absence pseudo alignment.

    Loops over all gene clusters (in the order given by
    load_sorted_clusters) and lets create_genePresence extend the
    per-strain entries of dt_strainGene. Each entry is then joined into
    one string, written as a fasta record to
    path+'geneCluster/genePresence.aln', and the resulting dict is pickled
    to path+'geneCluster/dt_genePresence.cpk'.
    '''
    output_path = '%s%s' % (path, 'geneCluster/')

    ## load strain list and prepare for gene presence/absence
    strain_list = load_pickle(path + 'strain_list.cpk')
    set_totalStrain = set(strain_list)
    totalStrain = len(set_totalStrain)
    dt_strainGene = defaultdict(list)

    ## sorted clusters: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    for clusterID, gene in load_sorted_clusters(path):
        gene_list = gene[1]
        ## append 0/1 to each strain
        dt_strainGene = create_genePresence(dt_strainGene, totalStrain,
                                            set_totalStrain, gene_list)

    with open(output_path + 'genePresence.aln', 'wb') as presence_outfile:
        for istkey in dt_strainGene:
            ## replace the per-strain list by its joined string; only
            ## existing keys are reassigned, so iteration stays valid
            dt_strainGene[istkey] = ''.join(dt_strainGene[istkey])
            write_in_fa(presence_outfile, istkey, dt_strainGene[istkey])

    write_pickle(output_path + 'dt_genePresence.cpk', dt_strainGene)

Exemplo n.º 4

0

Exibir arquivo

Arquivo: SF06_geneCluster_align_makeTree.py Projeto: thackl/pan-genome-analysis

    def create_geneCluster_fa():
        """ dict storing amino_acid Id/Seq from '.faa' files
            input: '.faa', '_gene_nuc_dict.cpk', '-orthamcl-allclusters.cpk'
            output:
        """
        ## make sure the geneCluster folder is empty
        if os.path.isdir(path+'geneCluster/')==True:
            print 'remove previous folder: ',path+'geneCluster/'
            os.system('rm -rf %s'%(path+'geneCluster/'))

        faa_path=path+'protein_faa/'
        ## dict storing all genes' translation
        gene_aa_dict=defaultdict(list)
        for ifaa in glob.glob(faa_path+"*.faa"):
            gene_aa_dict.update(read_fasta(ifaa))

        ## dict storing nucleotide Id/Seq from '_gene_nuc_dict.cpk' files
        istrain_cpk={}; strain_list= load_pickle(path+'strain_list.cpk');
        nucleotide_dict_path= '%s%s'%(path,'nucleotide_fna/')
        for istrain in strain_list:
            istrain_cpk[istrain]=load_pickle(nucleotide_dict_path+istrain+'_gene_nuc_dict.cpk')

        ## load gene cluster cpk file
        geneCluster_path=faa_path+'diamond_matches/'
        diamond_geneCluster_dt=load_pickle(geneCluster_path+'orthamcl-allclusters.cpk')

        ## load geneID_to_geneSeqID geneSeqID cpk file
        geneID_to_geneSeqID_dict=load_pickle(path+'geneID_to_geneSeqID.cpk')

        ## create cluster-genes fasta files
        fasta_path=path+'geneCluster/'; os.system('mkdir '+fasta_path)
        ## diamond_geneCluster_dt: {clusterID:[ count_strains,[memb1,...],count_genes }
        for clusterID, gene in diamond_geneCluster_dt.iteritems():
            ## geneCluster file name
            gene_cluster_nu_filename="%s%s"%(clusterID,'.fna')
            gene_cluster_aa_filename="%s%s"%(clusterID,'.faa')
            gene_cluster_nu_write=open( fasta_path+gene_cluster_nu_filename, 'wb')
            gene_cluster_aa_write=open( fasta_path+gene_cluster_aa_filename, 'wb')
            ## write nucleotide/amino_acid sequences into geneCluster files
            for gene_memb in gene[1]:
                ## gene_name format: strain_1|locusTag
                strain_name= gene_memb.split('|')[0]
                gene_memb_seq=str(istrain_cpk[strain_name][gene_memb])
                geneSeqID=geneID_to_geneSeqID_dict[gene_memb]
                write_in_fa(gene_cluster_nu_write, geneSeqID, gene_memb_seq )
                write_in_fa(gene_cluster_aa_write,geneSeqID, gene_aa_dict[gene_memb])
            gene_cluster_nu_write.close(); gene_cluster_aa_write.close();

Exemplo n.º 5

0

Exibir arquivo

Arquivo: SF06_3_clusterRNA.py Projeto: thackl/pan-genome-analysis

def create_RNACluster_fa(path):
    """
        Write one nucleotide fasta file per RNA cluster and return the
        RNA cluster dict.
        input: 'strain_list.cpk', 'nucleotide_fna/<strain>_RNA_nuc_dict.cpk',
               'RNA_fna/orthamcl-allclusters.cpk', 'RNAID_to_SeqID.cpk'
        output: path+'geneCluster/<clusterID>.fna' for every RNA cluster;
                the dict loaded from 'RNA_fna/orthamcl-allclusters.cpk'
                is returned unchanged
    """
    ## dict storing nucleotide Id/Seq from '_RNA_nuc_dict.cpk' files
    strain_list = load_pickle(path + 'strain_list.cpk')
    nucleotide_dict_path = '%s%s' % (path, 'nucleotide_fna/')
    istrain_cpk = {}
    for istrain in strain_list:
        istrain_cpk[istrain] = load_pickle(nucleotide_dict_path + istrain +
                                           '_RNA_nuc_dict.cpk')

    ## load RNA cluster cpk file
    RNACluster_path = path + 'RNA_fna/'
    diamond_RNACluster_dt = load_pickle(RNACluster_path +
                                        'orthamcl-allclusters.cpk')

    ## load RNAID_to_RNASeqID RNASeqID cpk file
    RNAID_to_RNASeqID_dict = load_pickle(path + 'RNAID_to_SeqID.cpk')

    ## create cluster-RNAs fasta files (By default: put RNAs in geneCluster folder)
    ## the geneCluster folder is used as it is; nothing is removed here
    fasta_path = path + 'geneCluster/'
    ## diamond_RNACluster_dt: {clusterID:[ count_strains,[memb1,...],count_RNAs }
    for clusterID, RNA in diamond_RNACluster_dt.iteritems():
        ## RNACluster file name
        RNA_cluster_nu_filename = "%s%s" % (clusterID, '.fna')
        ## 'with' closes the file even if a lookup below raises
        with open(fasta_path + RNA_cluster_nu_filename,
                  'wb') as RNA_cluster_nu_write:
            for RNA_memb in RNA[1]:
                ## RNA_name format: strain_1|locusTag
                strain_name = RNA_memb.split('|')[0]
                RNA_memb_seq = str(istrain_cpk[strain_name][RNA_memb])
                RNASeqID = RNAID_to_RNASeqID_dict[RNA_memb]
                write_in_fa(RNA_cluster_nu_write, RNASeqID, RNA_memb_seq)
    return diamond_RNACluster_dt

Exemplo n.º 6

0

Exibir arquivo

Arquivo: SF11_tree_metadata_export.py Projeto: thackl/pan-genome-analysis

def json_parser(path, species, meta_info_file_path):
    """ create json files for web visualization
        input: geneCluster/tree_result.newick, geneCluster/dt_genePresence.cpk,
               metainfo_curated.tsv (copied from meta_info_file_path unless
               that argument is the string 'none')
        output: coreGenomeTree.json and coreGenomeTree-noBranch.json, plus
                whatever json_tnt_parser() writes
        side effects: changes the working directory to path+'geneCluster/'
                and does not restore it; links *.cpk files into the parent
                folder and moves json/aln files into ../vis/ via shell commands
    """
    from ete2 import Tree
    metaFile = path + 'metainfo_curated.tsv'
    if meta_info_file_path != 'none':
        ## copy the user meta_info_file to the expected location
        os.system('pwd')
        os.system('cp %s %s' % (meta_info_file_path, metaFile))

    output_path = '%s%s' % (path, 'geneCluster/')
    tree = Tree(output_path + 'tree_result.newick', format=1)
    dt_genePresence = load_pickle(path + 'geneCluster/dt_genePresence.cpk')
    ## create tree json files; the 4th argument (0/1) selects the variant
    ## (presumably with / without branch lengths, judging by the file
    ## names below -- confirm against create_json_addLabel)
    jsonString = json.dumps(
        create_json_addLabel(species, dt_genePresence, tree, 0, path,
                             metaFile))
    jsonString1 = json.dumps(
        create_json_addLabel(species, dt_genePresence, tree, 1, path,
                             metaFile))
    os.chdir(output_path)
    with open('coreGenomeTree.json', 'wb') as write_json:
        write_json.write(jsonString)
    with open('coreGenomeTree-noBranch.json', 'wb') as write_json1:
        write_json1.write(jsonString1)

    ## create tnt-nodeAttri-dataTable.json and tnt-nodeAttri.json for tree tables
    json_tnt_parser()

    ## move all *.cpk file to ./data/YourSpecies/ folder
    ##      coreGenomeTree.json and strainMetainfo.json file to ./data/YourSpecies/vis/ folder
    ##      GC*json file to ./data/YourSpecies/vis/geneCluster/ folder
    current_path = os.getcwd()
    os.system('ln -sf %s/*.cpk %s/../' % (current_path, current_path))
    os.system(
        'mv coreGenomeTree.json strainMetainfo.json geneGainLossEvent.json ../vis/;'
    )
    os.system('mv GC*.aln GC*_tree.json ../vis/geneCluster/;')
    print(
        'Pan-genome analysis is finished, your data can be transfered to the local server for data visualization and exploration via link-to-server.py in the main folder.'
    )

Exemplo n.º 7

0

Exibir arquivo

Arquivo: SF06_2_unclustered_genes.py Projeto: thackl/pan-genome-analysis

def postprocess_unclustered_genes(parallel,
                                  path,
                                  nstrains,
                                  window_size_smoothed=5,
                                  strain_proportion=0.3,
                                  sigma_scale=3):
    """
    Merge unclustered genes, cut the merged clusters into new clusters and
    rewrite the gene cluster and diversity files.

    window_size_smoothed, strain_proportion and sigma_scale are forwarded
    unchanged to find_and_merge_unclustered_genes; parallel is forwarded to
    cut_tree_from_merged_clusters.
    """
    # 1) detect suspicious peaks in the distribution of average length of genes in gene cluster (aa count)
    #    np.bincount([1,2,3,34,3]) -> how often each entry is found
    #    np.convolve([1,1,1,1,1], gene_length_count)
    #   ->  unclustered genes will contribute many small clusters (size 1)
    #       that result in peaks in the distribution
    # 2) for each peak detected, align the sequences of all genes in clusters in peak
    # 3) to cluster aligned genes, build tree. However, to ensure long branches
    #   ->  between unaligned sub-alignment, fill gaps with random sequence (skipped, not tested)
    #       importantly, this random sequence needs to be the same in different columns of the alignment.
    #           - random sequence a la rseq = [alpha[ii] for ii in np.random.randint(len(alpha), size=aln.get_aligmentlength())]
    #           - for seq in aln: seq[seq=='-'] = rseq[seq=='-']
    # 4) make and split tree at branches >.5
    # 5) for each subtree (ideally only one big tree), define new gene cluster and run
    #    maketree_align from standard step 6

    ## load clusters
    geneClusterPath = '%s%s' % (path, 'protein_faa/diamond_matches/')
    diamond_geneCluster_dt = load_pickle(geneClusterPath +
                                         'orthamcl-allclusters_final.cpk')
    ## merge unclustered genes
    merged_clusters_dict = find_and_merge_unclustered_genes(
        path, nstrains, window_size_smoothed, strain_proportion, sigma_scale)
    ## cut tree and make new clusters
    cut_tree_from_merged_clusters(parallel, path, diamond_geneCluster_dt,
                                  merged_clusters_dict)
    ## back up orthamcl-allclusters_final.cpk as '.bk' before rewriting it
    os.system(
        'cp %sorthamcl-allclusters_final.cpk %s/orthamcl-allclusters_final.cpk.bk '
        % (geneClusterPath, geneClusterPath))
    ## write new clusters in orthamcl-allclusters_final.cpk
    update_gene_cluster(path, diamond_geneCluster_dt)
    ## write gene_diversity_Dt cpk file
    update_diversity_cpk_file(path)

Exemplo n.º 8

0

Exibir arquivo

Arquivo: SF06_geneCluster_align_makeTree.py Projeto: thackl/pan-genome-analysis

def load_sorted_clusters(path):
    '''
    load gene clusters and sort 1st by abundance and then by clusterID

    Reads path+'protein_faa/diamond_matches/orthamcl-allclusters_final.cpk'
    and returns a list of (clusterID, value) tuples, ordered by decreasing
    value[0] and, for ties, by increasing clusterID (e.g. GC_00001 first).
    '''
    geneClusterPath='%s%s'%(path,'protein_faa/diamond_matches/')
    diamond_geneCluster_dt=load_pickle(geneClusterPath+'orthamcl-allclusters_final.cpk')
    # negate the abundance so that an ascending sort yields decreasing
    # abundance; a single-argument lambda and .items() are valid on both
    # Python 2 and Python 3 (tuple parameters were removed in Python 3)
    return sorted(diamond_geneCluster_dt.items(),
                  key=lambda item: (-item[1][0], item[0]))

#=============================================#
# postprocessing unclustered genes (peaks)     
#=============================================#

# #from SF06_2_unclustered_genes import find_and_merge_unclustered_genes
# from SF06_2_unclustered_genes import find_and_merge_unclustered_genes, cut_tree_from_merged_clusters

# def postprocess_unclustered_genes(n_threads, path, nstrains, window_size=5, strain_proportion=0.3 , sigma_scale=3):
#     diamond_geneCluster_dt=load_pickle(geneClusterPath+'orthamcl-allclusters_final.cpk')
#     find_and_merge_unclustered_genes(n_threads, path, nstrains, window_size, strain_proportion , sigma_scale)
#     cut_tree_from_merged_clusters(path)

Exemplo n.º 9

0

Exibir arquivo

Arquivo: run-pipeline.py Projeto: thackl/pan-genome-analysis

print path
species=strain_list.split('-RefSeq')[0]

def load_strains():
    """ load input strains in strain_list """
    if os.path.isfile(path+strain_list):
        with open(path+strain_list,'rb') as infile:
            write_pickle(path+'strain_list.cpk', [ ist.rstrip().split('.gbk')[0] for ist in infile] )

if 1 in params.steps: #step 01:
    load_strains()
    print 'step01-refSeq strain list successfully found.'

## load strain_list.cpk file and give the total number of strains
if os.path.isfile(path+'strain_list.cpk'):
    strain_lst= load_pickle(path+'strain_list.cpk')
nstrains =len([ istrain for istrain in strain_lst ])

if 2 in params.steps:# step02:
    start = time.time()
    accessionID_single(path, strain_lst)
    print 'step02-download NCBI refseq GenBank file from strain list:'
    print times(start)

if 3 in params.steps:# step03:
    start = time.time()
    diamond_input(path, strain_lst, params.disable_RNA_clustering)
    print 'step03-create input file for Diamond from genBank file (.gb):'
    print times(start)

if 4 in params.steps:# step04:

Exemplo n.º 10

0

Exibir arquivo

Arquivo: SF07_core_SNP_matrix.py Projeto: thackl/pan-genome-analysis

def create_core_SNP_matrix(path):
    """ create SNP matrix using core gene SNPs
        input: strain_list.cpk, the sorted gene clusters and the
               '<clusterID>_na.aln' alignments in path+'geneCluster/'
        output (all in path+'geneCluster/'):
               core_geneList.txt / core_geneList.cpk -- alignments used
               snp_pos.cpk -- {alignment file: array of SNP column indices}
               SNP_whole_matrix.aln -- concatenated SNP columns, one fasta
                                       record per strain
    """
    import os
    import numpy as np
    from collections import defaultdict
    from SF00_miscellaneous import read_fasta, write_pickle, load_pickle, write_in_fa

    alnFilePath = '%s%s' % (path, 'geneCluster/')
    output_path = alnFilePath

    refSeqList = load_pickle(path + 'strain_list.cpk')
    totalStrain = len(refSeqList)

    ## create core gene list: clusters whose strain count (vg[0]) and gene
    ## count (vg[2]) both equal the number of strains
    corelist = []
    with open(output_path + 'core_geneList.txt', 'wb') as outfile:
        for clusterID, vg in load_sorted_clusters(path):
            if vg[0] != totalStrain or vg[2] != totalStrain:
                continue
            coreGeneName = '%s%s' % (clusterID, '_na.aln')
            ## sequences might be discarded because of premature stops
            coreGeneName_path = alnFilePath + coreGeneName
            if os.path.exists(coreGeneName_path) and len(
                    read_fasta(coreGeneName_path)) == totalStrain:
                outfile.write(coreGeneName + '\n')
                corelist.append(coreGeneName)
            else:
                print('%s%s%s' % ('warning: ', coreGeneName_path,
                                  ' is not a core gene'))
    write_pickle(output_path + 'core_geneList.cpk', corelist)

    ## NOTE(review): matrix rows follow the alignment records sorted by the
    ## part of their name before '|', while the labels come from the sorted
    ## strain list -- this assumes both orders agree; confirm
    refSeqList.sort()

    snp_pos_dt = defaultdict(list)
    snp_blocks = []  # SNP columns of each core gene, stacked once at the end
    for align_file in corelist:  ## all core genes
        fa_dt = read_fasta(alnFilePath + align_file)
        fa_sorted_lst = sorted(fa_dt.items(), key=lambda x: x[0].split('|')[0])
        ## one row per sequence, one column per alignment position; a
        ## single vstack always gives a 2-D array, even for one sequence
        nuc_array = np.vstack(
            [np.fromstring(seq, dtype='S1') for _, seq in fa_sorted_lst])

        ## columns where at least one row differs from the first row
        position_polymorphic = np.where(
            ~np.all(nuc_array == nuc_array[0, :], axis=0))[0]
        position_has_gap = np.where(np.any(nuc_array == '-', axis=0))[0]
        ## SNPs are polymorphic columns without any gap
        position_SNP = np.setdiff1d(position_polymorphic, position_has_gap)
        snp_pos_dt[align_file] = position_SNP
        snp_blocks.append(nuc_array[:, position_SNP])

    if snp_blocks:
        snp_whole_matrix = np.hstack(snp_blocks)
    else:
        ## no core gene: nothing is written to the alignment below
        snp_whole_matrix = np.array([])

    write_pickle(output_path + 'snp_pos.cpk', snp_pos_dt)

    with open(output_path + 'SNP_whole_matrix.aln', 'wb') as outfile:
        for ind, isw in enumerate(snp_whole_matrix):
            write_in_fa(outfile, refSeqList[ind], isw.tostring())

Exemplo n.º 11

0

Exibir arquivo

def _collection_year(datacolct):
    """
    Return the year part of a GenBank 'collection_date' value, or
    'unknown' for an empty value.

    Handled layouts: 15-Sep-2011, Sep-2011, 2014-03-14, 2010-05 and a bare
    run of digits. The date is first normalised to year-month-day and the
    year is then taken from it. Unparsable values raise IndexError /
    ValueError.
    """
    import calendar
    import re

    datacolct = ''.join(datacolct.split('-'))
    if datacolct == '':
        return 'unknown'
    dates = re.findall(r'\d+', datacolct)
    if any(ic.isalpha() for ic in datacolct):
        ## textual month: 15Sep2011 (9 chars) or e.g. Sep2011
        month_abbr = re.findall('[a-zA-Z]+', datacolct)[0]
        month = str(list(calendar.month_abbr).index(month_abbr)).zfill(2)
        if len(datacolct) == 9:
            datacolct = dates[1] + '-' + month + '-' + dates[0]
        else:
            datacolct = dates[0] + '-' + month + '-01'  #artificial day 01
    elif len(datacolct) == 8:  #'2014-03-14'
        datacolct = '%s-%s-%s' % (dates[0][:4], dates[0][4:6], dates[0][6:])
    elif len(datacolct) == 6:  #'2010-05'
        datacolct = '%s-%s-01' % (dates[0][:4], dates[0][4:6])
    else:
        datacolct = dates[0] + '-01-01'
    # just get the year
    return datacolct.split('-')[0]


def gbk_To_Metainfo(path):
    """
    extract metainfo (date/country) from genBank file
    This step is not necessary if the user provides a tab-delimited
    meta-information table as path/"metainfo_curated.tsv"
    Input: genBank files path/input_GenBank/<strain>.gbk for every strain
           in strain_list.cpk
    Output: path/metainfo_curated.tsv with one line per GenBank record and
            the columns accName, strainName, collection_date (year only),
            country, host
    """
    import os
    from Bio import SeqIO
    from SF00_miscellaneous import load_pickle
    each_gbk_path = '%s%s' % (path, 'input_GenBank/')
    strainList = load_pickle(path + 'strain_list.cpk')
    with open(path + 'metainfo.tsv', 'wb') as writeseq:
        # write the headers:
        writeseq.write("%s\n" % ('\t'.join(
            ['accName', 'strainName', 'collection_date', 'country', 'host'])))
        # check each genBank file to get meta-type
        for eachstrain in strainList:
            with open(each_gbk_path + eachstrain + '.gbk') as gbk_handle:
                for record in SeqIO.parse(gbk_handle, "genbank"):
                    ## defaults, so a record without a 'source' feature
                    ## neither fails nor reuses the previous record's values
                    strainName = host = country = datacolct = 'unknown'
                    for feature in record.features:
                        if feature.type != 'source':
                            continue
                        qualifiers = feature.qualifiers
                        if 'strain' in qualifiers:
                            strainName = qualifiers['strain'][0]
                        if 'host' in qualifiers:
                            host = qualifiers['host'][0]
                        if 'country' in qualifiers:
                            #USA: New... -> USA
                            country = qualifiers['country'][0].split(':')[0]
                        if 'collection_date' in qualifiers:
                            datacolct = _collection_year(
                                qualifiers['collection_date'][0])
                        ## only the first 'source' feature is used
                        break
                    writeseq.write("%s\n" % ('\t'.join(
                        [eachstrain, strainName, datacolct, country, host])))
    os.rename(path + 'metainfo.tsv', path + 'metainfo_curated.tsv')

Exemplo n.º 12

0

Exibir arquivo

def geneCluster_to_json(path, disable_RNA_clustering):
    """
    create json file for gene cluster table visualization
    input:  path to genecluster output; disable_RNA_clustering (when it
            equals 0 the RNA descriptions from RNAID_to_description.cpk
            are merged into the gene descriptions)
    output: path+'vis/geneCluster.json', a JSON array with one object per
            gene cluster in the order given by load_sorted_clusters

    NOTE(review): the records are assembled with plain string formatting,
    so a double quote or backslash inside an annotation or gene name would
    produce invalid JSON unless the consolidate_* helpers already sanitise
    their output -- confirm.
    """
    # load geneID_to_description_dict
    geneID_to_description_dict = load_pickle(path +
                                             'geneID_to_description.cpk')
    if disable_RNA_clustering == 0:
        # load RNAID_to_description_file
        geneID_to_description_dict.update(
            load_pickle(path + 'RNAID_to_description.cpk'))
    output_path = '%s%s' % (path, 'geneCluster/')
    visualzition_path = '%s%s' % (path, 'vis/')
    os.system('mkdir %s; mkdir %sgeneCluster/' %
              (visualzition_path, visualzition_path))
    gene_diversity_Dt = load_pickle(output_path + 'gene_diversity.cpk')

    ## sorted clusters
    ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    sorted_genelist = load_sorted_clusters(path)

    ## load gain/loss event count dictionary
    dt_geneEvents = load_pickle(output_path + 'dt_geneEvents.cpk')

    ## template of one json record
    newline = '{"geneId":%d,"geneLen":%d,"count": %d,"dupli":"%s","dup_detail": "%s","ann":"%s","msa":"%s","divers":"%s","event":"%s","allAnn":"%s", "GName":"%s", "allGName":"%s", "locus":"%s"}'

    ## 'with' closes the output file even if a cluster fails below
    with open(visualzition_path + 'geneCluster.json',
              'wb') as write_file_lst_json:
        write_file_lst_json.write('[')
        for gid, (clusterID, gene) in enumerate(sorted_genelist):
            strain_count, gene_list, gene_count = gene
            if gid:
                ## separator between records (not before the first one)
                write_file_lst_json.write(',\n')

            ## annotation majority
            allAnn, majority_annotation = consolidate_annotation(
                path, gene_list, geneID_to_description_dict)

            ## geneName majority
            all_geneName, majority_geneName = consolidate_geneName(
                path, gene_list, geneID_to_description_dict)

            ## extract gain/loss event count
            gene_event = dt_geneEvents[gid]

            ## average length of the sequences in the cluster's .fna file
            geneLength_list = [
                len(igene) for igene in read_fasta(output_path + '%s%s' %
                                                   (clusterID, '.fna')).values()
            ]
            geneClusterLength = sum(geneLength_list) // len(geneLength_list)

            ## msa
            geneCluster_aln = '%s%s' % (clusterID, '_aa.aln')

            ## check for duplicates
            if gene_count > strain_count:
                duplicated_state = 'yes'
                dup_list = [ig.split('|')[0] for ig in gene_list]
                # "#" to delimit (gene/gene_count)key/value ; "@" to seperate genes
                # Counter({'g1': 2, 'g2': 1})
                dup_detail = '@'.join([
                    '%s#%s' % (kd, vd)
                    for kd, vd in dict(Counter(dup_list)).items() if vd > 1
                ])
            else:
                duplicated_state = 'no'
                dup_detail = ''

            ## locus_tag
            locus_tag_strain = ' '.join(gene_list)

            ## write json
            write_file_lst_json.write(
                newline %
                (gid + 1, geneClusterLength, strain_count, duplicated_state,
                 dup_detail, majority_annotation, geneCluster_aln,
                 gene_diversity_Dt[clusterID], gene_event, allAnn,
                 majority_geneName, all_geneName, locus_tag_strain))
        write_file_lst_json.write(']')