示例#1
0
def make_genepresence_alignment(path):
    '''
    Build a pseudo alignment of gene presence/absence.

    Every gene cluster (in the order given by load_sorted_clusters) is
    passed to create_genePresence, which extends the per-strain entry
    lists; the entries of each strain are then joined into one string.
    The strings are written in fasta format to
    geneCluster/genePresence.aln and the strain -> string dictionary is
    pickled to geneCluster/dt_genePresence.cpk.
    '''
    output_path = path + 'geneCluster/'

    ## strains for which presence/absence is recorded
    set_totalStrain = set(load_pickle(path + 'strain_list.cpk'))
    totalStrain = len(set_totalStrain)

    ## strain -> list of per-cluster entries (joined into a string below)
    presence_by_strain = defaultdict(list)

    ## load_sorted_clusters yields:
    ## [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    for _clusterID, cluster_info in load_sorted_clusters(path):
        members = cluster_info[1]
        presence_by_strain = create_genePresence(
            presence_by_strain, totalStrain, set_totalStrain, members)

    with open(output_path + 'genePresence.aln', 'wb') as presence_outfile:
        for strain in presence_by_strain:
            pattern = ''.join(presence_by_strain[strain])
            ## keep the joined string: this dict is pickled afterwards
            presence_by_strain[strain] = pattern
            write_in_fa(presence_outfile, strain, pattern)

    write_pickle(output_path + 'dt_genePresence.cpk', presence_by_strain)
def create_split_cluster_files(file_path, fname,
                               gene_list1, gene_list2, diamond_geneCluster_dt):
    """
    delete the old cluster and create two new clusters

    The entry of the original cluster is removed from
    diamond_geneCluster_dt. For each of the two gene lists a new cluster
    '<clusterID>_1' / '<clusterID>_2' is created: its sequences are copied
    from the original .fna/.faa files into new fasta files and an entry
    [num_strains, [gene members], num_genes] is added to
    diamond_geneCluster_dt.

    params:
        file_path: folder (including trailing separator) with the cluster
            fasta files; the new files are written there as well
        fname: path of the tree file (.nwk) of the cluster to be split
        gene_list1/2: lists containing the genes in the new split clusters
            (geneSeqIDs, i.e. the keys of the cluster fasta files)
        diamond_geneCluster_dt: cluster dictionary to be updated in place
    returns:
        set with the paths of the newly written nucleotide (.fna) files
    """
    orgin_nwk_name = fname.split('/')[-1]
    clusterID = orgin_nwk_name.replace('.nwk','')
    origin_cluster_nu_fa = orgin_nwk_name.replace('nwk','fna')
    origin_cluster_aa_fa = orgin_nwk_name.replace('nwk','faa')

    split_fa_files_set = set()
    ## a missing key is the only failure that is tolerated here;
    ## anything else should not be silently swallowed
    try:
        print('deleting:',orgin_nwk_name,gene_list1,gene_list2, clusterID)
        del diamond_geneCluster_dt[clusterID]
    except KeyError:
        print("can't delete",orgin_nwk_name,gene_list1,gene_list2, clusterID)

    ## sequences of the original cluster, keyed by geneSeqID
    origin_nu_fa_dt = read_fasta(file_path+origin_cluster_nu_fa)
    origin_aa_fa_dt = read_fasta(file_path+origin_cluster_aa_fa)

    ## split_gene_list has geneSeqID instead of geneID
    split_gene_lists = (list(gene_list1), list(gene_list2))
    for sgs_index, split_gene_list in enumerate(split_gene_lists, 1):
        newClusterId = "%s_%s" % (clusterID, sgs_index)
        nu_file = file_path + newClusterId + '.fna'
        aa_file = file_path + newClusterId + '.faa'
        split_fa_files_set.add(nu_file)

        ## write new split cluster files; 'with' closes both handles even
        ## if a member is missing from the original fasta files
        with open(nu_file, 'wb') as nu_write, open(aa_file, 'wb') as aa_write:
            for gene_memb in split_gene_list:
                write_in_fa(nu_write, gene_memb, origin_nu_fa_dt[gene_memb])
                write_in_fa(aa_write, gene_memb, origin_aa_fa_dt[gene_memb])

        diamond_geneCluster_dt[newClusterId] = [
            ## num_strains: the strain name precedes the '|' separator
            len(set(ig.split('|')[0] for ig in split_gene_list)),
            ## gene members: geneSeqID cut at the first '-'
            [ig.split('-')[0] for ig in split_gene_list],
            ## num_genes: number of distinct members
            len(set(split_gene_list)),
        ]
    return split_fa_files_set
    def create_geneCluster_fa():
        """ Write one nucleotide (.fna) and one amino acid (.faa) fasta file
            per gene cluster into the folder path+'geneCluster/'.

            input: protein_faa/*.faa, nucleotide_fna/<strain>_gene_nuc_dict.cpk,
                   protein_faa/diamond_matches/orthamcl-allclusters.cpk,
                   strain_list.cpk, geneID_to_geneSeqID.cpk
            output: geneCluster/<clusterID>.fna and geneCluster/<clusterID>.faa

            NOTE(review): this function takes no arguments and reads `path`
            from an outer scope; as placed here it is also defined after the
            enclosing function's return statement. Confirm the intended
            location and signature.
        """
        ## make sure the geneCluster folder is empty
        if os.path.isdir(path+'geneCluster/')==True:
            print 'remove previous folder: ',path+'geneCluster/'
            os.system('rm -rf %s'%(path+'geneCluster/'))

        faa_path=path+'protein_faa/'
        ## dict storing all genes' translation
        ## (defaultdict: an unknown gene yields an empty list, not a KeyError)
        gene_aa_dict=defaultdict(list)
        for ifaa in glob.glob(faa_path+"*.faa"):
            gene_aa_dict.update(read_fasta(ifaa))

        ## dict storing nucleotide Id/Seq from '_gene_nuc_dict.cpk' files
        ## istrain_cpk: {strain: content of that strain's pickled dict}
        istrain_cpk={}; strain_list= load_pickle(path+'strain_list.cpk');
        nucleotide_dict_path= '%s%s'%(path,'nucleotide_fna/')
        for istrain in strain_list:
            istrain_cpk[istrain]=load_pickle(nucleotide_dict_path+istrain+'_gene_nuc_dict.cpk')

        ## load gene cluster cpk file
        geneCluster_path=faa_path+'diamond_matches/'
        diamond_geneCluster_dt=load_pickle(geneCluster_path+'orthamcl-allclusters.cpk')

        ## load geneID_to_geneSeqID geneSeqID cpk file
        geneID_to_geneSeqID_dict=load_pickle(path+'geneID_to_geneSeqID.cpk')

        ## create cluster-genes fasta files
        ## (the output folder is created via a shell call)
        fasta_path=path+'geneCluster/'; os.system('mkdir '+fasta_path)
        ## diamond_geneCluster_dt: {clusterID:[ count_strains,[memb1,...],count_genes }
        for clusterID, gene in diamond_geneCluster_dt.iteritems():
            ## geneCluster file name
            gene_cluster_nu_filename="%s%s"%(clusterID,'.fna')
            gene_cluster_aa_filename="%s%s"%(clusterID,'.faa')
            gene_cluster_nu_write=open( fasta_path+gene_cluster_nu_filename, 'wb')
            gene_cluster_aa_write=open( fasta_path+gene_cluster_aa_filename, 'wb')
            ## write nucleotide/amino_acid sequences into geneCluster files
            for gene_memb in gene[1]:
                ## gene_name format: strain_1|locusTag
                strain_name= gene_memb.split('|')[0]
                gene_memb_seq=str(istrain_cpk[strain_name][gene_memb])
                ## sequences are written under their geneSeqID, not the geneID
                geneSeqID=geneID_to_geneSeqID_dict[gene_memb]
                write_in_fa(gene_cluster_nu_write, geneSeqID, gene_memb_seq )
                write_in_fa(gene_cluster_aa_write,geneSeqID, gene_aa_dict[gene_memb])
            gene_cluster_nu_write.close(); gene_cluster_aa_write.close();
def create_RNACluster_fa(path):
    """
        input: '.fna', '_RNA_nuc_dict.cpk', '-orthamcl-allclusters.cpk'
        output: '.aln', 'tree.json', etc
    """
    if 0:
        ## make sure the RNACluster folder is empty
        if os.path.isdir(path + 'RNACluster/') == True:
            print 'remove previous folder: ', path + 'RNACluster/'
            os.system('rm -rf %s' % (path + 'RNACluster/'))

    ## dict storing nucleotide Id/Seq from '_RNA_nuc_dict.cpk' files
    istrain_cpk = {}
    strain_list = load_pickle(path + 'strain_list.cpk')
    nucleotide_dict_path = '%s%s' % (path, 'nucleotide_fna/')
    for istrain in strain_list:
        istrain_cpk[istrain] = load_pickle(nucleotide_dict_path + istrain +
                                           '_RNA_nuc_dict.cpk')

    ## load RNA cluster cpk file
    RNACluster_path = path + 'RNA_fna/'
    diamond_RNACluster_dt = load_pickle(RNACluster_path +
                                        'orthamcl-allclusters.cpk')

    ## load RNAID_to_RNASeqID RNASeqID cpk file
    RNAID_to_RNASeqID_dict = load_pickle(path + 'RNAID_to_SeqID.cpk')

    ## create cluster-RNAs fasta files (By default: put RNAs in geneCluster folder)
    fasta_path = path + 'geneCluster/'
    ## diamond_RNACluster_dt: {clusterID:[ count_strains,[memb1,...],count_RNAs }
    for clusterID, RNA in diamond_RNACluster_dt.iteritems():
        ## RNACluster file name
        RNA_cluster_nu_filename = "%s%s" % (clusterID, '.fna')
        RNA_cluster_nu_write = open(fasta_path + RNA_cluster_nu_filename, 'wb')
        ## write nucleotide/amino_acid sequences into RNACluster files
        for RNA_memb in RNA[1]:
            ## RNA_name format: strain_1|locusTag
            strain_name = RNA_memb.split('|')[0]
            RNA_memb_seq = str(istrain_cpk[strain_name][RNA_memb])
            RNASeqID = RNAID_to_RNASeqID_dict[RNA_memb]
            write_in_fa(RNA_cluster_nu_write, RNASeqID, RNA_memb_seq)
        RNA_cluster_nu_write.close()
    return diamond_RNACluster_dt
def _write_msa_compatible_copy(sequences, out_filename):
    """
    Write sequences (dict: SeqID -> sequence) to out_filename in fasta
    format; the geneSeqID separator '|' is replaced by '-' for msa viewer
    compatibility.
    """
    with open(out_filename, 'wb') as write_file:
        for SeqID, Sequence in sequences.items():
            write_in_fa(write_file, SeqID.replace('|', '-'), Sequence)


def align_and_makeTree(thread, alignFile_path, fa_files_list):
    """
    Align each gene cluster, build its tree and record its diversity.

    Singleton clusters are only copied to '<cluster>_na.aln' and
    '<cluster>_aa.aln' and get diversity 0.0; all other clusters are
    processed with mpm_tree (codon alignment, tree building, ancestral
    reconstruction, refinement, export). One line 'clusterID<TAB>diversity'
    per cluster is appended to alignFile_path + 'gene_diversity.txt'.
    A failure in one cluster is reported and does not stop the others.

    params:
        thread: not used inside this function
        alignFile_path: folder (including trailing separator) receiving the
            exported files and gene_diversity.txt
        fa_files_list: paths of the nucleotide cluster files ('.fna')
    """
    diversity_filename = alignFile_path + 'gene_diversity.txt'
    for gene_cluster_nu_filename in fa_files_list:
        try:
            # extract GC_00002 from path/GC_00002.fna
            clusterID = gene_cluster_nu_filename.split('/')[-1].split('.')[0]
            ## opened in append mode per cluster; 'with' makes sure the
            ## handle is flushed and closed before the next cluster
            with open(diversity_filename, 'a') as geneDiversity_file:
                nu_sequences = read_fasta(gene_cluster_nu_filename)
                if len(nu_sequences) == 1:  # nothing to do for singletons
                    ## na.aln
                    _write_msa_compatible_copy(
                        nu_sequences,
                        gene_cluster_nu_filename.replace('.fna', '_na.aln'))

                    ## aa.aln
                    gene_cluster_aa_filename = gene_cluster_nu_filename.replace('.fna', '.faa')
                    _write_msa_compatible_copy(
                        read_fasta(gene_cluster_aa_filename),
                        gene_cluster_nu_filename.replace('.fna', '_aa.aln'))

                    geneDiversity_file.write('%s\t%s\n' % (clusterID, '0.0'))

                else:  # align and build tree
                    print(gene_cluster_nu_filename)
                    myTree = mpm_tree(gene_cluster_nu_filename)
                    myTree.codon_align()
                    myTree.translate()
                    myTree.build(raxml=False)
                    myTree.ancestral(translate_tree=True)
                    myTree.refine()
                    myTree.export(path=alignFile_path)
                    myTree.diversity_statistics()
                    gene_diversity_values = '{0:.3f}'.format(myTree.diversity)
                    geneDiversity_file.write('%s\t%s\n' % (clusterID, gene_diversity_values))
        except Exception:
            ## best effort per cluster; KeyboardInterrupt/SystemExit still
            ## propagate so a running batch can be aborted
            print("Aligning and tree building of %s failed"%gene_cluster_nu_filename)
def create_split_unclustered_files(file_path, fname, gene_list,
                                   diamond_geneCluster_dt,
                                   merged_clusters_dict):
    """
    delete the unclustered file and create new clusters

    The clusters that were gathered into the unclustered file are removed
    from diamond_geneCluster_dt. For every gene list in gene_list a new
    cluster '<clusterID>_<index>' (index starting at 1) is created: its
    sequences are copied from the unclustered .fna/.faa files into new
    fasta files and an entry [num_strains, [gene members], num_genes] is
    added to diamond_geneCluster_dt.

    params:
        file_path: folder (including trailing separator) with the cluster
            fasta files; the new files are written there as well
        fname: path of the nucleotide fasta file (.fna) to be split
        gene_list: lists containing the genes in the new split clusters
        diamond_geneCluster_dt: cluster dictionary to be updated
        merged_clusters_dict: dictionary of merged clusters (key) with
            original clusterIDs (value), which is used to delete the old
            unclustered items in diamond_geneCluster_dt
    returns:
        set with the paths of the newly written nucleotide (.fna) files
    """
    origin_uncluster_nwk_name = fname.split('/')[-1]
    clusterID = origin_uncluster_nwk_name.replace('.fna', '')
    origin_uncluster_nu_fa = origin_uncluster_nwk_name
    origin_uncluster_aa_fa = origin_uncluster_nwk_name.replace('fna', 'faa')

    split_fa_files_set = set()
    ## a file name unknown to merged_clusters_dict is tolerated (nothing to
    ## delete); other errors should not be silently swallowed
    try:
        ## delete under-clustered clusters
        for cluster_needed_deletion in merged_clusters_dict[
                origin_uncluster_nwk_name]:
            if cluster_needed_deletion in diamond_geneCluster_dt:
                del diamond_geneCluster_dt[cluster_needed_deletion]
                print('deleting:', cluster_needed_deletion, ' gathered in ',
                      origin_uncluster_nwk_name)
    except KeyError:
        print("can't delete", " under_clusterd genes gathered in ",
              origin_uncluster_nwk_name)

    ## sequences of the unclustered file, keyed by geneSeqID
    origin_nu_fa_dt = read_fasta(file_path + origin_uncluster_nu_fa)
    origin_aa_fa_dt = read_fasta(file_path + origin_uncluster_aa_fa)

    ## split_gene_list has geneSeqID instead of geneID
    for sgs_index, split_gene_list in enumerate(gene_list, 1):
        newClusterId = "%s_%s" % (clusterID, sgs_index)
        nu_file = file_path + newClusterId + '.fna'
        aa_file = file_path + newClusterId + '.faa'
        split_fa_files_set.add(nu_file)

        ## write new split cluster files; 'with' closes both handles even
        ## if a member is missing from the original fasta files
        with open(nu_file, 'wb') as nu_write, open(aa_file, 'wb') as aa_write:
            for gene_memb in split_gene_list:
                write_in_fa(nu_write, gene_memb, origin_nu_fa_dt[gene_memb])
                write_in_fa(aa_write, gene_memb, origin_aa_fa_dt[gene_memb])

        diamond_geneCluster_dt[newClusterId] = [
            ## num_strains: the strain name precedes the '|' separator
            len(set(ig.split('|')[0] for ig in split_gene_list)),
            ## gene members: geneSeqID cut at the first '-'
            [ig.split('-')[0] for ig in split_gene_list],
            ## num_genes: number of distinct members
            len(set(split_gene_list)),
        ]
    return split_fa_files_set
def create_core_SNP_matrix(path):
    """ create SNP matrix using core gene SNPs

        A cluster counts as core gene when its strain count and its gene
        count both equal the number of strains and its nucleotide
        alignment '<clusterID>_na.aln' exists with exactly one sequence
        per strain. For every core gene the polymorphic, gap-free
        alignment columns are collected; the columns of all core genes
        are concatenated into one matrix with one row per strain.

        input: strain_list.cpk, sorted clusters (load_sorted_clusters),
               geneCluster/<clusterID>_na.aln
        output (all in geneCluster/):
               core_geneList.txt, core_geneList.cpk: names of the core gene
                   alignments
               snp_pos.cpk: {alignment name: array of SNP column indices}
               SNP_whole_matrix.aln: SNP matrix in fasta format
    """
    import os
    import numpy as np
    from collections import defaultdict
    from SF00_miscellaneous import read_fasta, write_pickle, load_pickle, write_in_fa

    def as_char_array(sequence):
        ## one array element per character; np.frombuffer needs a bytes-like
        ## object, so text strings are encoded first
        raw = sequence if isinstance(sequence, bytes) else sequence.encode('ascii')
        return np.frombuffer(raw, dtype='S1')

    alnFilePath = '%s%s' % (path, 'geneCluster/')
    output_path = alnFilePath

    ## sorted strain names label the rows of the SNP matrix
    refSeqList = load_pickle(path + 'strain_list.cpk')
    totalStrain = len(refSeqList)
    refSeqList.sort()

    ## create core gene list
    corelist = []
    sorted_geneList = load_sorted_clusters(path)
    with open(output_path + 'core_geneList.txt', 'wb') as outfile:
        for clusterID, vg in sorted_geneList:
            ## vg: [count_strains, [members], count_genes]
            if vg[0] == totalStrain and vg[2] == totalStrain:
                coreGeneName = '%s%s' % (clusterID, '_na.aln')
                ## sequences might be discarded because of premature stops
                coreGeneName_path = alnFilePath + coreGeneName
                if os.path.exists(coreGeneName_path) and len(
                        read_fasta(coreGeneName_path)) == totalStrain:
                    outfile.write(coreGeneName + '\n')
                    corelist.append(coreGeneName)
                else:
                    print('%s%s%s' % ('warning: ', coreGeneName_path,
                                      ' is not a core gene'))
    write_pickle(output_path + 'core_geneList.cpk', corelist)

    snp_pos_dt = defaultdict(list)
    snp_columns_by_gene = []
    for align_file in corelist:  ## all core genes
        fa_dt = read_fasta(alnFilePath + align_file)
        ## rows are ordered by the part of the sequence ID before '|'
        fa_sorted_lst = sorted(fa_dt.items(), key=lambda x: x[0].split('|')[0])
        ## stack all sequences at once; the result is 2-D even for a
        ## single sequence
        nuc_array = np.vstack([as_char_array(seq) for _, seq in fa_sorted_lst])

        ## columns in which not all rows agree with the first row
        position_polymorphic = np.where(
            ~np.all(nuc_array == nuc_array[0, :], axis=0))[0]
        position_has_gap = np.where(np.any(nuc_array == '-', axis=0))[0]
        position_SNP = np.setdiff1d(position_polymorphic, position_has_gap)
        snp_pos_dt[align_file] = position_SNP
        snp_columns_by_gene.append(nuc_array[:, position_SNP])

    ## concatenate the SNP columns of all core genes in a single step
    if snp_columns_by_gene:
        snp_whole_matrix = np.hstack(snp_columns_by_gene)
    else:
        snp_whole_matrix = np.array([])

    write_pickle(output_path + 'snp_pos.cpk', snp_pos_dt)

    with open(output_path + 'SNP_whole_matrix.aln', 'wb') as outfile:
        for ind, isw in enumerate(snp_whole_matrix):
            write_in_fa(outfile, refSeqList[ind], isw.tobytes())
示例#8
0
def _strip_locus_tag_prefix(locus_tag, strainName):
    """Remove the 'PROKKA_' marker and a leading '<strainName>_' part from locus_tag."""
    if "PROKKA" in locus_tag:
        locus_tag = locus_tag.replace('PROKKA_', '')
    strain_prefix = '%s_' % strainName
    if strain_prefix in locus_tag:
        locus_tag = locus_tag.split(strain_prefix)[1]
    return locus_tag


def gbk_translation(each_gbk_path, nucleotide_dict_path, gb_file,
                    output_filename, output_filename2,
                    geneID_to_geneSeqID_dict, geneID_to_description_dict,
                    RNAID_to_SeqID_dict, RNAID_to_description_dict,
                    disable_RNA_clustering):
    '''
    extract sequences and meta informations of all genes in one reference genbank file

    CDS features are only used when they carry both a 'product' and a
    'translation' qualifier; rRNA/tRNA features are only used when RNA
    clustering is enabled and they carry a 'product' qualifier.
    The nucleotide sequences are pickled to
    <nucleotide_dict_path><strain>_gene_nuc_dict.cpk and, with RNA
    clustering enabled, <nucleotide_dict_path><strain>_RNA_nuc_dict.cpk.

    params:
        - each_gbk_path:    path to the set of reference sequences used to construct
                            the core genome
        - nucleotide_dict_path:
                            path to the cPickled dicts of all nucleotide sequences
                            for each genome
        - gb_file:          name of the reference to be analyzed
        - output_filename:  file into which all amino acid sequences are written
                            in fasta format. needed as input for diamond
        - output_filename2: RNA nucleotide_sequences are written in fasta format.
                            Needed as RNA_blast_input
        - geneID_to_geneSeqID_dict: dictionary linking geneID to gene sequence ID
                            modified in place (key: geneID; value: geneSeqID )
        - geneID_to_description_dict: dictionary linking geneID to description info
                            modified in place (key: geneID; value: a dict with
                            the keys geneName, contig and annotation)
        - RNAID_to_SeqID_dict: dictionary linking RNAID to RNA sequence ID
                            modified in place (key: RNAID; value: SeqID )
        - RNAID_to_description_dict: dictionary linking RNAID to description info
                            modified in place (key: RNAID; value: a dict with
                            the keys geneName, contig and annotation)
        - disable_RNA_clustering: not cluster rRNA and tRNA
                            (falsy value, e.g. 0 -> cluster RNAs)
    '''

    reference_gb = '%s%s' % (each_gbk_path, gb_file)
    strainName = gb_file.split('.gbk')[0]
    gene_nuc_seq_dict = '%s%s_gene_nuc_dict.cpk' % (nucleotide_dict_path,
                                                    strainName)
    gene_nucleotide_sequences = defaultdict()

    ## evaluate the flag once so that opening the RNA output and handling
    ## RNA features can never disagree
    cluster_RNA = not disable_RNA_clustering
    if cluster_RNA:
        RNA_nuc_seq_dict = '%s%s_RNA_nuc_dict.cpk' % (nucleotide_dict_path,
                                                      strainName)
        RNA_nucleotide_sequences = defaultdict()

    aa_sequence_file = open(output_filename, 'wb')
    RNA_sequence_file = None
    try:
        if cluster_RNA:
            RNA_sequence_file = open(output_filename2, 'wb')

        ## contigs are numbered starting from 1
        for contig_index, contig in enumerate(
                SeqIO.parse(reference_gb, 'genbank'), 1):
            for feature in contig.features:
                qualifiers = feature.qualifiers
                if feature.type == 'CDS':
                    if 'product' not in qualifiers or 'translation' not in qualifiers:
                        continue
                    if 'gene' in qualifiers:
                        geneName = '%s' % (
                            qualifiers['gene'][0]).replace(' ', '_')
                    else:
                        geneName = ''
                    annotation = '_'.join(qualifiers['product'][0].split(' '))
                    trans_seq = qualifiers['translation'][0]
                    locus_tag = _strip_locus_tag_prefix(
                        qualifiers['locus_tag'][0], strainName)
                    ## geneID is composed of strain_name and locus_tag
                    ## Keeping '|' separator is important, which is used later in orthAgogue.
                    geneID = '%s|%s' % (strainName, locus_tag)
                    write_in_fa(aa_sequence_file, geneID, trans_seq)
                    geneID_to_description_dict[geneID] = {
                        'geneName': geneName,
                        'contig': contig_index,
                        'annotation': annotation
                    }
                    ## in the geneSeqID a gene name is separated from the
                    ## annotation by '_'
                    if geneName != '':
                        geneName = '%s_' % geneName
                    geneID_to_geneSeqID_dict[geneID] = '%s|%s-%d-%s%s' % (
                        strainName, locus_tag, contig_index, geneName,
                        annotation)

                    gene_nucleotide_sequences[geneID] = feature.extract(
                        contig.seq)
                elif cluster_RNA and feature.type in ('rRNA', 'tRNA'):
                    if 'product' not in qualifiers:
                        continue
                    ## RNAs carry no gene name
                    geneName = ''
                    annotation = '_'.join(qualifiers['product'][0].split(' '))
                    locus_tag = _strip_locus_tag_prefix(
                        qualifiers['locus_tag'][0], strainName)
                    ## RNAID is composed of strain_name and locus_tag
                    ## Keeping '|' separator is important, which is used later in orthAgogue.
                    RNAID = '%s|%s' % (strainName, locus_tag)
                    RNA_seq = str(feature.extract(contig.seq))
                    write_in_fa(RNA_sequence_file, RNAID, RNA_seq)
                    RNAID_to_description_dict[RNAID] = {
                        'geneName': '',
                        'contig': contig_index,
                        'annotation': annotation
                    }
                    RNAID_to_SeqID_dict[RNAID] = '%s|%s-%d-%s%s' % (
                        strainName, locus_tag, contig_index, geneName,
                        annotation)
                    RNA_nucleotide_sequences[RNAID] = RNA_seq
    finally:
        ## close both fasta outputs, also when parsing fails half-way
        aa_sequence_file.close()
        if RNA_sequence_file is not None:
            RNA_sequence_file.close()

    write_pickle(gene_nuc_seq_dict, gene_nucleotide_sequences)
    if cluster_RNA:
        write_pickle(RNA_nuc_seq_dict, RNA_nucleotide_sequences)