def make_genepresence_alignment(path, disable_gain_loss,
                                merged_gain_loss_output):
    '''
    Build the gene presence/absence pseudo alignment and write it to disk.

    Loops over all gene clusters and appends 0/1 to a strain specific
    string (done by create_genePresence); the resulting strings are written
    as 'genePresence.aln' and pickled as 'dt_genePresence.cpk' inside
    '<path>geneCluster/'.

    Parameters:
    path: string prefix of the data folder; sub-paths are appended without
        adding a separator, so it has to end with one
    disable_gain_loss: when true, placeholder gain/loss output is written
        here: 'dt_geneEvents.cpk' with an event count of 0 for every
        cluster index, plus the pattern output described below
    merged_gain_loss_output: only used when disable_gain_loss is true;
        when true the per-strain 0/1 strings are dumped unchanged into
        'geneGainLossEvent.json', otherwise one '<clusterID>_patterns.json'
        (strain -> single pattern character) is written per cluster
    '''
    output_path = '%s%s' % (path, 'geneCluster/')

    ## load strain list and prepare for gene presence/absence
    strain_list = load_pickle('%s%s' % (path, 'strain_list.cpk'))
    set_totalStrain = set(strain_list)
    totalStrain = len(set_totalStrain)
    dt_strainGene = defaultdict(str)

    sorted_genelist = load_sorted_clusters(path)
    ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    for clusterID, gene in sorted_genelist:
        ## append 0/1 to each strain
        create_genePresence(dt_strainGene, totalStrain, set_totalStrain,
                            gene[1])

    ## NOTE(review): the file is opened in binary mode; confirm that
    ## write_in_fa emits bytes before running this on Python 3
    with open('%s%s' % (output_path, 'genePresence.aln'),
              'wb') as presence_outfile:
        for istkey in dt_strainGene:
            write_in_fa(presence_outfile, istkey, dt_strainGene[istkey])
    write_pickle('%s%s' % (output_path, 'dt_genePresence.cpk'), dt_strainGene)

    if not disable_gain_loss:
        return

    ## gain/loss inference is skipped: record zero events for every cluster
    geneEvents_dt = dict.fromkeys(range(len(sorted_genelist)), 0)
    write_pickle('%s%s' % (output_path, 'dt_geneEvents.cpk'), geneEvents_dt)

    if merged_gain_loss_output:
        gene_loss_fname = '%s%s' % (output_path, 'geneGainLossEvent.json')
        write_json(dt_strainGene, gene_loss_fname, indent=1)
        return

    ## strainID as key, presence pattern character as value.
    ## sorted() replaces keys()+sort(), which fails on Python 3 dict views.
    keylist = sorted(dt_strainGene)
    ## '1' is recoded to '3', every other character is kept
    ## (presumably 0: absent, 3: present -- confirm against
    ## create_genePresence). Plain string handling is used instead of a
    ## numpy 'c' array, whose elements are bytes on Python 3 and would be
    ## exported as "b'3'".
    recoded_patterns = [dt_strainGene[k].replace('1', '3') for k in keylist]
    for ind, (clusterID, gene) in enumerate(sorted_genelist):
        pattern_dt = {
            strainID: pattern[ind]
            for strainID, pattern in zip(keylist, recoded_patterns)
        }
        pattern_fname = '%s%s_patterns.json' % (output_path, clusterID)
        write_json(pattern_dt, pattern_fname, indent=1)
def make_genepresence_alignment(path, disable_gain_loss, merged_gain_loss_output):
    '''
    loop over all gene clusters and append 0/1 to strain specific
    string used as pseudo alignment of gene presence absence

    Output (all inside '<path>geneCluster/'; path is used as a plain string
    prefix and therefore has to end with a separator):
    - 'genePresence.aln': one entry per strain, written by write_in_fa
    - 'dt_genePresence.cpk': pickle of the strain -> 0/1 string dict
    - only if disable_gain_loss is true:
        - 'dt_geneEvents.cpk': cluster index -> 0 (no events inferred)
        - if merged_gain_loss_output is true: 'geneGainLossEvent.json'
          holding the unchanged strain -> 0/1 string dict
        - otherwise: one '<clusterID>_patterns.json' per cluster mapping
          each strain to its pattern character, with '1' recoded as '3'
    '''
    output_path='%s%s'%(path,'geneCluster/')

    ## load strain list and prepare for gene presence/absence
    strain_list= load_pickle('%s%s'%(path,'strain_list.cpk'))
    set_totalStrain=set(strain_list)
    totalStrain=len(set_totalStrain)
    dt_strainGene= defaultdict(str)

    sorted_genelist = load_sorted_clusters(path)
    ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    for clusterID, gene in sorted_genelist:
        ## append 0/1 to each strain
        create_genePresence(dt_strainGene, totalStrain, set_totalStrain, gene[1])

    ## NOTE(review): binary mode is kept from the original; confirm that
    ## write_in_fa writes bytes if this runs on Python 3
    with open('%s%s'%(output_path,'genePresence.aln'),'wb') as presence_outfile:
        for istkey in dt_strainGene:
            write_in_fa( presence_outfile, istkey, dt_strainGene[istkey])
    write_pickle('%s%s'%(output_path,'dt_genePresence.cpk'), dt_strainGene)

    if disable_gain_loss:
        ## no gain/loss inference: every cluster index gets an event count of 0
        geneEvents_dt={ i:0 for i in range(len(sorted_genelist)) }
        write_pickle('%s%s'%(output_path,'dt_geneEvents.cpk'), geneEvents_dt)
        if merged_gain_loss_output:
            gene_loss_fname='%s%s'%(output_path,'geneGainLossEvent.json')
            write_json(dt_strainGene, gene_loss_fname, indent=1)
        else:
            ## strainID as key, presence pattern character as value
            ## sorted() works on Python 2 and 3; keys().sort() breaks on
            ## Python 3 because keys() returns a view there
            keylist= sorted(dt_strainGene)
            ## recode '1' as '3' and keep all other characters
            ## (presumably 0: absent, 3: present -- confirm against
            ## create_genePresence); done on the strings directly because a
            ## numpy 'c' array holds bytes on Python 3 and str() of such an
            ## element gives "b'3'"
            recoded= [ dt_strainGene[k].replace('1','3') for k in keylist ]
            for ind, (clusterID, gene) in enumerate(sorted_genelist):
                pattern_dt= { k:patt[ind] for k, patt in zip(keylist, recoded) }
                pattern_fname='%s%s_patterns.json'%(output_path,clusterID)
                write_json(pattern_dt, pattern_fname, indent=1)
    def export(self, path = '', extra_attr = ['aa_muts','annotation','branch_length','name','accession'], RNA_specific=False):
        '''
        Write the cluster tree and its alignments to files whose names
        start with path + self.clusterID.

        Files written: '.nwk' (newick tree), '_tree.json' (output of
        tree_to_json), '_na_aln.fa' and '_na_aln_reduced.fa' (nucleotide
        alignments) and, if RNA_specific equals False, '_aa_aln.fa' and
        '_aa_aln_reduced.fa' (amino acid alignments).

        Parameters:
        path: string prefix for all output files; it is concatenated
            as-is, so a directory has to end with a path separator
        extra_attr: node attributes forwarded to tree_to_json; the default
            list is only passed through and never modified in this method
        RNA_specific: forwarded to self.reduce_alignments; the amino acid
            alignments are skipped unless it equals False

        Side effects: terminal nodes get accession, name and annotation
        attributes; branch lengths that are missing or below 1e-6 are set
        to 1e-6; the first '|' of every alignment record id becomes '-'.
        '''
        ## write tree
        Phylo.write(self.tree, path+self.clusterID+'.nwk', 'newick')

        ## processing node name
        ## example ann: NZ_CP008870|HV97_RS21955-1-fabG_3-ketoacyl-ACP_reductase
        for node in self.tree.get_terminals():
            node.accession = node.ann.split('|')[0]
            node.name = node.ann.split('-')[0]
            annotation=node.ann.split('-',2)
            ## third field is the annotation text; fall back to the first
            ## field when ann has fewer than two '-' separators
            if len(annotation)==3:
                node.annotation= annotation[2]
            else:
                node.annotation= annotation[0]

        ## write tree json
        for n in self.tree.root.find_clades():
            ## a missing branch length (None) is clamped as well: Python 2
            ## evaluated None<1e-6 as True, Python 3 raises TypeError
            if n.branch_length is None or n.branch_length<1e-6:
                n.branch_length = 1e-6
        timetree_fname = path+self.clusterID+'_tree.json'
        tree_json = tree_to_json(self.tree.root, extra_attr=extra_attr)
        write_json(tree_json, timetree_fname, indent=None)

        ## presumably provides self.aln_reduced / self.aa_aln_reduced used
        ## below -- defined elsewhere, verify there
        self.reduce_alignments(RNA_specific)

        ## msa compatible
        for alignment in (self.aln, self.aln_reduced):
            for record in alignment:
                record.id=record.id.replace('|','-',1)

        AlignIO.write(self.aln, path+self.clusterID+'_na_aln.fa', 'fasta')
        AlignIO.write(self.aln_reduced, path+self.clusterID+'_na_aln_reduced.fa', 'fasta')

        ## equality test (not truthiness) kept on purpose so that non-bool
        ## arguments behave exactly as before
        if RNA_specific==False:
            for aa_alignment in (self.aa_aln, self.aa_aln_reduced):
                for record in aa_alignment:
                    record.id=record.id.replace('|','-',1)

            AlignIO.write(self.aa_aln, path+self.clusterID+'_aa_aln.fa', 'fasta')
            AlignIO.write(self.aa_aln_reduced, path+self.clusterID+'_aa_aln_reduced.fa', 'fasta')

        ## write seq json (switched off: write_seq_json is hard-coded to 0)
        write_seq_json=0
        if write_seq_json:
            elems = {}
            for node in self.tree.find_clades():
                if hasattr(node, "sequence"):
                    if not hasattr(node, "longName"):
                        node.longName=node.name
                    elems[node.longName] = {}
                    ## positions where the node differs from the root sequence
                    nuc_dt= {pos:state for pos, (state, ancstate) in
                                    enumerate(izip(node.sequence.tostring(), self.tree.root.sequence.tostring())) if state!=ancstate}
                    elems[node.longName]['nuc']=nuc_dt

            elems['root'] = {}
            elems['root']['nuc'] = self.tree.root.sequence.tostring()

            self.sequences_fname=path+self.clusterID+'_seq.json'
            write_json(elems, self.sequences_fname, indent=None)
Пример #4
0
def export_gain_loss(tree, path, merged_gain_loss_output):
    '''
    Export the strain tree and the per-branch gene gain/loss patterns.

    For every non-root node a pattern string is built with one character
    per gene cluster, computed as 2*ancestral + derived from the
    genepresence of the parent (node.up) and of the node itself. Codes 1
    and 2 are the gain/loss events, 0 and 3 mean no change.

    Parameters:
    tree: object whose .tree attribute is the strain tree; its nodes must
        provide .up, .name and .genepresence
    path: data folder; output goes to '<path>/geneCluster/'
    merged_gain_loss_output: when true all patterns are pickled together
        as 'dt_genePattern.cpk'; otherwise one '<clusterID>_patterns.json'
        is written per gene cluster

    Always written: 'strain_tree.nwk' and 'dt_geneEvents.cpk' (cluster
    index -> number of branches with a gain or loss event).
    '''
    # write final tree with internal node names as assigned by treetime
    sep = '/'
    output_path = sep.join([path.rstrip(sep), 'geneCluster/'])
    events_dict_path = sep.join([output_path, 'dt_geneEvents.cpk'])
    gene_pattern_dict_path = sep.join([output_path, 'dt_genePattern.cpk'])

    tree_fname = sep.join([output_path, 'strain_tree.nwk'])
    Phylo.write(tree.tree, tree_fname, 'newick')

    gene_gain_loss_dict = defaultdict(str)
    preorder_strain_list = []  #store the preorder nodes as strain list
    for node in tree.tree.find_clades(
            order='preorder'):  # order does not matter much here
        if node.up is None:
            # the root has no parent to compare with
            continue
        gain_loss = [
            str(int(ancestral) * 2 + int(derived)) for ancestral, derived in
            zip(node.up.genepresence, node.genepresence)
        ]
        gene_gain_loss_dict[node.name] = "".join(gain_loss)
        preorder_strain_list.append(node.name)

    gain_loss_array = np.array(
        [list(gain_loss_str)
         for gain_loss_str in gene_gain_loss_dict.values()],
        dtype=int)
    # 1 and 2 are codes for gain/loss events
    events_array = ((gain_loss_array == 1) |
                    (gain_loss_array == 2)).sum(axis=0)
    events_dict = dict(enumerate(events_array))

    write_pickle(events_dict_path, events_dict)

    if merged_gain_loss_output:
        ## all patterns go into one pickle (the JSON export of this dict
        ## for visualization is currently not produced)
        write_pickle(gene_pattern_dict_path, gene_gain_loss_dict)
        return

    ## one pattern file per gene cluster; the characters follow the order
    ## of preorder_strain_list
    sorted_genelist = load_sorted_clusters(path)
    ## The pattern strings are indexed directly instead of going through a
    ## numpy 'c' array: on Python 3 such an array holds bytes, so str() of
    ## an element would write "b'0'" into the output.
    node_patterns = [gene_gain_loss_dict[k] for k in preorder_strain_list]
    ## if true, write pattern dict instead of pattern string in a json file
    pattern_json_flag = False
    for ind, (clusterID, gene) in enumerate(sorted_genelist):
        pattern_fname = '%s%s_patterns.json' % (output_path, clusterID)
        cluster_column = [pattern[ind] for pattern in node_patterns]
        if pattern_json_flag:
            pattern_dt = dict(zip(preorder_strain_list, cluster_column))
            write_json(pattern_dt, pattern_fname, indent=1)
        else:
            ## the two formats are mutually exclusive; previously the
            ## string form was written unconditionally and would have
            ## overwritten the dict form
            with open(pattern_fname, 'w') as write_pattern:
                write_pattern.write(
                    '{"patterns":"' + ''.join(cluster_column) + '"}')
    def export(self, path = '', extra_attr = ['aa_muts','annotation','branch_length','name','accession'], RNA_specific=False):
        '''
        Write the cluster tree and its alignments to files whose names
        start with path + self.clusterID.

        Files written: '.nwk' (newick tree), '_tree.json' (output of
        tree_to_json), '_na_aln.fa' and '_na_aln_reduced.fa' (nucleotide
        alignments) and, if RNA_specific equals False, '_aa_aln.fa' and
        '_aa_aln_reduced.fa' (amino acid alignments).

        Parameters:
        path: string prefix for all output files; it is concatenated
            as-is, so a directory has to end with a path separator
        extra_attr: node attributes forwarded to tree_to_json; the default
            list is only passed through and never modified in this method
        RNA_specific: forwarded to self.reduce_alignments; the amino acid
            alignments are skipped unless it equals False

        Side effects: terminal nodes get accession, name and annotation
        attributes; branch lengths that are missing or below 1e-6 are set
        to 1e-6; the first '|' of every alignment record id becomes '-'.
        '''
        ## write tree
        Phylo.write(self.tree, path+self.clusterID+'.nwk', 'newick')

        ## processing node name
        ## example ann: NZ_CP008870|HV97_RS21955-1-fabG_3-ketoacyl-ACP_reductase
        for node in self.tree.get_terminals():
            node.accession = node.ann.split('|')[0]
            node.name = node.ann.split('-')[0]
            annotation=node.ann.split('-',2)
            ## third field is the annotation text; fall back to the first
            ## field when ann has fewer than two '-' separators
            if len(annotation)==3:
                node.annotation= annotation[2]
            else:
                node.annotation= annotation[0]

        ## write tree json
        for n in self.tree.root.find_clades():
            ## a missing branch length (None) is clamped as well: Python 2
            ## evaluated None<1e-6 as True, Python 3 raises TypeError
            if n.branch_length is None or n.branch_length<1e-6:
                n.branch_length = 1e-6
        timetree_fname = path+self.clusterID+'_tree.json'
        tree_json = tree_to_json(self.tree.root, extra_attr=extra_attr)
        write_json(tree_json, timetree_fname, indent=None)

        ## presumably provides self.aln_reduced / self.aa_aln_reduced used
        ## below -- defined elsewhere, verify there
        self.reduce_alignments(RNA_specific)

        ## msa compatible
        for alignment in (self.aln, self.aln_reduced):
            for record in alignment:
                record.id=record.id.replace('|','-',1)

        AlignIO.write(self.aln, path+self.clusterID+'_na_aln.fa', 'fasta')
        AlignIO.write(self.aln_reduced, path+self.clusterID+'_na_aln_reduced.fa', 'fasta')

        ## equality test (not truthiness) kept on purpose so that non-bool
        ## arguments behave exactly as before
        if RNA_specific==False:
            for aa_alignment in (self.aa_aln, self.aa_aln_reduced):
                for record in aa_alignment:
                    record.id=record.id.replace('|','-',1)

            AlignIO.write(self.aa_aln, path+self.clusterID+'_aa_aln.fa', 'fasta')
            AlignIO.write(self.aa_aln_reduced, path+self.clusterID+'_aa_aln_reduced.fa', 'fasta')

        ## write seq json (switched off: write_seq_json is hard-coded to 0)
        write_seq_json=0
        if write_seq_json:
            elems = {}
            for node in self.tree.find_clades():
                if hasattr(node, "sequence"):
                    if not hasattr(node, "longName"):
                        node.longName=node.name
                    elems[node.longName] = {}
                    ## positions where the node differs from the root sequence
                    nuc_dt= {pos:state for pos, (state, ancstate) in
                                    enumerate(izip(node.sequence.tostring(), self.tree.root.sequence.tostring())) if state!=ancstate}
                    elems[node.longName]['nuc']=nuc_dt

            elems['root'] = {}
            elems['root']['nuc'] = self.tree.root.sequence.tostring()

            self.sequences_fname=path+self.clusterID+'_seq.json'
            write_json(elems, self.sequences_fname, indent=None)