def alignSequencesByChain(PDBs, **kwargs):
    """
    Runs buildMSA for each chain and optionally joins the results.
    Returns either a single MSA or a dictionary containing an MSA for each
    chain.

    The *j*-th chain of every structure is aligned with the *j*-th chain of
    all the others, so every structure must have the same number of chains.

    :arg PDBs: a list or array of :class:`Atomic` objects,
        e.g. :class:`AtomGroup` instances
    :type PDBs: list or :class:`~numpy.ndarray`

    :arg join_chains: whether to join chain alignments
        default is True
    :type join_chains: bool

    :arg join_char: a character for joining chain alignments
        default is '/' as used by PIR format alignments
    :type join_char: str

    :raises TypeError: if *PDBs* is not a list or array, or if one of its
        entries is not an :class:`Atomic` instance
    :raises ValueError: if *PDBs* is empty or the structures do not all have
        the same number of chains
    """

    if not isinstance(PDBs, (list, ndarray)):
        raise TypeError('PDBs should be a list or array')

    # len() works for lists and arrays alike, whereas comparing an array with
    # ``[]`` is element-wise and does not detect an empty array.
    if len(PDBs) == 0:
        raise ValueError('PDBs should not be an empty list')

    pdbs = []
    chains = []
    for i, pdb in enumerate(PDBs):
        if not isinstance(pdb, Atomic):
            raise TypeError(
                'each entry in PDBs must be a :class:`Atomic` instance')
        pdbs.append(pdb)

        chains.append(list(pdb.getHierView()))
        if i != 0 and len(chains[i]) != len(chains[0]):
            raise ValueError('all pdbs should have the same number of chains')

    # One label per structure: the title up to its first underscore, then an
    # underscore and the concatenated chain identifiers.
    labels = []
    for pdb in pdbs:
        chids = ''.join(chain.getChid() for chain in pdb.getHierView())
        labels.append(pdb.getTitle().split('_')[0] + '_' + chids)

    chains = array(chains)
    chain_alignments = []
    alignments = {}
    labels_lists = []
    for j in range(len(chains[0])):
        prefix = 'chain_' + chains[0, j].getChid()
        msa = buildMSA(chains[:, j], title=prefix, labels=labels)

        # make all alignments have the sequences in the same order as the 0th
        labels_lists.append([])
        for sequence in msa:
            labels_lists[j].append(sequence.getLabel())

        if j > 0:
            # Rebuild this alignment with its rows in the label order of the
            # first chain's alignment.
            msaarr = []
            for label in labels_lists[0]:
                msaarr.append(msa.getArray()[msa.getIndex(label)])

            msaarr = array(msaarr)
            msa = MSA(msaarr, title='reordered_msa_1',
                      labels=list(labels_lists[0]))
            # NOTE(review): the flattened source does not show whether this
            # call sits inside the ``if j > 0`` branch -- confirm.
            writeMSA(prefix + '.aln', msa)

        chain_alignments.append(msa)

        # after reordering, create the alignments dictionary
        alignments[labels_lists[0][0].split('_')[1][j]] = msa

    join_chains = kwargs.get('join_chains', True)
    join_char = kwargs.get('join_char', '/')
    if join_chains:
        # Scaffold that is indexed as [sequence][chain] below.
        # NOTE(review): relies on numpy.shape() reporting the list of
        # alignments as (chains, sequences) -- confirm.
        aligned_sequences = list(zeros(shape(chain_alignments)).T)
        for j in range(shape(chain_alignments)[1]):
            aligned_sequences[j] = list(aligned_sequences[j])

        orig_labels = []
        for i, chain_alignment in enumerate(chain_alignments):
            for j, sequence in enumerate(chain_alignment):
                aligned_sequences[j][i] = str(sequence)
                if i == 0:
                    orig_labels.append(sequence.getLabel())

        # Each joined row becomes an array of single characters.
        joined_msaarr = []
        for j in range(shape(chain_alignments)[1]):
            joined_msaarr.append(
                array(list(join_char.join(aligned_sequences[j]))))
        joined_msaarr = array(joined_msaarr)

        result = MSA(joined_msaarr, title='joined_chains',
                     labels=orig_labels)
        result = refineMSA(result, colocc=1e-9)  # remove gap-only cols

    else:
        result = alignments

    return result
def alignSequencesByChain(PDBs, **kwargs):
    """
    Runs :func:`buildMSA` for each chain and optionally joins the results.
    Returns either a single :class:`MSA` or a dictionary containing an
    :class:`MSA` for each chain.

    The *j*-th chain of every structure is aligned with the *j*-th chain of
    all the others, so every structure must have the same number of chains.
    A single :class:`MSA` is returned when the chains are joined or when the
    structures have only one chain; otherwise the dictionary is keyed by the
    chain identifiers of the first structure.

    :arg PDBs: a list of :class:`Atomic` objects, e.g. :class:`AtomGroup`
    :type PDBs: list

    :arg join_chains: whether to join chain alignments
        default is True, ignored when there is only one chain
    :type join_chains: bool

    :arg join_char: a character for joining chain alignments
        default is '/' as used by PIR format alignments
    :type join_char: str

    :raises TypeError: if *PDBs* is a scalar or one of its entries is not an
        :class:`Atomic` instance
    :raises ValueError: if *PDBs* is empty or the structures do not all have
        the same number of chains
    """

    if isscalar(PDBs):
        raise TypeError('PDBs should be array-like')

    # Materialise once so that emptiness can be tested without evaluating the
    # truth value of a numpy array, which is ambiguous.
    entries = list(PDBs)
    if not entries:
        raise ValueError('PDBs should not be empty')

    pdbs = []
    chains = []
    for i, pdb in enumerate(entries):
        if not isinstance(pdb, Atomic):
            raise TypeError(
                'each entry in PDBs must be a :class:`Atomic` instance')
        pdbs.append(pdb)

        chains.append(list(pdb.getHierView()))
        if i != 0 and len(chains[i]) != len(chains[0]):
            raise ValueError('all pdbs should have the same number of chains')

    # One label per structure: its title, an underscore and the concatenated
    # chain identifiers.
    labels = []
    for pdb, pdb_chains in zip(pdbs, chains):
        chids = ''.join(chain.getChid() for chain in pdb_chains)
        labels.append(pdb.getTitle() + '_' + chids)

    chains = array(chains)
    chain_alignments = []
    alignments = {}
    for j in range(len(chains[0])):
        # Take the identifier from the chain itself: slicing it back out of
        # the label breaks when the title contains an underscore or an
        # identifier is not exactly one character long.
        chid = chains[0, j].getChid()
        msa = buildMSA(chains[:, j], title='chain_' + chid, labels=labels)
        msa = refineMSA(msa, colocc=1e-9)  # remove gap-only cols

        chain_alignments.append(msa)
        alignments[chid] = msa

    join_chains = kwargs.get('join_chains', True)
    join_char = kwargs.get('join_char', '/')
    if len(chains[0]) == 1:
        # nothing to join
        join_chains = False

    if join_chains:
        # One joined row per structure: its aligned chain sequences in chain
        # order, separated by join_char.
        # NOTE(review): assumes every chain alignment lists its sequences in
        # the same order as ``labels`` -- confirm against buildMSA.
        per_chain = [[str(sequence) for sequence in chain_alignment]
                     for chain_alignment in chain_alignments]
        joined_msaarr = [join_char.join(pdb_seqs)
                         for pdb_seqs in zip(*per_chain)]

        result = MSA(joined_msaarr, title='joined_chains',
                     labels=[label.split('_')[0] for label in labels])

    else:
        result = alignments
        if len(result) == 1:
            # a single alignment is returned as is, not wrapped in a dict
            result = result[list(result.keys())[0]]

    return result
def alignSequencesByChain(PDBs, **kwargs):
    """
    Runs :func:`buildMSA` for each chain and optionally joins the results.
    Returns either a single :class:`MSA` or a dictionary containing an
    :class:`MSA` for each chain.

    The *j*-th chain of every structure is aligned with the *j*-th chain of
    all the others, so every structure must have the same number of chains.
    A single :class:`MSA` is returned when the chains are joined or when the
    structures have only one chain; otherwise the dictionary is keyed by the
    chain identifiers of the first structure.

    :arg PDBs: a list of :class:`Atomic` objects, e.g. :class:`AtomGroup`
    :type PDBs: list

    :arg join_chains: whether to join chain alignments
        default is True, ignored when there is only one chain
    :type join_chains: bool

    :arg join_char: a character for joining chain alignments
        default is '/' as used by PIR format alignments
    :type join_char: str

    :raises TypeError: if *PDBs* is a scalar or one of its entries is not an
        :class:`Atomic` instance
    :raises ValueError: if *PDBs* is empty or the structures do not all have
        the same number of chains
    """

    if isscalar(PDBs):
        raise TypeError('PDBs should be array-like')

    # Materialise once so that emptiness can be tested without evaluating the
    # truth value of a numpy array, which is ambiguous.
    entries = list(PDBs)
    if not entries:
        raise ValueError('PDBs should not be empty')

    pdbs = []
    chains = []
    for i, pdb in enumerate(entries):
        if not isinstance(pdb, Atomic):
            raise TypeError('each entry in PDBs must be a :class:`Atomic` instance')
        pdbs.append(pdb)

        chains.append(list(pdb.getHierView()))
        if i != 0 and len(chains[i]) != len(chains[0]):
            raise ValueError('all pdbs should have the same number of chains')

    # One label per structure: its title, an underscore and the concatenated
    # chain identifiers.
    labels = []
    for pdb, pdb_chains in zip(pdbs, chains):
        chids = ''.join(chain.getChid() for chain in pdb_chains)
        labels.append(pdb.getTitle() + '_' + chids)

    chains = array(chains)
    chain_alignments = []
    alignments = {}
    for j in range(len(chains[0])):
        # Take the identifier from the chain itself: slicing it back out of
        # the label breaks when the title contains an underscore or an
        # identifier is not exactly one character long.
        chid = chains[0, j].getChid()
        msa = buildMSA(chains[:, j], title='chain_' + chid, labels=labels)
        msa = refineMSA(msa, colocc=1e-9)  # remove gap-only cols

        chain_alignments.append(msa)
        alignments[chid] = msa

    join_chains = kwargs.get('join_chains', True)
    join_char = kwargs.get('join_char', '/')
    if len(chains[0]) == 1:
        # nothing to join
        join_chains = False

    if join_chains:
        # One joined row per structure: its aligned chain sequences in chain
        # order, separated by join_char.
        # NOTE(review): assumes every chain alignment lists its sequences in
        # the same order as ``labels`` -- confirm against buildMSA.
        per_chain = [[str(sequence) for sequence in chain_alignment]
                     for chain_alignment in chain_alignments]
        joined_msaarr = [join_char.join(pdb_seqs)
                         for pdb_seqs in zip(*per_chain)]

        result = MSA(joined_msaarr, title='joined_chains',
                     labels=[label.split('_')[0] for label in labels])

    else:
        result = alignments
        if len(result) == 1:
            # a single alignment is returned as is, not wrapped in a dict
            result = result[list(result.keys())[0]]

    return result