def buildMSA(sequences, title='Unknown', labels=None, **kwargs):
    """
    Aligns sequences with clustalw or clustalw2 and returns the resulting MSA.

    :arg sequences: a file, MSA object or a list or array containing sequences
        as Atomic objects with :func:`getSequence` or Sequence objects or
        strings. Plain strings are labelled with their 1-based position
        unless ``labels`` is given
    :type sequences: :class:`Atomic`, :class:`.MSA`,
        :class:`~numpy.ndarray`, str

    :arg title: the title for the MSA and it will be used as the prefix
        for output files.
    :type title: str

    :arg labels: a list of labels to go with the sequences
    :type labels: list

    :arg align: whether to align the sequences
        default True
    :type align: bool

    :arg method: alignment method, one of either biopython.align.globalms
        or clustalw(2). Any other value is looked up as an executable and
        run with the same command line as clustalw.
        default 'clustalw'
    :type method: str

    Raises :exc:`TypeError` when *sequences* is not iterable or contains an
    unsupported entry, :exc:`ValueError` when the biopython method is given
    anything but two sequences, and :exc:`EnvironmentError` when the
    alignment executable cannot be found.
    """

    align = kwargs.get('align', True)
    method = kwargs.pop('method', 'clustalw')
    use_biopython = 'biopython' in method

    msa = None
    filename = None

    # 1. check if sequences are in a fasta file and if not make one
    if isinstance(sequences, str):
        filename = sequences
    else:
        if isinstance(sequences, MSA):
            msa = sequences
        else:
            # only a failure to iterate is reported as "not iterable";
            # errors about individual entries keep their own message
            try:
                entries = list(sequences)
            except TypeError:
                raise TypeError('sequences should be iterable')

            strseqs = []
            fetched_labels = []
            for i, sequence in enumerate(entries):
                if isinstance(sequence, Atomic):
                    strseq = sequence.ca.getSequence()
                    label = sequence.getTitle()
                elif isinstance(sequence, Sequence):
                    strseq = str(sequence)
                    label = sequence.getLabel()
                elif isinstance(sequence, MSA):
                    strseq = str(sequence[0])
                    label = sequence.getLabel(0)
                    LOGGER.warn('Only the first sequence in the MSA '
                                'at entry {0} is used.'.format(i))
                elif isinstance(sequence, str):
                    strseq = sequence
                    label = str(i + 1)
                else:
                    raise TypeError('sequences should be a list of strings, '
                                    'Atomic, or Sequence instances')
                strseqs.append(strseq)
                fetched_labels.append(label)

            # right-pad with gaps to the longest extracted string so that
            # every row of the character array has the same length
            max_len = max([len(strseq) for strseq in strseqs] or [0])
            sequences = array([array(list(strseq + '-' * (max_len - len(strseq))))
                               for strseq in strseqs])

            # "if a list" is a pythonic way to check if a list is empty
            # or not (or none)
            if not labels and fetched_labels:
                labels = fetched_labels

            # whitespace in labels is replaced so that each label stays a
            # single token in the files handed to the alignment program
            if labels:
                labels = [label.replace(' ', '_') for label in labels]

            # labels checkers are removed because they will be properly
            # handled in MSA class initialization
            msa = MSA(msa=sequences, title=title, labels=labels)

        # every external program reads its input from a file
        if align and not use_biopython:
            filename = writeMSA(title + '.fasta', msa)

    if align:
        # 2. find and run alignment method
        if use_biopython:
            if len(sequences) == 2:
                msa, _, _ = alignTwoSequencesWithBiopython(
                    sequences[0], sequences[1], **kwargs)
            else:
                raise ValueError('Provide only two sequences or another '
                                 'method. Biopython pairwise alignment can '
                                 'only be used to build an MSA with two '
                                 'sequences.')
        else:
            if 'clustalw' in method:
                aligner = which('clustalw')
                if aligner is None:
                    aligner = which('clustalw2')
                if aligner is None:
                    raise EnvironmentError('The executable for clustalw was '
                                           'not found, install clustalw or '
                                           'add it to the path.')
            else:
                aligner = which(method)
                if aligner is None:
                    raise EnvironmentError('The executable for {0} was not '
                                           'found, install it or add it to '
                                           'the path.'.format(method))

            os.system('"%s" %s -OUTORDER=INPUT' % (aligner, filename))

            # 3. parse and return the new MSA
            msa = parseMSA(title + '.aln')
    elif msa is None:
        # a file name was given and no alignment was requested
        # NOTE(review): assumes parseMSA can read the supplied file format
        msa = parseMSA(filename)

    return msa
def alignSequencesByChain(PDBs, **kwargs):
    """
    Runs :func:`buildMSA` for each chain and optionally joins the results.
    Returns either a single MSA or a dictionary containing an MSA for each
    chain, keyed by the chain identifier taken from the first structure.

    :arg PDBs: a list or array of :class:`Atomic` instances that all have
        the same number of chains
    :type PDBs: list or :class:`~numpy.ndarray`

    :arg join_chains: whether to join chain alignments
        default is True
    :type join_chains: bool

    :arg join_char: a character for joining chain alignments
        default is '/' as used by PIR format alignments
    :type join_char: str

    Raises :exc:`TypeError` when *PDBs* is not a list or array or contains a
    non-:class:`Atomic` entry, and :exc:`ValueError` when *PDBs* is empty or
    the structures differ in their number of chains.
    """

    if not isinstance(PDBs, (list, ndarray)):
        raise TypeError('PDBs should be a list or array')

    # len() works for lists and arrays alike, unlike comparing with []
    if len(PDBs) == 0:
        raise ValueError('PDBs should not be an empty list')

    pdbs = []
    chains = []
    for i, pdb in enumerate(PDBs):
        if not isinstance(pdb, Atomic):
            raise TypeError(
                'each entry in PDBs must be a :class:`Atomic` instance')
        pdbs.append(pdb)
        chains.append(list(pdb.getHierView()))

        if len(chains[i]) != len(chains[0]):
            raise ValueError('all pdbs should have the same number of chains')

    # one label per structure: title up to the first underscore, then the
    # concatenated chain identifiers
    labels = []
    for pdb, pdb_chains in zip(pdbs, chains):
        chids = ''.join([chain.getChid() for chain in pdb_chains])
        labels.append(pdb.getTitle().split('_')[0] + '_' + chids)

    chain_alignments = []
    alignments = {}
    reference_labels = None
    for j, first_chain in enumerate(chains[0]):
        chid = first_chain.getChid()
        prefix = 'chain_' + chid

        # the j-th chain of every structure, kept as a plain list so that
        # nothing depends on how numpy would coerce the chain objects
        msa = buildMSA([pdb_chains[j] for pdb_chains in chains],
                       title=prefix, labels=labels)

        # make all alignments have the sequences in the same order as the 0th
        msa_labels = [sequence.getLabel() for sequence in msa]
        if j == 0:
            reference_labels = msa_labels
        else:
            msa_array = msa.getArray()
            msaarr = array([msa_array[msa.getIndex(label)]
                            for label in reference_labels])
            msa = MSA(msaarr, title='reordered_msa_1',
                      labels=list(reference_labels))
            writeMSA(prefix + '.aln', msa)

        chain_alignments.append(msa)

        # after reordering, create the alignments dictionary
        alignments[chid] = msa

    join_chains = kwargs.get('join_chains', True)
    join_char = kwargs.get('join_char', '/')
    if join_chains:
        # aligned_sequences[j] collects, chain by chain, the aligned
        # strings that belong to the j-th structure
        aligned_sequences = []
        orig_labels = []
        for i, chain_alignment in enumerate(chain_alignments):
            for j, sequence in enumerate(chain_alignment):
                if i == 0:
                    aligned_sequences.append([])
                    orig_labels.append(sequence.getLabel())
                aligned_sequences[j].append(str(sequence))

        joined_msaarr = array([array(list(join_char.join(parts)))
                               for parts in aligned_sequences])

        result = MSA(joined_msaarr, title='joined_chains', labels=orig_labels)
        result = refineMSA(result, colocc=1e-9)  # remove gap-only cols
    else:
        result = alignments

    return result
def buildMSA(sequences, title='Unknown', labels=None, **kwargs):
    """
    Aligns sequences with clustalw or clustalw2 and returns the resulting MSA.

    :arg sequences: a file, MSA object or a list or array containing sequences
        as Atomic objects with :func:`getSequence` or Sequence objects or
        strings. Plain strings are labelled with their 1-based position
        unless ``labels`` is given
    :type sequences: :class:`Atomic`, :class:`.MSA`,
        :class:`~numpy.ndarray`, str

    :arg title: the title for the MSA and it will be used as the prefix
        for output files.
    :type title: str

    :arg labels: a list of labels to go with the sequences
    :type labels: list

    :arg align: whether to do alignment with clustalw(2)
        default True
    :type align: bool

    Raises :exc:`TypeError` when *sequences* is not iterable or contains an
    unsupported entry, and :exc:`EnvironmentError` when neither clustalw nor
    clustalw2 can be found.
    """

    align = kwargs.get('align', True)

    msa = None
    filename = None

    # 1. check if sequences are in a fasta file and if not make one
    if isinstance(sequences, str):
        filename = sequences
    else:
        if isinstance(sequences, MSA):
            msa = sequences
        else:
            # only a failure to iterate is reported as "not iterable";
            # errors about individual entries keep their own message
            try:
                entries = list(sequences)
            except TypeError:
                raise TypeError('sequences should be iterable')

            strseqs = []
            fetched_labels = []
            for i, sequence in enumerate(entries):
                if isinstance(sequence, Atomic):
                    strseq = sequence.getSequence()
                    label = sequence.getTitle()
                elif isinstance(sequence, Sequence):
                    strseq = str(sequence)
                    label = sequence.getLabel()
                elif isinstance(sequence, str):
                    strseq = sequence
                    label = str(i + 1)
                else:
                    raise TypeError('sequences should be a list of strings, '
                                    'Atomic, or Sequence instances')
                strseqs.append(strseq)
                fetched_labels.append(label)

            # right-pad with gaps to the longest extracted string so that
            # every row of the character array has the same length
            max_len = max([len(strseq) for strseq in strseqs] or [0])
            sequences = array([array(list(strseq + '-' * (max_len - len(strseq))))
                               for strseq in strseqs])

            # "if a list" is a pythonic way to check if a list is empty
            # or not (or none)
            if not labels and fetched_labels:
                labels = fetched_labels

            # labels checkers are removed because they will be properly
            # handled in MSA class initialization
            msa = MSA(msa=sequences, title=title, labels=labels)

        # clustalw reads its input from a file
        if align:
            filename = writeMSA(title + '.fasta', msa)

    if align:
        # 2. find and run alignment method
        clustalw = which('clustalw')
        if clustalw is None:
            clustalw = which('clustalw2')
        if clustalw is None:
            raise EnvironmentError('The executable for clustalw was not '
                                   'found, install clustalw or add it to '
                                   'the path.')

        os.system('"%s" %s' % (clustalw, filename))

        # 3. parse and return the new MSA
        msa = parseMSA(title + '.aln')
    elif msa is None:
        # a file name was given and no alignment was requested
        # NOTE(review): assumes parseMSA can read the supplied file format
        msa = parseMSA(filename)

    return msa
def buildMSA(sequences, title='Unknown', labels=None, **kwargs):
    """
    Aligns sequences with clustalw or clustalw2 and returns the resulting MSA.

    :arg sequences: a file, MSA object or a list or array containing sequences
        as Atomic objects with :func:`getSequence` or Sequence objects or
        strings. Plain strings are labelled with their 1-based position
        unless ``labels`` is given
    :type sequences: :class:`Atomic`, :class:`.MSA`,
        :class:`~numpy.ndarray`, str

    :arg title: the title for the MSA and it will be used as the prefix
        for output files.
    :type title: str

    :arg labels: a list of labels to go with the sequences
    :type labels: list

    :arg align: whether to align the sequences
        default True
    :type align: bool

    :arg method: alignment method, one of either biopython.align.globalms
        or clustalw(2). Any other value is looked up as an executable and
        run with the same command line as clustalw.
        default 'clustalw'
    :type method: str

    Raises :exc:`TypeError` when *sequences* is not iterable or contains an
    unsupported entry, :exc:`ValueError` when the biopython method is given
    anything but two sequences, and :exc:`EnvironmentError` when the
    alignment executable cannot be found.
    """

    align = kwargs.get('align', True)
    method = kwargs.pop('method', 'clustalw')
    use_biopython = 'biopython' in method

    msa = None
    filename = None

    # 1. check if sequences are in a fasta file and if not make one
    if isinstance(sequences, str):
        filename = sequences
    else:
        if isinstance(sequences, MSA):
            msa = sequences
        else:
            # only a failure to iterate is reported as "not iterable";
            # errors about individual entries keep their own message
            try:
                entries = list(sequences)
            except TypeError:
                raise TypeError('sequences should be iterable')

            strseqs = []
            fetched_labels = []
            for i, sequence in enumerate(entries):
                if isinstance(sequence, Atomic):
                    strseq = sequence.ca.getSequence()
                    label = sequence.getTitle()
                elif isinstance(sequence, Sequence):
                    strseq = str(sequence)
                    label = sequence.getLabel()
                elif isinstance(sequence, MSA):
                    strseq = str(sequence[0])
                    label = sequence.getLabel(0)
                    LOGGER.warn('Only the first sequence in the MSA '
                                'at entry {0} is used.'.format(i))
                elif isinstance(sequence, str):
                    strseq = sequence
                    label = str(i + 1)
                else:
                    raise TypeError('sequences should be a list of strings, '
                                    'Atomic, or Sequence instances')
                strseqs.append(strseq)
                fetched_labels.append(label)

            # right-pad with gaps to the longest extracted string so that
            # every row of the character array has the same length
            max_len = max([len(strseq) for strseq in strseqs] or [0])
            sequences = array([array(list(strseq + '-' * (max_len - len(strseq))))
                               for strseq in strseqs])

            # "if a list" is a pythonic way to check if a list is empty
            # or not (or none)
            if not labels and fetched_labels:
                labels = fetched_labels

            # whitespace in labels is replaced so that each label stays a
            # single token in the files handed to the alignment program
            if labels:
                labels = [label.replace(' ', '_') for label in labels]

            # labels checkers are removed because they will be properly
            # handled in MSA class initialization
            msa = MSA(msa=sequences, title=title, labels=labels)

        # every external program reads its input from a file
        if align and not use_biopython:
            filename = writeMSA(title + '.fasta', msa)

    if align:
        # 2. find and run alignment method
        if use_biopython:
            if len(sequences) == 2:
                msa, _, _ = alignTwoSequencesWithBiopython(
                    sequences[0], sequences[1], **kwargs)
            else:
                raise ValueError('Provide only two sequences or another '
                                 'method. Biopython pairwise alignment can '
                                 'only be used to build an MSA with two '
                                 'sequences.')
        else:
            if 'clustalw' in method:
                aligner = which('clustalw')
                if aligner is None:
                    aligner = which('clustalw2')
                if aligner is None:
                    raise EnvironmentError('The executable for clustalw was '
                                           'not found, install clustalw or '
                                           'add it to the path.')
            else:
                aligner = which(method)
                if aligner is None:
                    raise EnvironmentError('The executable for {0} was not '
                                           'found, install it or add it to '
                                           'the path.'.format(method))

            os.system('"%s" %s -OUTORDER=INPUT' % (aligner, filename))

            # 3. parse and return the new MSA
            msa = parseMSA(title + '.aln')
    elif msa is None:
        # a file name was given and no alignment was requested
        # NOTE(review): assumes parseMSA can read the supplied file format
        msa = parseMSA(filename)

    return msa