def test_toString(self):
    """CharAlphabet.toString should render arrays of indices as strings."""
    rna = CharAlphabet('UCAG')
    # (indices, expected string) pairs, checked in order
    cases = [
        # 2-D input with two rows: rows are separated by a newline
        ([[0, 0, 1], [0, 3, 2]], 'UUC\nUGA'),
        # 2-D input holding a single row
        ([[0, 0, 1, 0, 3, 2]], 'UUCUGA'),
        # 1-D input (a bare sequence)
        ([0, 0, 1, 0, 3, 2], 'UUCUGA'),
        # empty input
        ([], ''),
    ]
    for indices, expected in cases:
        self.assertEqual(rna.toString(array(indices, 'B')), expected)
def test_isValid(self):
    """CharAlphabet.isValid should be True only for valid sequences."""
    alphabet = CharAlphabet('bca')
    # (candidate sequence, expected verdict) pairs, checked in order
    expectations = (
        ('', True),
        ('bbb', True),
        ('bbbaac', True),
        ('bbd', False),
        ('d', False),
        (['a', 'b'], True),
        (['a', None], False),
    )
    for candidate, verdict in expectations:
        self.assertEqual(alphabet.isValid(candidate), verdict)
def test_triples(self):
    """Triples should hand back the same cached object on every access."""
    alphabet = CharAlphabet('UCAG')
    first_access = alphabet.Triples
    # 4 symbols taken three at a time
    self.assertEqual(len(first_access), 64)
    # a second access must return the identical object, not a rebuilt one
    self.assertSameObj(first_access, alphabet.Triples)
def test_pairs(self):
    """Pairs should hand back the same cached object on every access."""
    alphabet = CharAlphabet('UCAG')
    first_access = alphabet.Pairs
    # 4 symbols taken two at a time
    self.assertEqual(len(first_access), 16)
    # a second access must return the identical object, not a rebuilt one
    self.assertSameObj(first_access, alphabet.Pairs)
def test_init(self):
    """CharAlphabet construction should build both translation tables."""
    alphabet = CharAlphabet('UCAG')
    index_to_char = alphabet._indices_to_chars
    char_to_index = alphabet._chars_to_indices
    # raw bytes of the index array, so str.translate can map them to chars
    packed = array([0, 0, 1, 0, 3, 2], 'b').tostring()
    self.assertEqual(packed.translate(index_to_char), 'UUCUGA')
    # and the reverse direction: characters back to index bytes
    self.assertEqual('UUCUGA'.translate(char_to_index),
                     '\000\000\001\000\003\002')
def test_toChars(self):
    """CharAlphabet.toChars should map an index array to a char array."""
    rna = CharAlphabet('UCAG')
    indices = array([[0, 0, 1], [0, 3, 2]], 'B')
    observed = rna.toChars(indices)
    expected = array(['UUC', 'UGA'], 'c')
    self.assertEqual(observed, expected)
def test_fromArray(self):
    """CharAlphabet.fromArray should map a char array to index values."""
    rna = CharAlphabet('UCAG')
    chars = array(['UUC', 'UGA'], 'c')
    expected = array([[0, 0, 1], [0, 3, 2]], 'B')
    self.assertEqual(rna.fromArray(chars), expected)
def test_fromString(self):
    """CharAlphabet.fromString should map a string to index values."""
    rna = CharAlphabet('UCAG')
    expected = array([0, 0, 1, 0, 3, 2], 'B')
    observed = rna.fromString('UUCUGA')
    self.assertEqual(observed, expected)
def __init__(self, motifset, Gap=IUPAC_gap, Missing=IUPAC_missing,\
        Gaps=None,
        Sequence=None, Ambiguities=None,
        label=None, Complements=None, Pairs=None, MWCalculator=None, \
        add_lower=False, preserve_existing_moltypes=False, \
        make_alphabet_group=False, ModelSeq=None):
    """Returns a new MolType object. Note that the parameters are in flux.

    Currently:
        motifset: Alphabet or sequence of items in the default alphabet.
        Does not include degenerates.

        Gap: default gap symbol

        Missing: symbol for missing data

        Gaps: any other symbols that should be treated as gaps (doesn't have
        to include Gap or Missing; they will be silently added)

        Sequence: Class for constructing sequences. If None, ''.join is
        used. Otherwise, unless preserve_existing_moltypes is True, its
        MolType attribute is set to this object.

        Ambiguities: dict of char:tuple, doesn't include the Gap and Missing
        symbols (entries for these are added here).

        label: text label, stored as self.label. Not otherwise used in this
        method.

        Complements: dict of symbol:symbol showing how the non-degenerate
        single characters complement each other. Used for constructing
        on the fly the complement table, incl. support for mustPair and
        canPair.

        Pairs: dict in which keys are pairs of symbols that can pair
        with each other, values are True (must pair) or False (might
        pair). Currently, the meaning of GU pairs as 'weak' is conflated
        with the meaning of degenerate symbol pairs (which might pair
        with each other but don't necessarily, depending on how the
        symbol is resolved). This should be refactored.

        MWCalculator: f(seq) -> molecular weight.

        add_lower: if True (default: False) adds the lowercase versions of
        everything into the alphabet. Slated for deletion.

        preserve_existing_moltypes: if True (default: False), does not
        set the MolType of the supplied Sequence class to self.

        make_alphabet_group: if True, makes an AlphabetGroup relating
        the various alphabets to one another.

        ModelSeq: sequence type for modeling

    Note on "Degenerates" versus "Ambiguities": self.Degenerates contains
    _only_ mappings for degenerate symbols, whereas self.Ambiguities
    contains mappings for both degenerate and non-degenerate symbols.
    Sometimes you want one, sometimes the other, so both are provided.
    """
    self.Gap = Gap
    self.Missing = Missing
    # Gap and Missing are always gaps; any extra Gaps are merged in
    self.Gaps = frozenset([Gap, Missing])
    if Gaps:
        self.Gaps = self.Gaps.union(frozenset(Gaps))
    self.label = label
    #set the sequence constructor
    if Sequence is None:
        Sequence = ''.join     #safe default string constructor
    elif not preserve_existing_moltypes:
        # note: this mutates the class that was passed in
        Sequence.MolType = self
    self.Sequence = Sequence

    #set the ambiguities
    # Missing resolves to every motif plus the gap; Gap resolves to itself
    ambigs = {self.Missing:tuple(motifset)+(self.Gap,),self.Gap:(self.Gap,)}
    if Ambiguities:
        ambigs.update(Ambiguities)
    # every non-degenerate motif maps to a 1-tuple of itself; this runs
    # last, so it overrides any caller-supplied entry for the same key
    for c in motifset:
        ambigs[c] = (c,)
    self.Ambiguities = ambigs

    #set Complements -- must set before we make the alphabet group
    self.Complements = Complements or {}

    if make_alphabet_group: #note: must use _original_ ambiguities here
        self.Alphabets = AlphabetGroup(motifset, Ambiguities, \
            MolType=self)
        self.Alphabet = self.Alphabets.Base
    else:
        # an Enumeration is used as-is; otherwise pick the alphabet class
        # by motif length (all motifs of length 1 -> CharAlphabet)
        if isinstance(motifset, Enumeration):
            self.Alphabet = motifset
        elif max(len(motif) for motif in motifset) == 1:
            self.Alphabet = CharAlphabet(motifset, MolType=self)
        else:
            self.Alphabet = Alphabet(motifset, MolType=self)
    #set the other properties
    # copy so the caller's Ambiguities dict is not modified below
    self.Degenerates = Ambiguities and Ambiguities.copy() or {}
    self.Degenerates[self.Missing] = ''.join(motifset)+self.Gap
    self.Matches = make_matches(motifset, self.Gaps, self.Degenerates)
    # start from a copy of the caller's Pairs, then add the derived pairs
    self.Pairs = Pairs and Pairs.copy() or {}
    self.Pairs.update(make_pairs(Pairs, motifset, self.Gaps, \
        self.Degenerates))
    self.MWCalculator = MWCalculator
    #add lowercase characters, if we're doing that
    if add_lower:
        self._add_lowercase()
    #cache various other data that make the calculations faster
    self._make_all()
    self._make_comp_table()
    # a gap can be a true gap char or a degenerate character, typically '?'
    # we therefore want to ensure consistent treatment across the definition
    # of characters as either gap or degenerate
    self.GapString = ''.join(self.Gaps)
    # gap symbols that are not also keys of Degenerates
    strict_gap = "".join(set(self.GapString) - set(self.Degenerates))
    self.stripDegenerate = FunctionWrapper(
        keep_chars(strict_gap+''.join(self.Alphabet)))
    # NOTE(review): self.All is not assigned in this method; presumably
    # it is set by _make_all() above -- confirm
    self.stripBad = FunctionWrapper(keep_chars(''.join(self.All)))
    # NOTE(review): '-' binds tighter than '^', so this evaluates as
    # Alphabet ^ (Degenerates - Gaps); confirm that is the intent
    to_keep = set(self.Alphabet) ^ set(self.Degenerates) - set(self.Gaps)
    self.stripBadAndGaps = FunctionWrapper(keep_chars(''.join(to_keep)))

    #make inverse degenerates from degenerates
    #ensure that lowercase versions also exist if appropriate
    # maps frozenset of resolved symbols -> the symbol standing for that set
    inv_degens = {}
    for key, val in self.Degenerates.items():
        inv_degens[frozenset(val)] = key.upper()
        if add_lower:
            inv_degens[frozenset(''.join(val).lower())] = key.lower()
    for m in self.Alphabet:
        inv_degens[frozenset(m)] = m
        if add_lower:
            inv_degens[frozenset(''.join(m).lower())] = m.lower()
    for m in self.Gaps:
        inv_degens[frozenset(m)] = m
    self.InverseDegenerates = inv_degens

    #set array type for modeling alphabets
    # not every alphabet object has an ArrayType attribute
    try:
        self.ArrayType = self.Alphabet.ArrayType
    except AttributeError:
        self.ArrayType = None

    #set modeling sequence
    self.ModelSeq = ModelSeq
#!/usr/bin/env python """Tests of the Enumeration and Alphabet objects. Note: individual Alphabets are typically in MolType and are tested there. """ from cogent.core.alphabet import Enumeration, get_array_type, \ uint8, uint16, uint32, array, JointEnumeration, CharAlphabet, \ _make_translation_tables, _make_complement_array from cogent.core.moltype import RNA from cogent.util.unit_test import TestCase, main DnaBases = CharAlphabet('TCAG') RnaBases = CharAlphabet('UCAG') AminoAcids = CharAlphabet('ACDEFGHIKLMNPQRSTVWY') __author__ = "Rob Knight, Peter Maxwell and Gavin Huttley" __copyright__ = "Copyright 2007-2009, The Cogent Project" __credits__ = ["Peter Maxwell", "Rob Knight", "Gavin Huttley"] __license__ = "GPL" __version__ = "1.4.1" __maintainer__ = "Rob Knight" __email__ = "*****@*****.**" __status__ = "Production" class translation_table_tests(TestCase): """Tests of top-level translation table functions""" def test_make_translation_tables(self): """_make_translation_tables should translate from chars to indices""" a = 'ucag' itoa, atoi = _make_translation_tables(a)
from cogent.core.info import Info as InfoClass
from cogent.core.alphabet import CharAlphabet
from string import upper
from numpy import array, concatenate, sum, mean, isfinite, sqrt

__author__ = "Rob Knight"
__copyright__ = "Copyright 2007-2016, The Cogent Project"
__credits__ = ["Rob Knight", "Sandra Smit", "Daniel McDonald"]
__license__ = "GPL"
__version__ = "1.9"
__maintainer__ = "Rob Knight"
__email__ = "*****@*****.**"
__status__ = "Production"

# Plain alphabets, without a gap character.
RnaBases = CharAlphabet('UCAG')
DnaBases = CharAlphabet('TCAG')
AminoAcids = CharAlphabet('ACDEFGHIKLMNPQRSTVWY*') # * denotes termination
AB = CharAlphabet('ab') #used for testing
# all 256 single-byte characters, in chr() order
Chars = CharAlphabet(''.join(map(chr, range(256))), '-') #used for raw chars

# The same alphabets with '-' included as a symbol.
# NOTE(review): the second positional argument '-' is presumably the gap
# symbol -- confirm against the CharAlphabet signature.
RnaBasesGap = CharAlphabet('UCAG-', '-')
DnaBasesGap = CharAlphabet('TCAG-', '-')
AminoAcidsGap = CharAlphabet('ACDEFGHIKLMNPQRSTVWY*-', '-')

# Alphabets extended with IUPAC degenerate symbols.
DnaIupac = CharAlphabet('TCAGNVBHDKSWMYR')
RnaIupac = CharAlphabet('UCAGNVBHDKSWMYR')
AminoAcidsIupac = CharAlphabet('ACDEFGHIKLMNPQRSTVWY*XBZ')

# IUPAC alphabets with '-' included; note that '-' sits directly after the
# non-degenerate symbols, not at the end of the string.
DnaIupacGap = CharAlphabet('TCAG-NVBHDKSWMYR', '-')
RnaIupacGap = CharAlphabet('UCAG-NVBHDKSWMYR', '-')
AminoAcidsIupacGap = CharAlphabet('ACDEFGHIKLMNPQRSTVWY*-XBZ', '-')

# NOTE(review): ** 2 presumably builds the alphabet of all ordered pairs of
# RNA bases -- confirm against the alphabet class's __pow__.
RnaPairs = RnaBases**2