def test_alphabet_chr(self):
    """Check that Alphabet.chr() maps an index back to its character."""
    generic = generic_alphabet
    # For the generic alphabet the test expects index i to yield the
    # character with code point i + 32.
    for index, _ in enumerate(generic):
        self.assertEqual(ord(generic.chr(index)), index + 32)

    small = Alphabet("alph")
    self.assertEqual("h", small.chr(3))
def test_normalize(self):
    """Check normalize() on a mixed-case string and on an invalid one."""
    alphabet = Alphabet("ABCDE")

    # 'aBbc' is expected to come back as 'ABBC'.
    normalized = alphabet.normalize('aBbc')
    self.assertEqual(str(normalized), 'ABBC')

    # This string is expected to be rejected with ValueError.
    self.assertRaises(ValueError, alphabet.normalize, 'aslkfdnnr33')
def test_alphabet_ord(self):
    """Check that Alphabet.ord() returns the position of a character."""
    generic = generic_alphabet
    # Iterating the generic alphabet yields its characters in index order.
    for position, symbol in enumerate(generic):
        self.assertEqual(generic.ord(symbol), position)

    small = Alphabet("alph")
    self.assertEqual(2, small.ord("p"))
def test_normalize(self):
    """Check normalize() on a mixed-case string and on an invalid one."""
    alphabet = Alphabet("ABCDE")

    # "aBbc" is expected to come back as "ABBC".
    normalized = alphabet.normalize("aBbc")
    self.assertEqual(str(normalized), "ABBC")

    # This string is expected to be rejected with ValueError.
    self.assertRaises(ValueError, alphabet.normalize, "aslkfdnnr33")
def test_alphabet_ords(self):
    """Check that Alphabet.ords() converts a whole string to indices."""
    small = Alphabet("alph")
    # Character 4 of "alphalph" is the second 'a', i.e. index 0.
    self.assertEqual(0, small.ords("alphalph")[4])

    generic = generic_alphabet
    indices = generic.ords(generic)
    # Converting the alphabet with itself must give 0, 1, 2, ...
    for position, value in enumerate(indices):
        self.assertEqual(value, position)
def test_create_from_alphabet(self):
    """If we pass an alphabet to the constructor, it's passed right back."""
    original = Alphabet("kjdahf")
    rebuilt = Alphabet(original)

    self.assertTrue(original == rebuilt)
    # Comparison with a non-Alphabet object must be false.
    self.assertFalse(original == "not an alphabet")
def test_isaligned(self):
    """Check SeqList.isaligned() with matching and mismatching alphabets.

    Four equal-length sequences over the same alphabet are expected to be
    reported as aligned when the list uses that alphabet, and as not
    aligned when the list is given a different alphabet.
    """
    a = Alphabet("ABCD")
    s0 = Seq("ABCDD", a)
    s1 = Seq("AAAAD", a)
    s2 = Seq("AAABD", a)
    s3 = Seq("AAACD", a)

    # unittest assertions are used instead of bare ``assert`` so the checks
    # still run under ``python -O`` and match the other tests in this file.
    seqs = SeqList([s0, s1, s2, s3], a)
    self.assertTrue(seqs.isaligned())

    seqs = SeqList([s0, s1, s2, s3], Alphabet("ABCDE"))
    self.assertFalse(seqs.isaligned())
def test_get_subMatrix(self):
    """Check SubMatrix.reindex() for subsets and reorderings of the alphabet.

    Array comparisons use ``np.testing.assert_array_equal`` rather than a
    bare ``assert np.all(a == b)``: it still runs under ``python -O``,
    reports which elements differ, and rejects a shape mismatch.
    """
    ab = Alphabet('ABCD')
    ar = np.asarray([[1, 2, 3, 4],
                     [5, 6, 7, 8],
                     [9, 10, 11, 12],
                     [13, 14, 15, 16]])
    mat = SubMatrix(ab, ar)

    # Dropping the last letter keeps the top-left 3x3 corner.
    mat2 = mat.reindex('ABC')
    np.testing.assert_array_equal(
        mat2.array, np.asarray([[1, 2, 3], [5, 6, 7], [9, 10, 11]]))

    # Reordering to 'BA' permutes both rows and columns.
    mat2 = mat.reindex('BA')
    np.testing.assert_array_equal(mat2.array, np.asarray([[6, 5], [2, 1]]))

    # An Alphabet object is accepted as well as a plain string.
    mat2 = mat.reindex(Alphabet('BA'))
    np.testing.assert_array_equal(mat2.array, np.asarray([[6, 5], [2, 1]]))
def test_profile(self):
    """Check SeqList.profile() column tallies and its error cases."""
    alphabet = Alphabet("ABCD")
    aligned = SeqList(
        [Seq(text, alphabet) for text in ("ABCDD", "AAAAD", "AAABD", "AAACD")],
        alphabet,
    )
    tally = aligned.profile()

    # One row of counts (A, B, C, D) per alignment column.
    expected_rows = (
        [4, 0, 0, 0],
        [3, 1, 0, 0],
        [3, 0, 1, 0],
        [1, 1, 1, 1],
        [0, 0, 0, 4],
    )
    for column, counts in enumerate(expected_rows):
        self.assertEqual(list(tally[column]), counts)

    # The profile can also be indexed by (position, letter).
    self.assertEqual(tally[4, 'D'], 4)

    # Sequences of unequal length are expected to raise ValueError.
    ragged = SeqList([Seq("AAACD", alphabet), Seq("AAACDA", alphabet)], alphabet)
    self.assertRaises(ValueError, ragged.profile)

    # A SeqList built without an alphabet is expected to raise ValueError.
    no_alphabet = SeqList([Seq("AAACD", alphabet), Seq("AAACD", alphabet)])
    self.assertRaises(ValueError, no_alphabet.profile)
def test_repr(self):
    """Smoke test: repr() of a SubMatrix must not raise."""
    letters = Alphabet('ABCD')
    values = np.asarray([
        [1, 2, 3, 4],
        [5, 6, 7, 8],
        [9, 10, 11, 12],
        [13, 14, 15, 16],
    ])
    matrix = SubMatrix(letters, values)
    repr(matrix)
def __init__(self, alphabets, values=None, dtype=None):
    """
    Args:
    - alphabets -- one entry per array dimension. A string or Alphabet
        object gives the dimension an alphabet (and fixes its length to
        the alphabet's length). An integer declares a non-alphabetic
        dimension of that length. None declares a non-alphabetic
        dimension whose length is taken from the values argument.
    - values -- An array of values to be indexed. If None a new zero
        filled array is created. If this argument is not a numpy array
        then the alphabet list must be explicit (cannot contain None.)
    - dtype -- An optional numpy type code.

    Raises:
    - ValueError -- if values does not have the declared number of
        dimensions, or a declared dimension length does not match.
    """

    # Stand-in used for dimensions declared as None, so that indexing
    # such a dimension with a string produces a meaningful error.
    class NullAlphabet(object):
        def ord(self, key):
            raise IndexError(
                "This dimension does not have an alphabet")  # pragma: no cover

        def ords(self, key):
            raise IndexError(
                "This dimension does not have an alphabet")  # pragma: no cover

    indexers = []
    lengths = []
    for entry in alphabets:
        if isinstance(entry, str):
            entry = Alphabet(entry)

        if entry is None:
            lengths.append(None)
            indexers.append(NullAlphabet())
        elif isinstance(entry, Alphabet):
            lengths.append(len(entry))
            indexers.append(entry)
        else:
            # Anything else is treated as an explicit dimension length.
            lengths.append(int(entry))  # pragma: no cover
            indexers.append(None)  # pragma: no cover

    shape = tuple(lengths)

    if values is None:
        values = np.zeros(shape=shape, dtype=dtype)
    else:
        values = np.asarray(values, dtype=dtype)

    actual_shape = values.shape
    if len(shape) != len(actual_shape):
        raise ValueError(
            "The values array is the wrong shape.")  # pragma: no cover

    # Dimensions declared as None accept any length.
    mismatch = any(
        declared is not None and declared != actual
        for declared, actual in zip(shape, actual_shape)
    )
    if mismatch:
        raise ValueError(
            "The values array is the wrong shape.")  # pragma: no cover

    self.array = values
    self.alphabets = tuple(indexers)
def test_create_alphabet(self):
    """Check which strings the Alphabet constructor accepts."""
    rejected = (
        "alphabet",    # Alphabet contains repeated character
        "alph\x00",    # Alphabet contains null character
    )
    for letters in rejected:
        self.assertRaises(ValueError, Alphabet, letters)

    # A string of distinct characters must be accepted.
    Alphabet("alphbet")
def test_parse_prior_equiprobable(self):
    """Check parse_prior() for the 'equiprobable' keyword."""
    # Keyword weight with a protein alphabet.
    expected = 20. * equiprobable_distribution(20)
    observed = parse_prior(
        'equiprobable', unambiguous_protein_alphabet, weight=20.)
    self.assertTrue(all(expected == observed))

    # Positional weight; the keyword carries extra whitespace and
    # mixed case.
    expected = 1.2 * equiprobable_distribution(3)
    observed = parse_prior(' equiprobablE  ', Alphabet('123'), 1.2)
    self.assertTrue(all(expected == observed))
def test_parse_prior_equiprobable(self):
    """Check parse_prior() for the "equiprobable" keyword."""
    # Keyword weight with a protein alphabet.
    expected = 20.0 * equiprobable_distribution(20)
    observed = parse_prior(
        "equiprobable", unambiguous_protein_alphabet, weight=20.0)
    self.assertTrue(all(expected == observed))

    # Positional weight; the keyword carries extra whitespace and
    # mixed case.
    expected = 1.2 * equiprobable_distribution(3)
    observed = parse_prior(" equiprobablE  ", Alphabet("123"), 1.2)
    self.assertTrue(all(expected == observed))
def complement(self):
    """Complement nucleic acid sequence.

    Works in place: the columns are temporarily relabelled with the
    complement of each letter, the data is reindexed back into the
    original alphabet order, and the resulting array replaces
    ``self.array``.

    If ``reindex`` raises, ``self.alphabets`` is restored before the
    exception propagates, so the object is never left labelled with the
    complement alphabet.
    """
    from weblogo.seq import Seq, Alphabet

    alphabet = self.alphabet
    complement_alphabet = Alphabet(Seq(alphabet, alphabet).complement())

    # Temporarily relabel the alphabet dimension; this must always be
    # undone, even if reindexing fails.
    self.alphabets = (None, complement_alphabet)
    try:
        m = self.reindex(alphabet)
    finally:
        self.alphabets = (None, alphabet)

    self.array = m.array
def test_fail_get(self):
    """Check that lookups with letters outside the alphabet raise IndexError."""
    letters = Alphabet('ABCD')
    values = np.asarray([
        [1, 2, 3, 4],
        [5, 6, 7, 8],
        [9, 10, 11, 12],
        [13, 14, 15, 16],
    ])
    matrix = SubMatrix(letters, values)

    for bad_key in (('E', 'A'), ('5', '6')):
        self.assertRaises(IndexError, matrix.__getitem__, bad_key)

    # FIXME
    self.assertRaises(IndexError, matrix.index, ('E', 'A'))
def test_fail_get(self):
    """Check that lookups with letters outside the alphabet raise IndexError."""
    letters = Alphabet("ABCD")
    values = np.asarray([
        [1, 2, 3, 4],
        [5, 6, 7, 8],
        [9, 10, 11, 12],
        [13, 14, 15, 16],
    ])
    matrix = SubMatrix(letters, values)

    for bad_key in (("E", "A"), ("5", "6")):
        self.assertRaises(IndexError, matrix.__getitem__, bad_key)

    # FIXME
    self.assertRaises(IndexError, matrix.index, ("E", "A"))
def test_get(self):
    """Check that SubMatrix.index() agrees with element-wise lookup.

    ``index((s1, s2))`` is expected to return a table whose entry
    ``[m, n]`` equals ``matrix[s1[m], s2[n]]``.
    """
    ab = Alphabet('ABCD')
    ar = np.asarray([[1, 2, 3, 4],
                     [5, 6, 7, 8],
                     [9, 10, 11, 12],
                     [13, 14, 15, 16]])
    s = SubMatrix(ab, ar)

    s1 = 'DCCBBBAAA'
    s2 = 'BA'
    v = s.index((s1, s2))

    # assertEqual (not a bare ``assert``) so the check still runs under
    # ``python -O`` and reports both values on failure.
    for m, i in enumerate(s1):
        for n, j in enumerate(s2):
            self.assertEqual(s[i, j], v[m, n])
def __init__(self, alphabet, array=None, typeof=None, name=None,
             description=None, scale=None):
    """Create a square matrix indexed on both axes by the same alphabet.

    Args:
    - alphabet -- the alphabet used for both dimensions.
    - array -- optional values, forwarded to AlphabeticArray.
    - typeof -- optional type code, forwarded to AlphabeticArray.
    - name -- stored as self.name.
    - description -- stored as self.description.
    - scale -- stored as self.scale.
    """
    both_axes = (alphabet, alphabet)
    AlphabeticArray.__init__(self, both_axes, array, typeof)

    self.alphabet = Alphabet(alphabet)
    self.name = name
    self.description = description
    self.scale = scale
def test_read_alphabets(self):
    """Check SubMatrix.read() with compatible and incompatible alphabets."""
    # Incompatible alphabets: reading test_matrix3 without an explicit
    # alphabet is expected to fail.
    stream = StringIO(test_matrix3)
    self.assertRaises(ValueError, SubMatrix.read, stream)

    # The same data reads cleanly with this explicit alphabet.
    stream = StringIO(test_matrix3)
    SubMatrix.read(stream, alphabet=Alphabet('ARNDCQEGHILKMFPSTWYV'))

    # test_matrix1 is expected to be rejected for the unambiguous
    # protein alphabet.
    other_stream = StringIO(test_matrix1)
    self.assertRaises(
        ValueError, SubMatrix.read, other_stream, unambiguous_protein_alphabet)
def test_which_alphabet(self):
    """Check that Alphabet.which() guesses the expected alphabet.

    Covers one in-memory sequence and four bundled data files. Each data
    stream is closed in a ``finally`` clause so that a failing assertion
    does not leak open file handles.
    """
    a = Alphabet.which(Seq("ARNDCQEGHILKMFPSTWYVX"))
    self.assertEqual(a, unambiguous_protein_alphabet)

    cases = (
        ('cap.fa', unambiguous_dna_alphabet),
        ('cox2.msf', unambiguous_protein_alphabet),
        ('Rv3829c.fasta', unambiguous_protein_alphabet),
        ('chain_B.fasta', unambiguous_protein_alphabet),
    )
    for filename, expected in cases:
        stream = data_stream(filename)
        try:
            seqs = seq_io.read(stream)
            self.assertEqual(Alphabet.which(seqs), expected)
        finally:
            stream.close()
def test_ords(self):
    """Exercise SeqList.ords() with and without an alphabet."""
    texts = (
        "ACGTURYBDHVNACGTURYSWKMBDHVN",
        "ACGTURYSDHVNACGTURYSWKMBDHVN",
        "ACGTURSWKMBDHVNACGTURKMBDHVN",
    )
    s0, s1, s2 = [Seq(text, nucleic_alphabet) for text in texts]

    with_alphabet = SeqList([s0, s1, s2], nucleic_alphabet)
    with_alphabet.ords()
    # self.assertEqual( a.shape, (3, 28) )

    # Fails if seqs are of different lengths
    # FIXME?
    # s3 = Seq("ACGTUR", nucleic_alphabet )
    # seqs2 = SeqList( [ s0,s1,s3,s2], nucleic_alphabet)
    # self.assertRaises(ValueError, seqs2.ords )

    # Use a different alphabet
    with_alphabet.ords(nucleic_alphabet)

    # No alphabet
    without_alphabet = SeqList([s0, s1, s2])
    without_alphabet.ords(alphabet=Alphabet("ABC"))

    # Fail if no alphabet
    self.assertRaises(ValueError, without_alphabet.ords)
def test_alphabet_alphabetic(self):
    """Check Alphabet.alphabetic() with and without a foreign character."""
    alphabet = Alphabet("alphbet")

    only_members = alphabet.alphabetic("alphbet")
    self.assertTrue(only_members)

    # 'X' is not part of the alphabet.
    has_foreign = alphabet.alphabetic("alphbetX")
    self.assertFalse(has_foreign)
def read_transfac(cls, fin, alphabet=None):
    """Parse a TRANSFAC-format PWM from a file.

    Reading starts at the first line whose first token is "PO" or "P0"
    and stops at the first following line whose first token is in
    ``cls._TRANSFAC_DELIM_LINES``. Blank lines and lines starting with
    "#" are ignored.

    Args:
        fin: An iterable of text lines (e.g. an open file).
        alphabet: Optional alphabet (string or Alphabet). It is kept only
            if ``defacto_alphabet.alphabetic(alphabet)`` accepts it;
            otherwise the alphabet found in the file is used. If omitted,
            the unambiguous RNA, DNA and protein alphabets are tried in
            that order, falling back to the alphabet found in the file.

    Returns:
        A Motif object, representing the provided PWM along with an
        inferred or provided alphabet.

    Raises:
        ValueError: if fewer than two usable lines are found, the header
            line is missing or malformed, or rows have inconsistent
            lengths.
    """
    items = []

    start = False
    for line in fin:
        # ``not line.strip()`` also covers an empty string, which
        # ``line[0]`` would turn into an IndexError.
        if not line.strip() or line.startswith("#"):
            continue  # pragma: no cover

        stuff = line.split()

        if stuff[0] in ("PO", "P0"):
            start = True

        # 'XX' delimiters may precede the first motif
        if start:
            if stuff[0] in cls._TRANSFAC_DELIM_LINES:
                break
            items.append(stuff)

    if len(items) < 2:
        raise ValueError("Vacuous file.")

    # Is the first line a header line?
    header = items.pop(0)
    hcols = len(header)
    cols = len(items[0])
    if not (header[0] in ("PO", "P0")
            or hcols == cols - 1
            or hcols == cols - 2):
        raise ValueError("Missing header line!")  # pragma: no cover

    # Do all lines (except the first) contain the same number of items?
    for i, row in enumerate(items):
        if cols != len(row):
            raise ValueError("Inconsistant length, row: {}".format(
                i))  # pragma: no cover

    # Vertical or horizontal arrangement?
    if header[0] in ("PO", "P0"):
        header.pop(0)

    # The header holds positions only if every item is an integer;
    # otherwise it is taken to hold the alphabet.
    position_header = True

    for h in header:
        if not ischar(h):
            raise ValueError("Expected a single character per header "
                             'item, but got "{}" as one item'.format(
                                 h))  # pragma: no cover
        if not isint(h):
            position_header = False

    alphabet_header = not position_header

    # Check row headers
    if alphabet_header:
        for i, r in enumerate(items):
            if not isint(r[0]) and r[0][0] != "P":
                raise ValueError("Expected position "
                                 "as first item on line {}".format(
                                     i))  # pragma: no cover
            r.pop(0)
        defacto_alphabet = "".join(header)
    else:
        a = []  # pragma: no cover
        for i, r in enumerate(items):  # pragma: no cover
            if not ischar(r[0]) and r[0][0] != "P":  # pragma: no cover
                raise ValueError("Expected position "  # pragma: no cover
                                 "as first item on line {}".format(
                                     i))  # pragma: no cover
            a.append(r.pop(0))  # pragma: no cover
        defacto_alphabet = "".join(a)  # pragma: no cover

    # Check defacto_alphabet
    defacto_alphabet = Alphabet(defacto_alphabet)

    if alphabet:
        alphabet = Alphabet(alphabet)
        if not defacto_alphabet.alphabetic(alphabet):
            # Allow alphabet to be a superset of defacto_alphabet
            alphabet = defacto_alphabet
    else:
        alphabets = (
            unambiguous_rna_alphabet,
            unambiguous_dna_alphabet,
            unambiguous_protein_alphabet,
        )
        for a in alphabets:
            if defacto_alphabet.alphabetic(a):
                alphabet = a
                break
        if not alphabet:
            alphabet = defacto_alphabet  # pragma: no cover

    # The last item of each row may be extra cruft. Remove
    if len(items[0]) == len(header) + 1:
        for r in items:
            r.pop()

    # items should now be a list of lists of numbers (as strings)
    rows = len(items)
    cols = len(items[0])
    matrix = np.zeros((rows, cols), dtype=np.float64)
    for r in range(rows):
        for c in range(cols):
            matrix[r, c] = float(items[r][c])

    if position_header:
        # ndarray.transpose() returns a new view and leaves the original
        # untouched, so the result has to be rebound.
        matrix = matrix.transpose()  # pragma: no cover

    return Motif(defacto_alphabet, matrix).reindex(alphabet)
try: return object.__getattr__(self, name) except AttributeError: return getattr(self.array, name) def __setattr__(self, name, value): try: return object.__setattr__(self, name, value) except AttributeError: # pragma: no cover return setattr(self.array, name, value) # pragma: no cover # End class AlphabeticArray # TODO: move to seq? submatrix_alphabet = Alphabet("ARNDCQEGHILKMFPSTWYVBZX") class SubMatrix(AlphabeticArray): """A two dimensional array indexed by an Alphabet. Used to hold substitution matrices and similar information. Various standard substitution matrices are available from the data package >>> from weblogo import data >>> mat = SubMatrix.read(data.data_stream('blosum100')) Attr: - alphabet -- An Alphabet - array -- A numpy array - name -- The name of this matrix (if any) as a string. - description -- The description, if any.
def test_ords(self):
    """Check that Seq.ords() returns each character's alphabet index."""
    alphabet = Alphabet("ABC")
    sequence = Seq("ABCCBA", alphabet)

    indices = list(sequence.ords())
    self.assertEqual(indices, [0, 1, 2, 2, 1, 0])
def test_repr(self):
    """Smoke test: repr() and str() of an Alphabet must not raise."""
    alphabet = Alphabet("kjdahf")
    for render in (repr, str):
        render(alphabet)
def test_tally_nonalphabetic(self):
    """Check Seq.tally() against an alphabet smaller than the sequence's own."""
    sequence = Seq("AGTCAGCTACGACGCGC", dna_alphabet)
    subset = Alphabet("AC")

    counts = sequence.tally(subset)

    # One count per letter of the tallying alphabet: 4 x 'A', 6 x 'C'.
    self.assertEqual(2, len(counts))
    self.assertEqual(list(counts), [4, 6])
def test_none(self):
    """Check that Alphabet(None) compares equal to the generic alphabet."""
    from_none = Alphabet(None)
    self.assertEqual(from_none, generic_alphabet)
def test_alphabet_chrs(self):
    """Check that Alphabet.chrs() turns a tuple of indices into a Seq."""
    alphabet = Alphabet("alph")
    expected = Seq("ppla", alphabet)

    self.assertEqual(expected, alphabet.chrs((2, 2, 1, 0)))