def read_transfac(cls, fin, alphabet=None):
    """Parse a TRANSFAC-format PWM from a file.

    Returns a Motif object, representing the provided PWM along with an
    inferred or provided alphabet.

    Args:
        fin: Iterable of text lines (for example an open file). Blank
            lines and lines starting with "#" are skipped. Everything
            before the first line whose leading token is "PO" or "P0" is
            ignored, and reading stops at the first subsequent line whose
            leading token is in ``cls._TRANSFAC_DELIM_LINES``.
        alphabet: Optional alphabet to reindex the motif to. When falsy,
            the first of the unambiguous RNA, DNA and protein alphabets
            accepted by ``alphabetic()`` on the file's own alphabet is
            used, falling back to the file's own alphabet.

    Returns:
        ``Motif(defacto_alphabet, matrix).reindex(alphabet)``, where
        ``matrix`` has one row per data line for a vertical layout
        (letters in the header) and is transposed for a horizontal layout
        (integers in the header).

    Raises:
        ValueError: If fewer than two lines were collected, the header
            line is missing, data rows differ in length, a header or row
            label fails validation, or a matrix entry is not a number.
    """
    items = []
    started = False
    for line in fin:
        # `not line.strip()` also covers the empty string: str.isspace()
        # is False for "" and the `line[0]` lookup would then raise
        # IndexError for iterables that yield empty lines.
        if not line.strip() or line[0] == "#":
            continue  # pragma: no cover

        fields = line.split()
        if fields[0] in ("PO", "P0"):
            started = True

        # 'XX' delimiters may precede the first motif
        if not started:
            continue
        if fields[0] in cls._TRANSFAC_DELIM_LINES:
            break
        items.append(fields)

    if len(items) < 2:
        raise ValueError("Vacuous file.")

    # Is the first line a header line?
    header = items.pop(0)
    cols = len(items[0])
    tagged_header = header[0] in ("PO", "P0")
    if not (tagged_header or len(header) in (cols - 1, cols - 2)):
        raise ValueError("Missing header line!")  # pragma: no cover

    # Do all lines (except the first) contain the same number of items?
    for i, row in enumerate(items[1:], start=1):
        if len(row) != cols:
            raise ValueError(
                "Inconsistant length, row: {}".format(i)
            )  # pragma: no cover

    # Vertical or horizontal arrangement?
    if tagged_header:
        header.pop(0)

    # The header holds positions only if every item passes isint();
    # otherwise it is taken to hold the alphabet.
    # NOTE(review): ischar/isint are defined outside this block; their
    # names suggest "single character" / "parses as int" -- confirm.
    position_header = True
    for h in header:
        if not ischar(h):
            raise ValueError(
                "Expected a single character per header "
                'item, but got "{}" as one item'.format(h)
            )  # pragma: no cover
        if not isint(h):
            position_header = False

    # Check row headers, stripping the leading label from every row.
    if not position_header:
        for i, row in enumerate(items):
            if not isint(row[0]) and row[0][0] != "P":
                raise ValueError(
                    "Expected position as first item on line {}".format(i)
                )  # pragma: no cover
            row.pop(0)
        defacto_alphabet = "".join(header)
    else:
        # NOTE(review): the row labels are collected as the alphabet
        # here, yet the error text still says "position" -- confirm the
        # intended wording with the maintainers.
        letters = []  # pragma: no cover
        for i, row in enumerate(items):  # pragma: no cover
            if not ischar(row[0]) and row[0][0] != "P":  # pragma: no cover
                raise ValueError(
                    "Expected position as first item on line {}".format(i)
                )  # pragma: no cover
            letters.append(row.pop(0))  # pragma: no cover
        defacto_alphabet = "".join(letters)  # pragma: no cover

    # Check defacto_alphabet
    defacto_alphabet = Alphabet(defacto_alphabet)

    if alphabet:
        alphabet = Alphabet(alphabet)
        # The requested alphabet may be a superset of the file's
        # alphabet; when alphabetic() rejects it, use the file's own.
        if not defacto_alphabet.alphabetic(alphabet):
            alphabet = defacto_alphabet
    else:
        candidates = (
            unambiguous_rna_alphabet,
            unambiguous_dna_alphabet,
            unambiguous_protein_alphabet,
        )
        for candidate in candidates:
            if defacto_alphabet.alphabetic(candidate):
                alphabet = candidate
                break
        if not alphabet:
            alphabet = defacto_alphabet  # pragma: no cover

    # The last item of each row may be extra cruft. Remove
    if len(items[0]) == len(header) + 1:
        for row in items:
            row.pop()

    # items should now be a list of lists of numbers (as strings)
    matrix = np.array(
        [[float(value) for value in row] for row in items],
        dtype=np.float64,
    )

    if position_header:
        # ndarray.transpose() returns a new view and leaves the original
        # untouched, so the result has to be bound back to `matrix`.
        matrix = matrix.transpose()  # pragma: no cover

    return Motif(defacto_alphabet, matrix).reindex(alphabet)
def test_alphabet_alphabetic(self):
    """Check alphabetic() against the alphabet's own string and a longer one.

    The result must be truthy for the exact string the Alphabet was built
    from, and falsy once an extra "X" is appended to that string.
    """
    alpha = Alphabet("alphbet")

    own_letters_ok = alpha.alphabetic("alphbet")
    self.assertTrue(own_letters_ok)

    extra_letter_ok = alpha.alphabetic("alphbetX")
    self.assertTrue(not extra_letter_ok)