def read_transfac(fin, alphabet=None):
    """Parse a TRANSFAC-style position matrix from an iterable of lines.

    Lines are skipped until one whose first token is 'PO' or 'P0' is seen;
    that line is the header. Reading stops at the first subsequent line
    whose first token is 'XX' or '//'. Blank lines and lines starting with
    '#' are ignored.

    The header (after the 'PO'/'P0' token) holds either alphabet letters,
    in which case each row starts with a position, or integer positions,
    in which case each row starts with an alphabet character.

    Parameters:
        fin      -- an iterable of text lines (e.g. an open file).
        alphabet -- optional alphabet the matrix must be compatible with.
                    If omitted, the unambiguous RNA, DNA and protein
                    alphabets are tried in that order, falling back to
                    the alphabet found in the file.

    Returns:
        The result of ``Motif(defacto_alphabet, matrix).reindex(alphabet)``.

    Raises:
        ValueError -- if fewer than two usable lines are found, rows have
                      inconsistent lengths, the header cannot be
                      interpreted, a row label is malformed, or the
                      supplied alphabet is incompatible with the data.
    """
    items = []
    start = True
    for line in fin:
        # 'not line' guards against empty strings, which would otherwise
        # make line[0] raise IndexError.
        if not line or line.isspace() or line[0] == '#':
            continue
        stuff = line.split()
        if start and stuff[0] != 'PO' and stuff[0] != 'P0':
            continue
        if stuff[0] == 'XX' or stuff[0] == '//':
            break
        start = False
        items.append(stuff)

    if len(items) < 2:
        raise ValueError("Vacuous file.")

    # Is the first line a header line?
    header = items.pop(0)
    hcols = len(header)
    cols = len(items[0])
    if not (header[0] == 'PO' or header[0] == 'P0' or
            hcols == cols - 1 or hcols == cols - 2):
        raise ValueError("Missing header line!")

    # Do all lines (except the first) contain the same number of items?
    for i in range(1, len(items)):
        if cols != len(items[i]):
            raise ValueError("Inconsistant length, row %d: " % i)

    # Vertical or horizontal arrangement?
    if header[0] == 'PO' or header[0] == 'P0':
        header.pop(0)

    position_header = True
    alphabet_header = True
    for h in header:
        if not isint(h):
            position_header = False
        if not str.isalpha(h):
            alphabet_header = False

    if not position_header and not alphabet_header:
        raise ValueError("Can't parse header: %s" % str(header))

    if position_header and alphabet_header:
        raise ValueError("Can't parse header")

    # Check row headers
    if alphabet_header:
        for i, r in enumerate(items):
            if not isint(r[0]):
                # The original passed the row index as a third 'raise'
                # operand instead of formatting it into the message.
                raise ValueError(
                    "Expected position as first item on line %d" % i)
            r.pop(0)
        defacto_alphabet = ''.join(header)
    else:
        a = []
        for i, r in enumerate(items):
            if not ischar(r[0]):
                raise ValueError(
                    "Expected position as first item on line %d" % i)
            a.append(r.pop(0))
        defacto_alphabet = ''.join(a)

    # Check defacto_alphabet
    defacto_alphabet = Alphabet(defacto_alphabet)

    if alphabet:
        if not defacto_alphabet.alphabetic(alphabet):
            raise ValueError("Incompatible alphabets: %s , %s (defacto)"
                             % (alphabet, defacto_alphabet))
    else:
        alphabets = (unambiguous_rna_alphabet,
                     unambiguous_dna_alphabet,
                     unambiguous_protein_alphabet,
                     )
        for a in alphabets:
            if defacto_alphabet.alphabetic(a):
                alphabet = a
                break
        if not alphabet:
            alphabet = defacto_alphabet

    # The last item of each row may be extra cruft. Remove
    if len(items[0]) == len(header) + 1:
        for r in items:
            r.pop()

    # items should now be a list of lists of numbers (as strings)
    rows = len(items)
    cols = len(items[0])
    matrix = na.zeros((rows, cols), dtype=na.float64)
    for r in range(rows):
        for c in range(cols):
            matrix[r, c] = float(items[r][c])

    if position_header:
        # transpose() returns a new array rather than modifying in place
        # (assuming 'na' is numpy), so the result must be rebound.
        matrix = matrix.transpose()

    return Motif(defacto_alphabet, matrix).reindex(alphabet)
def read_transfac(cls, fin, alphabet=None):
    """Parse a TRANSFAC-format PWM from an iterable of lines.

    Lines are ignored until one whose first token is 'PO' or 'P0'
    appears; that line is the header. After that, parsing stops at the
    first line whose first token is in ``cls._TRANSFAC_DELIM_LINES``.
    Blank lines and lines starting with '#' are skipped throughout.

    Every header item (after 'PO'/'P0') must be a single character. If
    all of them are integers the header is treated as positions and each
    row must start with an alphabet character; otherwise the header is
    treated as the alphabet and each row must start with a position.

    Parameters:
        fin      -- an iterable of text lines (e.g. an open file).
        alphabet -- optional alphabet. It is wrapped in ``Alphabet``; if
                    the alphabet found in the file is not alphabetic
                    with respect to it, the file's own alphabet is used
                    instead. When omitted, the unambiguous RNA, DNA and
                    protein alphabets are tried in order before falling
                    back to the file's alphabet.

    Returns:
        The result of ``Motif(defacto_alphabet, matrix).reindex(alphabet)``.

    Raises:
        ValueError -- if fewer than two usable lines are found, rows have
                      inconsistent lengths, a header item is not a single
                      character, or a row label is malformed.
    """
    items = []
    start = False
    for line in fin:
        # 'not line' guards against empty strings, which would otherwise
        # make line[0] raise IndexError.
        if not line or line.isspace() or line[0] == '#':
            continue

        stuff = line.split()

        if stuff[0] == 'PO' or stuff[0] == 'P0':
            start = True

        # 'XX' delimiters may precede the first motif
        if start:
            if stuff[0] in cls._TRANSFAC_DELIM_LINES:
                break
            else:
                items.append(stuff)

    if len(items) < 2:
        raise ValueError("Vacuous file.")

    # Is the first line a header line?
    header = items.pop(0)
    hcols = len(header)
    cols = len(items[0])
    if not (header[0] == 'PO' or header[0] == 'P0' or
            hcols == cols - 1 or hcols == cols - 2):
        raise ValueError("Missing header line!")

    # Do all lines (except the first) contain the same number of items?
    for i in range(1, len(items)):
        if cols != len(items[i]):
            raise ValueError("Inconsistant length, row: {}".format(i))

    # Vertical or horizontal arrangement?
    if header[0] == 'PO' or header[0] == 'P0':
        header.pop(0)

    position_header = True

    for h in header:
        if not ischar(h):
            raise ValueError("Expected a single character per header "
                             "item, but got \"{}\" as one item".format(h))
        if not isint(h):
            position_header = False

    alphabet_header = not position_header

    # Check row headers
    if alphabet_header:
        for i, r in enumerate(items):
            if not isint(r[0]) and r[0][0] != 'P':
                raise ValueError("Expected position "
                                 "as first item on line {}".format(i))
            r.pop(0)
        defacto_alphabet = ''.join(header)
    else:
        a = []
        for i, r in enumerate(items):
            if not ischar(r[0]) and r[0][0] != 'P':
                raise ValueError("Expected position "
                                 "as first item on line {}".format(i))
            a.append(r.pop(0))
        defacto_alphabet = ''.join(a)

    # Check defacto_alphabet
    defacto_alphabet = Alphabet(defacto_alphabet)

    if alphabet:
        alphabet = Alphabet(alphabet)
        if not defacto_alphabet.alphabetic(alphabet):
            # Fall back to the alphabet actually found in the file when
            # it is not compatible with the one supplied.
            alphabet = defacto_alphabet
    else:
        alphabets = (unambiguous_rna_alphabet,
                     unambiguous_dna_alphabet,
                     unambiguous_protein_alphabet,
                     )
        for a in alphabets:
            if defacto_alphabet.alphabetic(a):
                alphabet = a
                break
        if not alphabet:
            alphabet = defacto_alphabet

    # The last item of each row may be extra cruft. Remove
    if len(items[0]) == len(header) + 1:
        for r in items:
            r.pop()

    # items should now be a list of lists of numbers (as strings)
    rows = len(items)
    cols = len(items[0])
    matrix = na.zeros((rows, cols), dtype=na.float64)
    for r in range(rows):
        for c in range(cols):
            matrix[r, c] = float(items[r][c])

    if position_header:
        # transpose() returns a new array rather than modifying in place
        # (assuming 'na' is numpy), so the result must be rebound.
        matrix = matrix.transpose()

    return Motif(defacto_alphabet, matrix).reindex(alphabet)
def read_transfac(fin, alphabet=None):
    """Parse a TRANSFAC-style position matrix from an iterable of lines.

    Lines are skipped until one whose first token is 'PO' or 'P0' is seen;
    that line is the header. Reading stops at the first subsequent line
    whose first token is 'XX' or '//'. Blank lines and lines starting with
    '#' are ignored.

    The header (after the 'PO'/'P0' token) holds either alphabet letters,
    in which case each row starts with a position, or integer positions,
    in which case each row starts with an alphabet character. Row labels
    beginning with 'P' are also accepted.

    Parameters:
        fin      -- an iterable of text lines (e.g. an open file).
        alphabet -- optional alphabet the matrix must be compatible with.
                    If omitted, the unambiguous RNA, DNA and protein
                    alphabets are tried in that order, falling back to
                    the alphabet found in the file.

    Returns:
        The result of ``Motif(defacto_alphabet, matrix).reindex(alphabet)``.

    Raises:
        ValueError -- if fewer than two usable lines are found, rows have
                      inconsistent lengths, the header cannot be
                      interpreted, a row label is malformed, or the
                      supplied alphabet is incompatible with the data.
    """
    items = []
    start = True
    for line in fin:
        # 'not line' guards against empty strings, which would otherwise
        # make line[0] raise IndexError.
        if not line or line.isspace() or line[0] == '#':
            continue
        stuff = line.split()
        if start and stuff[0] != 'PO' and stuff[0] != 'P0':
            continue
        if stuff[0] == 'XX' or stuff[0] == '//':
            break
        start = False
        items.append(stuff)

    if len(items) < 2:
        raise ValueError("Vacuous file.")

    # Is the first line a header line?
    header = items.pop(0)
    hcols = len(header)
    cols = len(items[0])
    if not (header[0] == 'PO' or header[0] == 'P0' or
            hcols == cols - 1 or hcols == cols - 2):
        raise ValueError("Missing header line!")

    # Do all lines (except the first) contain the same number of items?
    for i in range(1, len(items)):
        if cols != len(items[i]):
            raise ValueError("Inconsistant length, row %d: " % i)

    # Vertical or horizontal arrangement?
    if header[0] == 'PO' or header[0] == 'P0':
        header.pop(0)

    position_header = True
    alphabet_header = True
    for h in header:
        if not isint(h):
            position_header = False
        if not str.isalpha(h):
            alphabet_header = False

    if not position_header and not alphabet_header:
        raise ValueError("Can't parse header: %s" % str(header))

    if position_header and alphabet_header:
        raise ValueError("Can't parse header")

    # Check row headers
    if alphabet_header:
        for i, r in enumerate(items):
            if not isint(r[0]) and r[0][0] != 'P':
                raise ValueError(
                    "Expected position as first item on line %d" % i)
            r.pop(0)
        defacto_alphabet = ''.join(header)
    else:
        a = []
        for i, r in enumerate(items):
            if not ischar(r[0]) and r[0][0] != 'P':
                raise ValueError(
                    "Expected position as first item on line %d" % i)
            a.append(r.pop(0))
        defacto_alphabet = ''.join(a)

    # Check defacto_alphabet
    defacto_alphabet = Alphabet(defacto_alphabet)

    if alphabet:
        if not defacto_alphabet.alphabetic(alphabet):
            raise ValueError("Incompatible alphabets: %s , %s (defacto)"
                             % (alphabet, defacto_alphabet))
    else:
        alphabets = (
            unambiguous_rna_alphabet,
            unambiguous_dna_alphabet,
            unambiguous_protein_alphabet,
        )
        for a in alphabets:
            if defacto_alphabet.alphabetic(a):
                alphabet = a
                break
        if not alphabet:
            alphabet = defacto_alphabet

    # The last item of each row may be extra cruft. Remove
    if len(items[0]) == len(header) + 1:
        for r in items:
            r.pop()

    # items should now be a list of lists of numbers (as strings)
    rows = len(items)
    cols = len(items[0])
    matrix = na.zeros((rows, cols), dtype=na.float64)
    for r in range(rows):
        for c in range(cols):
            matrix[r, c] = float(items[r][c])

    if position_header:
        # transpose() returns a new array rather than modifying in place
        # (assuming 'na' is numpy), so the result must be rebound.
        matrix = matrix.transpose()

    return Motif(defacto_alphabet, matrix).reindex(alphabet)
def read_swissRegulon(fin, alphabet=None):
    """Parse a SwissRegulon-style position matrix from an iterable of lines.

    Lines are skipped until one whose first token is 'P0' or 'PO' is seen;
    that line is the header. Blank lines, lines starting with '#', and
    lines consisting solely of '//' are ignored. Only the first five
    whitespace-separated fields of each line are kept, so any trailing
    columns are dropped.

    The header (after the 'PO'/'P0' token) must consist of alphabet
    letters, and each row must start with a position (an integer, or a
    label beginning with 'P').

    Parameters:
        fin      -- an iterable of text lines (e.g. an open file).
        alphabet -- optional alphabet the matrix must be compatible with.
                    If omitted, the unambiguous RNA, DNA and protein
                    alphabets are tried in that order, falling back to
                    the alphabet found in the file.

    Returns:
        The result of ``Motif(defacto_alphabet, matrix).reindex(alphabet)``.

    Raises:
        ValueError -- if fewer than two usable lines are found, rows have
                      inconsistent lengths, the header cannot be
                      interpreted as an alphabet, a row label is
                      malformed, or the supplied alphabet is incompatible
                      with the data.
    """
    import re

    items = []
    start = True
    for line in fin:
        # 'not line' guards against empty strings, which would otherwise
        # make line[0] raise IndexError.
        if (not line or line.isspace() or line[0] == '#'
                or re.search('^//$', line)):
            continue
        stuff = line.split()
        if start and stuff[0] != 'P0' and stuff[0] != 'PO':
            continue
        start = False
        items.append(stuff[0:5])

    if len(items) < 2:
        raise ValueError("Vacuous file.")

    # Is the first line a header line?
    header = items.pop(0)[0:5]
    hcols = len(header)
    cols = len(items[0])
    if not (header[0] == 'P0' or header[0] == 'PO' or
            hcols == cols - 1 or hcols == cols - 2):
        raise ValueError("Missing header line!")

    # Do all lines (except the first) contain the same number of items?
    for i in range(1, len(items)):
        if cols != len(items[i]):
            raise ValueError("Inconsistant length, row %d: " % i)

    # Vertical or horizontal arrangement?
    if header[0] == 'PO' or header[0] == 'P0':
        header.pop(0)

    position_header = True
    alphabet_header = True
    for h in header:
        if not isint(h):
            position_header = False
        if not str.isalpha(h):
            alphabet_header = False

    if not position_header and not alphabet_header:
        raise ValueError("Can't parse header: %s" % str(header))

    if position_header and alphabet_header:
        raise ValueError("Can't parse header")

    if not alphabet_header:
        # Only the alphabet-header layout is handled here. Without this
        # check, a header of positions left defacto_alphabet unassigned
        # and the function failed with UnboundLocalError.
        raise ValueError(
            "Expected an alphabet header, got positions: %s" % str(header))

    # Check row headers
    for i, r in enumerate(items):
        if not isint(r[0]) and r[0][0] != 'P':
            raise ValueError(
                "Expected position as first item on line %d" % i)
        r.pop(0)
    defacto_alphabet = ''.join(header)

    # Check defacto_alphabet
    defacto_alphabet = Alphabet(defacto_alphabet)

    if alphabet:
        if not defacto_alphabet.alphabetic(alphabet):
            raise ValueError("Incompatible alphabets: %s , %s (defacto)"
                             % (alphabet, defacto_alphabet))
    else:
        alphabets = (unambiguous_rna_alphabet,
                     unambiguous_dna_alphabet,
                     unambiguous_protein_alphabet,
                     )
        for a in alphabets:
            if defacto_alphabet.alphabetic(a):
                alphabet = a
                break
        if not alphabet:
            alphabet = defacto_alphabet

    # items should now be a list of lists of numbers (as strings)
    rows = len(items)
    cols = len(items[0])
    matrix = na.zeros((rows, cols), dtype=na.float64)
    for r in range(rows):
        for c in range(cols):
            matrix[r, c] = float(items[r][c])

    # No transpose is needed: rows are positions and columns are letters
    # in the only layout that reaches this point.
    return Motif(defacto_alphabet, matrix).reindex(alphabet)