示例#1
0
文件: matrix.py 项目: jnktsj/heatlogo
    def read_transfac(fin, alphabet=None):
        """Parse a TRANSFAC-style matrix from an iterable of text lines.

        Blank lines and lines starting with ``#`` are ignored. Everything
        before the first ``PO``/``P0`` line is skipped; reading stops at the
        first ``XX`` or ``//`` line after that.

        The ``PO``/``P0`` line is the header. Its remaining items must be
        either all alphabetic (one symbol per column, each data row then
        starts with an integer position) or all integers (one position per
        column, each data row then starts with a symbol). In the second
        layout the matrix is transposed so that rows are positions and
        columns are symbols, as in the first layout.

        Args:
            fin: Iterable yielding the lines of the file.
            alphabet: Optional alphabet to reindex the result to. When
                omitted, the unambiguous RNA, DNA and protein alphabets are
                tried in that order, falling back to the alphabet found in
                the file.

        Returns:
            ``Motif(defacto_alphabet, matrix).reindex(alphabet)``.

        Raises:
            ValueError: If fewer than two matrix lines are found, rows have
                inconsistent lengths, the header cannot be classified, a row
                label is of the wrong kind, or ``alphabet`` is rejected by
                ``defacto_alphabet.alphabetic``.
        """
        items = []
        started = False
        for line in fin:
            # Skip blank lines (including empty strings) and comments.
            if not line.strip() or line.startswith('#'):
                continue
            stuff = line.split()
            if not started:
                if stuff[0] not in ('PO', 'P0'):
                    continue
                started = True
            elif stuff[0] in ('XX', '//'):
                break
            items.append(stuff)
        if len(items) < 2:
            raise ValueError("Vacuous file.")

        # The first collected line is always the 'PO'/'P0' line (collection
        # only starts there), so it is the header; drop its leading tag.
        header = items.pop(0)[1:]

        # Do all data lines contain the same number of items?
        cols = len(items[0])
        for i, row in enumerate(items):
            if len(row) != cols:
                raise ValueError("Inconsistant length, row %d: " % i)

        # Vertical or horizontal arrangement?
        position_header = all(isint(h) for h in header)
        alphabet_header = all(str.isalpha(h) for h in header)

        if not position_header and not alphabet_header:
            raise ValueError("Can't parse header: %s" % str(header))

        if position_header and alphabet_header:
            raise ValueError("Can't parse header")

        # Check and strip the row labels.
        if alphabet_header:
            for i, row in enumerate(items):
                if not isint(row[0]):
                    raise ValueError(
                        "Expected position as first item on line %d" % i)
                row.pop(0)
            defacto_alphabet = ''.join(header)
        else:
            symbols = []
            for i, row in enumerate(items):
                if not ischar(row[0]):
                    raise ValueError(
                        "Expected position as first item on line %d" % i)
                symbols.append(row.pop(0))
            defacto_alphabet = ''.join(symbols)

        # Check defacto_alphabet
        defacto_alphabet = Alphabet(defacto_alphabet)

        if alphabet:
            if not defacto_alphabet.alphabetic(alphabet):
                raise ValueError(
                    "Incompatible alphabets: %s , %s (defacto)"
                    % (alphabet, defacto_alphabet))
        else:
            candidates = (
                unambiguous_rna_alphabet,
                unambiguous_dna_alphabet,
                unambiguous_protein_alphabet,
            )
            for candidate in candidates:
                if defacto_alphabet.alphabetic(candidate):
                    alphabet = candidate
                    break
            if not alphabet:
                alphabet = defacto_alphabet

        # The last item of each row may be extra cruft. Remove
        if len(items[0]) == len(header) + 1:
            for row in items:
                row.pop()

        # items is now a list of lists of numbers (as strings)
        matrix = na.array(
            [[float(value) for value in row] for row in items],
            dtype=na.float64)

        if position_header:
            # transpose() returns a new view; it does not modify in place.
            matrix = matrix.transpose()

        return Motif(defacto_alphabet, matrix).reindex(alphabet)
示例#2
0
    def read_transfac(cls, fin, alphabet=None):
        """Parse a TRANSFAC-format PWM from an iterable of text lines.

        Blank lines and lines starting with ``#`` are ignored. Collection
        starts at the first ``PO``/``P0`` line and stops at the first line
        whose leading token is in ``cls._TRANSFAC_DELIM_LINES``.

        Every header item must satisfy ``ischar``. If all of them also
        satisfy ``isint`` the header lists positions (each data row starts
        with a symbol) and the matrix is transposed so that rows are
        positions; otherwise the header lists the alphabet and each data row
        starts with a position.

        Args:
            fin: Iterable yielding the lines of the file.
            alphabet: Optional alphabet for the result. It is wrapped in
                ``Alphabet``; if ``defacto_alphabet.alphabetic`` rejects it,
                the alphabet found in the file is used instead. When
                omitted, the unambiguous RNA, DNA and protein alphabets are
                tried in that order before falling back to the file's own.

        Returns:
            ``Motif(defacto_alphabet, matrix).reindex(alphabet)``.

        Raises:
            ValueError: If fewer than two matrix lines are found, rows have
                inconsistent lengths, a header item is not a single
                character, or a row label is of the wrong kind.
        """
        items = []

        start = False
        for line in fin:
            # Skip blank lines (including empty strings) and comments.
            if not line.strip() or line.startswith('#'):
                continue

            stuff = line.split()

            if stuff[0] in ('PO', 'P0'):
                start = True

            # 'XX' delimiters may precede the first motif
            if start:
                if stuff[0] in cls._TRANSFAC_DELIM_LINES:
                    break
                items.append(stuff)

        if len(items) < 2:
            raise ValueError("Vacuous file.")

        # The first collected line is always the 'PO'/'P0' line (collection
        # only starts there), so it is the header; drop its leading tag.
        header = items.pop(0)[1:]

        # Do all data lines contain the same number of items?
        cols = len(items[0])
        for i, row in enumerate(items):
            if len(row) != cols:
                raise ValueError("Inconsistant length, row: {}".format(i))

        # Vertical or horizontal arrangement?
        position_header = True
        for h in header:
            # NOTE(review): this rejects any multi-character header item,
            # which presumably includes positions >= 10 - confirm intent.
            if not ischar(h):
                raise ValueError("Expected a single character per header "
                                 "item, but got \"{}\" as one item".format(h))
            if not isint(h):
                position_header = False

        alphabet_header = not position_header

        # Check and strip the row labels.
        if alphabet_header:
            for i, row in enumerate(items):
                if not isint(row[0]) and row[0][0] != 'P':
                    raise ValueError("Expected position "
                                     "as first item on line {}".format(i))
                row.pop(0)
            defacto_alphabet = ''.join(header)
        else:
            symbols = []
            for i, row in enumerate(items):
                if not ischar(row[0]) and row[0][0] != 'P':
                    raise ValueError("Expected position "
                                     "as first item on line {}".format(i))
                symbols.append(row.pop(0))
            defacto_alphabet = ''.join(symbols)

        # Check defacto_alphabet
        defacto_alphabet = Alphabet(defacto_alphabet)

        if alphabet:
            alphabet = Alphabet(alphabet)
            if not defacto_alphabet.alphabetic(alphabet):
                # Allow alphabet to be a superset of defacto_alphabet
                alphabet = defacto_alphabet
        else:
            candidates = (
                unambiguous_rna_alphabet,
                unambiguous_dna_alphabet,
                unambiguous_protein_alphabet,
            )
            for candidate in candidates:
                if defacto_alphabet.alphabetic(candidate):
                    alphabet = candidate
                    break
            if not alphabet:
                alphabet = defacto_alphabet

        # The last item of each row may be extra cruft. Remove
        if len(items[0]) == len(header) + 1:
            for row in items:
                row.pop()

        # items is now a list of lists of numbers (as strings)
        matrix = na.array(
            [[float(value) for value in row] for row in items],
            dtype=na.float64)

        if position_header:
            # transpose() returns a new view; it does not modify in place.
            matrix = matrix.transpose()

        return Motif(defacto_alphabet, matrix).reindex(alphabet)
示例#3
0
    def read_transfac(fin, alphabet=None):
        """Parse a TRANSFAC-style matrix from an iterable of text lines.

        Blank lines and lines starting with ``#`` are ignored. Everything
        before the first ``PO``/``P0`` line is skipped; reading stops at the
        first ``XX`` or ``//`` line after that.

        The ``PO``/``P0`` line is the header. Its remaining items must be
        either all alphabetic (one symbol per column, each data row then
        starts with a position) or all integers (one position per column,
        each data row then starts with a symbol). In the second layout the
        matrix is transposed so that rows are positions and columns are
        symbols, as in the first layout.

        Args:
            fin: Iterable yielding the lines of the file.
            alphabet: Optional alphabet to reindex the result to. When
                omitted, the unambiguous RNA, DNA and protein alphabets are
                tried in that order, falling back to the alphabet found in
                the file.

        Returns:
            ``Motif(defacto_alphabet, matrix).reindex(alphabet)``.

        Raises:
            ValueError: If fewer than two matrix lines are found, rows have
                inconsistent lengths, the header cannot be classified, a row
                label is of the wrong kind, or ``alphabet`` is rejected by
                ``defacto_alphabet.alphabetic``.
        """
        items = []
        started = False
        for line in fin:
            # Skip blank lines (including empty strings) and comments.
            if not line.strip() or line.startswith('#'):
                continue
            stuff = line.split()
            if not started:
                if stuff[0] not in ('PO', 'P0'):
                    continue
                started = True
            elif stuff[0] in ('XX', '//'):
                break
            items.append(stuff)
        if len(items) < 2:
            raise ValueError("Vacuous file.")

        # The first collected line is always the 'PO'/'P0' line (collection
        # only starts there), so it is the header; drop its leading tag.
        header = items.pop(0)[1:]

        # Do all data lines contain the same number of items?
        cols = len(items[0])
        for i, row in enumerate(items):
            if len(row) != cols:
                raise ValueError("Inconsistant length, row %d: " % i)

        # Vertical or horizontal arrangement?
        position_header = all(isint(h) for h in header)
        alphabet_header = all(str.isalpha(h) for h in header)

        if not position_header and not alphabet_header:
            raise ValueError("Can't parse header: %s" % str(header))

        if position_header and alphabet_header:
            raise ValueError("Can't parse header")

        # Check and strip the row labels; a label starting with 'P'
        # (e.g. a repeated 'PO' tag) is tolerated in either layout.
        if alphabet_header:
            for i, row in enumerate(items):
                if not isint(row[0]) and row[0][0] != 'P':
                    raise ValueError(
                        "Expected position as first item on line %d" % i)
                row.pop(0)
            defacto_alphabet = ''.join(header)
        else:
            symbols = []
            for i, row in enumerate(items):
                if not ischar(row[0]) and row[0][0] != 'P':
                    raise ValueError(
                        "Expected position as first item on line %d" % i)
                symbols.append(row.pop(0))
            defacto_alphabet = ''.join(symbols)

        # Check defacto_alphabet
        defacto_alphabet = Alphabet(defacto_alphabet)

        if alphabet:
            if not defacto_alphabet.alphabetic(alphabet):
                raise ValueError("Incompatible alphabets: %s , %s (defacto)" %
                                 (alphabet, defacto_alphabet))
        else:
            candidates = (
                unambiguous_rna_alphabet,
                unambiguous_dna_alphabet,
                unambiguous_protein_alphabet,
            )
            for candidate in candidates:
                if defacto_alphabet.alphabetic(candidate):
                    alphabet = candidate
                    break
            if not alphabet:
                alphabet = defacto_alphabet

        # The last item of each row may be extra cruft. Remove
        if len(items[0]) == len(header) + 1:
            for row in items:
                row.pop()

        # items is now a list of lists of numbers (as strings)
        matrix = na.array(
            [[float(value) for value in row] for row in items],
            dtype=na.float64)

        if position_header:
            # transpose() returns a new view; it does not modify in place.
            matrix = matrix.transpose()

        return Motif(defacto_alphabet, matrix).reindex(alphabet)
示例#4
0
    def read_swissRegulon(fin, alphabet=None):
        """Parse a SwissRegulon-style matrix from an iterable of text lines.

        Blank lines, lines starting with ``#`` and lines consisting solely
        of ``//`` are ignored. Everything before the first ``P0``/``PO``
        line is skipped. Only the first five whitespace-separated items of
        each collected line are kept.

        The ``P0``/``PO`` line is the header; its remaining items must all
        be alphabetic and name the matrix columns. Each data row starts with
        a position (an integer, or any token beginning with ``P``).

        Args:
            fin: Iterable yielding the lines of the file.
            alphabet: Optional alphabet to reindex the result to. When
                omitted, the unambiguous RNA, DNA and protein alphabets are
                tried in that order, falling back to the alphabet found in
                the file.

        Returns:
            ``Motif(defacto_alphabet, matrix).reindex(alphabet)``.

        Raises:
            ValueError: If fewer than two matrix lines are found, rows have
                inconsistent lengths, the header is not an alphabet header,
                a row does not start with a position, or ``alphabet`` is
                rejected by ``defacto_alphabet.alphabetic``.
        """
        items = []
        started = False
        for line in fin:
            # Skip blank lines (including empty strings), comments and
            # bare '//' separator lines.
            if (not line.strip() or line.startswith('#')
                    or line in ('//', '//\n')):
                continue
            stuff = line.split()
            if not started:
                if stuff[0] not in ('P0', 'PO'):
                    continue
                started = True
            items.append(stuff[0:5])
        if len(items) < 2:
            raise ValueError("Vacuous file.")

        # The first collected line is always the 'P0'/'PO' line (collection
        # only starts there), so it is the header; drop its leading tag.
        header = items.pop(0)[1:]

        # Do all data lines contain the same number of items?
        cols = len(items[0])
        for i, row in enumerate(items):
            if len(row) != cols:
                raise ValueError("Inconsistant length, row %d: " % i)

        # Classify the header.
        position_header = all(isint(h) for h in header)
        alphabet_header = all(str.isalpha(h) for h in header)

        if not position_header and not alphabet_header:
            raise ValueError("Can't parse header: %s" % str(header))

        if position_header and alphabet_header:
            raise ValueError("Can't parse header")

        if not alphabet_header:
            # Only the alphabet-header layout is handled here; previously
            # this case fell through and failed on an unbound variable.
            raise ValueError(
                "Expected an alphabet header, got positions: %s"
                % str(header))

        # Check and strip the row labels.
        for i, row in enumerate(items):
            if not isint(row[0]) and row[0][0] != 'P':
                raise ValueError(
                    "Expected position as first item on line %d" % i)
            row.pop(0)

        # Check defacto_alphabet
        defacto_alphabet = Alphabet(''.join(header))
        if alphabet:
            if not defacto_alphabet.alphabetic(alphabet):
                raise ValueError("Incompatible alphabets: %s , %s (defacto)"
                                 % (alphabet, defacto_alphabet))
        else:
            candidates = (
                unambiguous_rna_alphabet,
                unambiguous_dna_alphabet,
                unambiguous_protein_alphabet,
            )
            for candidate in candidates:
                if defacto_alphabet.alphabetic(candidate):
                    alphabet = candidate
                    break
            if not alphabet:
                alphabet = defacto_alphabet

        # items is now a list of lists of numbers (as strings). Rows are
        # positions and columns are symbols, so no transpose is needed.
        matrix = na.array(
            [[float(value) for value in row] for row in items],
            dtype=na.float64)

        return Motif(defacto_alphabet, matrix).reindex(alphabet)