示例#1
0
def match(args):
    """
    Report the closest database motif for every motif in ``args.pwmfile``.

    Motifs are read from ``args.pwmfile`` and ``args.dbpwmfile`` and compared
    with ``MotifComparer`` ("partial" match, "wic" metric, "mean" combine).
    One tab-separated line per query motif (name, best match, score,
    p-value) is printed to stdout. When ``args.img`` is set, every pair is
    oriented and padded as reported by ``compare_motifs`` and all pairs are
    handed to ``match_plot`` in a single call.

    Parameters
    ----------
    args : object
        Must provide the attributes ``pwmfile``, ``dbpwmfile`` and ``img``.
    """
    sample = {m.id: m for m in pwmfile_to_motifs(args.pwmfile)}
    db = {m.id: m for m in pwmfile_to_motifs(args.dbpwmfile)}

    mc = MotifComparer()
    result = mc.get_closest_match(sample.values(), db.values(), "partial", "wic", "mean")

    plotdata = []

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Motif\tMatch\tScore\tP-value")
    for query, best in result.items():
        motif = sample[query]
        dbmotif = db[best[0]]
        # One comparison per pair serves both the report line and the plot.
        pval, pos, orient = mc.compare_motifs(motif, dbmotif, "partial", "wic", "mean", pval=True)
        print("%s\t%s\t%0.2f\t%0.3e" % (query, best[0], best[1][0], pval))

        if not args.img:
            continue

        # An orientation of -1 means the database motif is plotted as its
        # reverse complement; rc() returns a new object, so restore the id.
        if orient == -1:
            kept_id = dbmotif.id
            dbmotif = dbmotif.rc()
            dbmotif.id = kept_id

        # Prepend |pos| uniform columns to the motif the offset applies to.
        if pos < 0:
            kept_id = motif.id
            motif = Motif([[0.25, 0.25, 0.25, 0.25]] * -pos + motif.pwm)
            motif.id = kept_id
        elif pos > 0:
            kept_id = dbmotif.id
            dbmotif = Motif([[0.25, 0.25, 0.25, 0.25]] * pos + dbmotif.pwm)
            dbmotif.id = kept_id

        plotdata.append((motif, dbmotif, pval))

    # Draw once, after all pairs are collected, instead of re-rendering the
    # growing figure on every iteration. Nothing is drawn without matches,
    # which is what the per-iteration call did as well.
    if args.img and plotdata:
        match_plot(plotdata, args.img)
示例#2
0
def match(args):
    """
    Report the closest database motif for every motif in ``args.pwmfile``.

    Motifs are read from ``args.pwmfile`` and ``args.dbpwmfile`` and compared
    with ``MotifComparer`` ("partial" match, "wic" metric, "mean" combine).
    One tab-separated line per query motif (name, best match, score,
    p-value) is printed to stdout. When ``args.img`` is set, every pair is
    oriented and padded as reported by ``compare_motifs`` and all pairs are
    handed to ``match_plot`` in a single call.

    Parameters
    ----------
    args : object
        Must provide the attributes ``pwmfile``, ``dbpwmfile`` and ``img``.
    """
    sample = {m.id: m for m in pwmfile_to_motifs(args.pwmfile)}
    db = {m.id: m for m in pwmfile_to_motifs(args.dbpwmfile)}

    mc = MotifComparer()
    result = mc.get_closest_match(sample.values(), db.values(), "partial", "wic", "mean")

    plotdata = []

    print("Motif\tMatch\tScore\tP-value")
    for query, best in result.items():
        motif = sample[query]
        dbmotif = db[best[0]]
        # One comparison per pair serves both the report line and the plot.
        pval, pos, orient = mc.compare_motifs(motif, dbmotif, "partial", "wic", "mean", pval=True)
        print("%s\t%s\t%0.2f\t%0.3e" % (query, best[0], best[1][0], pval))

        if not args.img:
            continue

        # An orientation of -1 means the database motif is plotted as its
        # reverse complement; rc() returns a new object, so restore the id.
        if orient == -1:
            kept_id = dbmotif.id
            dbmotif = dbmotif.rc()
            dbmotif.id = kept_id

        # Prepend |pos| uniform columns to the motif the offset applies to.
        if pos < 0:
            kept_id = motif.id
            motif = Motif([[0.25, 0.25, 0.25, 0.25]] * -pos + motif.pwm)
            motif.id = kept_id
        elif pos > 0:
            kept_id = dbmotif.id
            dbmotif = Motif([[0.25, 0.25, 0.25, 0.25]] * pos + dbmotif.pwm)
            dbmotif.id = kept_id

        plotdata.append((motif, dbmotif, pval))

    # Draw once, after all pairs are collected, instead of re-rendering the
    # growing figure on every iteration. Nothing is drawn without matches,
    # which is what the per-iteration call did as well.
    if args.img and plotdata:
        match_plot(plotdata, args.img)
示例#3
0
    def parse(self, fo):
        """
        Convert BioProspector output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing BioProspector output.

        Returns
        -------
        motifs : list
            List of Motif instances, one per "Motif #" section that has at
            least one matrix row.
        """
        row_re = re.compile(
            r'^\d+\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)')

        def finish(rows, label):
            # Wrap collected rows in a Motif named after its width and number
            motif = Motif(rows)
            motif.id = "BioProspector_w%s_%s" % (len(motif), label)
            return motif

        found = []
        rows = []
        label = ""
        for text in fo.readlines():
            if not text.startswith("Motif #"):
                hit = row_re.search(text)
                if hit:
                    # The four captured values are divided by 100
                    # (presumably percentages in the tool output)
                    rows.append(
                        [float(value) / 100.0 for value in hit.groups()])
                continue

            # A new "Motif #" header closes the motif collected so far
            if rows:
                found.append(finish(rows, label))
            label = text.split("#")[1].split(":")[0]
            rows = []

        # The last motif has no following header to close it
        if rows:
            found.append(finish(rows, label))
        return found
示例#4
0
    def test_motif_export_import(self):
        """Export a motif as text and read it back, checking consensus and id.

        The motevo export is read back with the "transfac" reader, exactly as
        the transfac export is.
        """
        counts = [
            [120, 0, 0, 0],
            [120, 0, 0, 0],
            [0, 60, 60, 0],
            [0, 0, 0, 120],
            [0, 0, 0, 120],
        ]
        original = Motif(counts)
        original.id = "test_motif"

        # (exporter, reader format) pairs, exercised in this order
        round_trips = (
            (original.to_transfac, "transfac"),
            (original.to_meme, "meme"),
            (original.to_motevo, "transfac"),
        )
        for export, fmt in round_trips:
            handle = StringIO(export())
            restored = read_motifs(handle, fmt=fmt)[0]
            self.assertEqual("AASTT", restored.to_consensus().upper())
            self.assertEqual("test_motif", restored.id)
示例#5
0
    def parse(self, fo):
        """
        Convert ChIPMunk output to motifs

        The four consecutive lines starting at the first line that begins
        with "A|" are read as matrix rows (text after the "|", split on
        single spaces) and transposed into one row per position.

        Parameters
        ----------
        fo : file-like
            File object containing ChIPMunk output.

        Returns
        -------
        motifs : list
            List with a single Motif instance, or an empty list when the
            input is empty or contains no line starting with "A|".
        """
        line = fo.readline()
        # readline() returns "" at end of file, so also stop on an empty
        # line value; otherwise a file without an "A|" row loops forever.
        while line and not line.startswith("A|"):
            line = fo.readline()
        if not line:
            return []

        matrix = []
        for _ in range(4):
            matrix.append(
                [float(x) for x in line.strip().split("|")[1].split(" ")])
            line = fo.readline()

        # Transpose: rows per nucleotide -> rows per position
        matrix = [[matrix[x][y] for x in range(4)]
                  for y in range(len(matrix[0]))]
        m = Motif(matrix)
        m.id = "ChIPMunk_w%s" % len(m)
        return [m]
示例#6
0
    def parse_out(self, fo):
        """
        Convert tab-separated site output to motifs

        Every non-comment line is split on tabs; the ninth column must hold
        exactly two ``key "value"`` attributes separated by ";" (motif id and
        site sequence). Sites containing "N" are skipped. Sites are grouped
        per motif id and counted into a position frequency matrix.

        Parameters
        ----------
        fo : file-like
            File object containing the site output.

        Returns
        -------
        motifs : list
            List of Motif instances with ``align``, ``pwm``, ``pfm`` and
            ``id`` set.
        """
        motifs = []
        nucs = {"A": 0, "C": 1, "G": 2, "T": 3}
        pseudo = 0.0  # Should be 1/sqrt(# of seqs)
        aligns = {}
        for line in fo.readlines():
            # Skip comment lines and lines that hold only a newline
            if line.startswith("#") or len(line) <= 1:
                continue
            vals = line.strip().split("\t")
            m_id, site = [x.strip().split(" ")[1].replace('"', "") for x in vals[8].split(";") if x]
            if "N" not in site.upper():
                aligns.setdefault(m_id, []).append(site)

        for m_id, align in aligns.items():
            width = len(align[0])
            pfm = [[0 for _ in range(4)] for _ in range(width)]
            for row in align:
                for i, nuc in enumerate(row):
                    # The N filter above is case-insensitive, so count
                    # lower-case nucleotides too instead of raising KeyError.
                    pfm[i][nucs[nuc.upper()]] += 1
            total = float(len(align))
            # The pseudocount belongs in the denominator; with pseudo == 0.0
            # the result is unchanged.
            pwm = [[(x + pseudo / 4) / (total + pseudo) for x in row] for row in pfm]
            m = Motif()
            m.align = align[:]
            m.pwm = pwm[:]
            m.pfm = pfm[:]
            m.id = m_id
            motifs.append(m)
        return motifs
示例#7
0
def match(args):
    """
    Report the best database matches for every motif in ``args.pfmfile``.

    Motifs are read from ``args.pfmfile`` and ``args.dbpfmfile``; for each
    query motif ``args.nmatches`` is passed to
    ``MotifComparer.get_best_matches`` ("partial" match, "seqcor" metric,
    "mean" combine). One tab-separated line per match (name, match, score,
    p-value) is printed to stdout. When ``args.img`` is set, every pair is
    oriented, offset and padded to equal length, then all pairs are passed to
    ``match_plot``.

    Parameters
    ----------
    args : object
        Must provide the attributes ``pfmfile``, ``dbpfmfile``, ``nmatches``
        and ``img``.
    """
    sample = {m.id: m for m in read_motifs(args.pfmfile)}
    db = {m.id: m for m in read_motifs(args.dbpfmfile)}

    mc = MotifComparer()
    result = mc.get_best_matches(
        sample.values(), args.nmatches, db.values(), "partial", "seqcor", "mean"
    )

    def padding(n):
        # n uniform columns; a fresh list per call so motifs share no rows
        return [[0.25, 0.25, 0.25, 0.25]] * n

    plotdata = []
    print("Motif\tMatch\tScore\tP-value")
    for motif_name, matches in result.items():
        for hit in matches:
            motif = sample[motif_name]
            dbmotif = db[hit[0]]

            pval, pos, orient = mc.compare_motifs(
                motif, dbmotif, "partial", "seqcor", "mean", pval=True
            )
            print("%s\t%s\t%0.2f\t%0.3e" % (motif_name, hit[0], hit[1][0], pval))

            if not args.img:
                continue

            # Every step below builds a new Motif, so the id is copied over
            # each time.
            if orient == -1:
                kept_id = dbmotif.id
                dbmotif = dbmotif.rc()
                dbmotif.id = kept_id

            if pos < 0:
                kept_id = motif.id
                motif = Motif(padding(-pos) + motif.pwm)
                motif.id = kept_id
            elif pos > 0:
                kept_id = dbmotif.id
                dbmotif = Motif(padding(pos) + dbmotif.pwm)
                dbmotif.id = kept_id

            # Right-pad the shorter motif so both have the same length. The
            # id must survive this step as well; previously it was dropped
            # here, undoing the id bookkeeping above.
            diff = len(motif) - len(dbmotif)
            if diff > 0:
                kept_id = dbmotif.id
                dbmotif = Motif(dbmotif.pwm + padding(diff))
                dbmotif.id = kept_id
            else:
                kept_id = motif.id
                motif = Motif(motif.pwm + padding(-diff))
                motif.id = kept_id

            plotdata.append((motif, dbmotif, pval))

    if args.img:
        match_plot(plotdata, args.img)
示例#8
0
    def parse(self, fname):
        """
        Convert RPMCMC output to motifs

        Lines starting with "PFM" and the literal header "A C G T" are
        ignored. A line starting with "Motif" begins a new motif and becomes
        its id. Any other line that splits into exactly four space-separated
        fields is read as one matrix row.

        Parameters
        ----------
        fname : str
            File containing RPMCMC output.

        Returns
        -------
        motifs : list
            List of Motif instances; motifs without any matrix row are not
            included.
        """
        motifs = []
        pfm = []
        name = ""
        # Context manager so the file handle is closed even on a parse error
        with open(fname) as handle:
            for line in handle:
                line = line.strip()
                if line.startswith("PFM"):
                    continue
                if line.startswith("Motif"):
                    if pfm:
                        motif = Motif(pfm)
                        motif.id = name
                        motifs.append(motif)
                    name = line
                    pfm = []
                elif line != "A C G T":
                    row = line.split(" ")
                    if len(row) == 4:
                        pfm.append([float(x) for x in row])

        # Same guard as inside the loop: do not emit an empty motif when the
        # file is empty or ends right after a "Motif" header.
        if pfm:
            motif = Motif(pfm)
            motif.id = name
            motifs.append(motif)

        return motifs
示例#9
0
    def parse_out(self, fo):
        """
        Convert MotifSampler output to motifs

        Every non-comment line is split on tabs; the ninth column must hold
        exactly two ``key "value"`` attributes separated by ";" (motif id and
        site sequence). Sites containing "N" are skipped. Sites are grouped
        per motif id and counted into a position frequency matrix.

        Parameters
        ----------
        fo : file-like
            File object containing MotifSampler output.

        Returns
        -------
        motifs : list
            List of Motif instances with ``align``, ``pwm``, ``pfm`` and
            ``id`` set.
        """
        motifs = []
        nucs = {"A": 0, "C": 1, "G": 2, "T": 3}
        pseudo = 0.0  # Should be 1/sqrt(# of seqs)
        aligns = {}
        for line in fo.readlines():
            # Skip comment lines and lines that hold only a newline
            if line.startswith("#") or len(line) <= 1:
                continue
            vals = line.strip().split("\t")
            m_id, site = [
                x.strip().split(" ")[1].replace('"', "")
                for x in vals[8].split(";")
                if x
            ]
            if "N" not in site.upper():
                aligns.setdefault(m_id, []).append(site)

        for m_id, align in aligns.items():
            width = len(align[0])
            pfm = [[0 for _ in range(4)] for _ in range(width)]
            for row in align:
                for i, nuc in enumerate(row):
                    # The N filter above is case-insensitive, so count
                    # lower-case nucleotides too instead of raising KeyError.
                    pfm[i][nucs[nuc.upper()]] += 1
            total = float(len(align))
            # The pseudocount belongs in the denominator; with pseudo == 0.0
            # the result is unchanged.
            pwm = [[(x + pseudo / 4) / (total + pseudo) for x in row] for row in pfm]
            m = Motif()
            m.align = align[:]
            m.pwm = pwm[:]
            m.pfm = pfm[:]
            m.id = m_id
            motifs.append(m)
        return motifs
示例#10
0
    def parse(self, fo):
        """
        Convert BioProspector output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing BioProspector output.

        Returns
        -------
        motifs : list
            List of Motif instances, one per "Motif #" section that has at
            least one matrix row.
        """
        row_re = re.compile(
            r"^\d+\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)")

        def finish(rows, label):
            # Wrap collected rows in a Motif named after its width and number
            motif = Motif(rows)
            motif.id = "BioProspector_w%s_%s" % (len(motif), label)
            return motif

        found = []
        rows = []
        label = ""
        for text in fo.readlines():
            if not text.startswith("Motif #"):
                hit = row_re.search(text)
                if hit:
                    # The four captured values are divided by 100
                    # (presumably percentages in the tool output)
                    rows.append(
                        [float(value) / 100.0 for value in hit.groups()])
                continue

            # A new "Motif #" header closes the motif collected so far
            if rows:
                found.append(finish(rows, label))
            label = text.split("#")[1].split(":")[0]
            rows = []

        # The last motif has no following header to close it
        if rows:
            found.append(finish(rows, label))
        return found
示例#11
0
    def parse(self, fo):
        """
        Convert BioProspector output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing BioProspector output.

        Returns
        -------
        motifs : list
            List of Motif instances, one per "Motif #" section that has at
            least one matrix row.
        """
        row_re = re.compile(r'^\d+\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)')

        def finish(rows, label):
            # Wrap collected rows in a Motif named after its width and number
            motif = Motif(rows)
            motif.id = "BioProspector_w%s_%s" % (len(motif), label)
            return motif

        found = []
        rows = []
        label = ""
        for text in fo.readlines():
            if not text.startswith("Motif #"):
                hit = row_re.search(text)
                if hit:
                    # The four captured values are divided by 100
                    # (presumably percentages in the tool output)
                    rows.append([float(value) / 100.0 for value in hit.groups()])
                continue

            # A new "Motif #" header closes the motif collected so far
            if rows:
                found.append(finish(rows, label))
            label = text.split("#")[1].split(":")[0]
            rows = []

        # The last motif has no following header to close it
        if rows:
            found.append(finish(rows, label))
        return found
示例#12
0
    def parse(self, fo):
        """
        Convert output made of 6-line records to motifs

        Each record is read as six lines: the last four hold tab-separated
        matrix rows, which are transposed into one row per position. All
        motifs are finally named "<self.name>_<number>" and trimmed with a
        cutoff of 0.25.

        Parameters
        ----------
        fo : file-like
            File object containing the tool output.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        def next_record():
            # readline() yields "" at end of file, which ends the loop below
            return [fo.readline() for _ in range(6)]

        motifs = []
        record = next_record()
        while record[0]:
            rows = [[float(v) for v in text.strip().split("\t")] for text in record[2:]]
            columns = [[rows[n][pos] for n in range(4)] for pos in range(len(rows[0]))]
            motif = Motif(columns)
            motif.id = record[0].strip().split(" ")[-1]
            motifs.append(motif)
            record = next_record()

        # Final ids are sequential, replacing the ids read from the headers
        for number, motif in enumerate(motifs, 1):
            motif.id = "%s_%s" % (self.name, number)
            motif.trim(0.25)

        return motifs
示例#13
0
    def parse(self, fo):
        """
        Convert output made of 6-line records to motifs

        Each record is read as six lines: the last four hold tab-separated
        matrix rows, which are transposed into one row per position. All
        motifs are finally named "<self.name>_<number>" and trimmed with a
        cutoff of 0.25.

        Parameters
        ----------
        fo : file-like
            File object containing the tool output.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        def next_record():
            # readline() yields "" at end of file, which ends the loop below
            return [fo.readline() for _ in range(6)]

        motifs = []
        record = next_record()
        while record[0]:
            rows = [[float(v) for v in text.strip().split("\t")]
                    for text in record[2:]]
            columns = [[rows[n][pos] for n in range(4)]
                       for pos in range(len(rows[0]))]
            motif = Motif(columns)
            motif.id = record[0].strip().split(" ")[-1]
            motifs.append(motif)
            record = next_record()

        # Final ids are sequential, replacing the ids read from the headers
        for number, motif in enumerate(motifs, 1):
            motif.id = "%s_%s" % (self.name, number)
            motif.trim(0.25)

        return motifs
示例#14
0
 def parse(self, fo):
     """
     Convert ChIPMunk output to motifs

     The four consecutive lines starting at the first line that begins with
     "A|" are read as matrix rows (text after the "|", split on single
     spaces) and transposed into one row per position.

     Parameters
     ----------
     fo : file-like
         File object containing ChIPMunk output.

     Returns
     -------
     motifs : list
         List with a single Motif instance, or an empty list when the input
         contains no line starting with "A|".
     """
     # Example of the relevant part of the output:
     #KDIC|6.124756232026243
     #A|517.9999999999999 42.99999999999999 345.99999999999994 25.999999999999996 602.9999999999999 155.99999999999997 2.9999999999999996 91.99999999999999
     #C|5.999999999999999 4.999999999999999 2.9999999999999996 956.9999999999999 91.99999999999999 17.999999999999996 22.999999999999996 275.99999999999994
     #G|340.99999999999994 943.9999999999999 630.9999999999999 6.999999999999999 16.999999999999996 48.99999999999999 960.9999999999999 14.999999999999998
     #T|134.99999999999997 7.999999999999999 19.999999999999996 9.999999999999998 287.99999999999994 776.9999999999999 12.999999999999998 616.9999999999999
     #N|999.9999999999998
     line = fo.readline()
     # readline() returns "" at end of file, so also stop on an empty line
     # value; otherwise a file without an "A|" row loops forever.
     while line and not line.startswith("A|"):
         line = fo.readline()
     if not line:
         return []
     matrix = []
     for _ in range(4):
         matrix.append([float(x) for x in line.strip().split("|")[1].split(" ")])
         line = fo.readline()
     # Transpose: rows per nucleotide -> rows per position
     matrix = [[matrix[x][y] for x in range(4)] for y in range(len(matrix[0]))]
     m = Motif(matrix)
     m.id = "ChIPMunk_w%s" % len(m)
     return [m]
示例#15
0
 def parse(self, fo):
     """
     Convert ChIPMunk output to motifs

     The four consecutive lines starting at the first line that begins with
     "A|" are read as matrix rows (text after the "|", split on single
     spaces) and transposed into one row per position.

     Parameters
     ----------
     fo : file-like
         File object containing ChIPMunk output.

     Returns
     -------
     motifs : list
         List with a single Motif instance, or an empty list when the input
         contains no line starting with "A|".
     """
     # Example of the relevant part of the output:
     #KDIC|6.124756232026243
     #A|517.9999999999999 42.99999999999999 345.99999999999994 25.999999999999996 602.9999999999999 155.99999999999997 2.9999999999999996 91.99999999999999
     #C|5.999999999999999 4.999999999999999 2.9999999999999996 956.9999999999999 91.99999999999999 17.999999999999996 22.999999999999996 275.99999999999994
     #G|340.99999999999994 943.9999999999999 630.9999999999999 6.999999999999999 16.999999999999996 48.99999999999999 960.9999999999999 14.999999999999998
     #T|134.99999999999997 7.999999999999999 19.999999999999996 9.999999999999998 287.99999999999994 776.9999999999999 12.999999999999998 616.9999999999999
     #N|999.9999999999998
     line = fo.readline()
     # readline() returns "" at end of file, so also stop on an empty line
     # value; otherwise a file without an "A|" row loops forever.
     while line and not line.startswith("A|"):
         line = fo.readline()
     if not line:
         return []
     matrix = []
     for _ in range(4):
         matrix.append(
             [float(x) for x in line.strip().split("|")[1].split(" ")])
         line = fo.readline()
     # Transpose: rows per nucleotide -> rows per position
     matrix = [[matrix[x][y] for x in range(4)]
               for y in range(len(matrix[0]))]
     m = Motif(matrix)
     m.id = "ChIPMunk_w%s" % len(m)
     return [m]
示例#16
0
    def parse(self, fo, width, seed=None):
        """
        Convert Posmo output to motifs

        Each record is read as six lines: the last four hold tab-separated
        matrix rows, which are transposed into one row per position. Every
        motif is trimmed with a cutoff of 0.1 when read and with 0.25 after
        it has received its final id.

        Parameters
        ----------
        fo : file-like
            File object containing Posmo output.
        width : object
            Value embedded in each motif id after "_w".
        seed : object, optional
            When truthy, embedded in each motif id after the width.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        def next_record():
            # readline() yields "" at end of file, which ends the loop below
            return [fo.readline() for _ in range(6)]

        motifs = []
        record = next_record()
        while record[0]:
            rows = [[float(v) for v in text.strip().split("\t")]
                    for text in record[2:]]
            columns = [[rows[n][pos] for n in range(4)]
                       for pos in range(len(rows[0]))]
            motif = Motif(columns)
            motif.trim(0.1)
            motif.id = record[0].strip().split(" ")[-1]
            motifs.append(motif)
            record = next_record()

        # Final ids are sequential, replacing the ids read from the headers
        for number, motif in enumerate(motifs, 1):
            motif.id = (
                "%s_w%s.%s_%s" % (self.name, width, seed, number)
                if seed
                else "%s_w%s_%s" % (self.name, width, number)
            )
            motif.trim(0.25)

        return motifs
示例#17
0
def cluster_motifs(motifs, match="total", metric="wic", combine="mean", pval=True, threshold=0.95, trim_edges=False, edge_ic_cutoff=0.2, include_bg=True, progress=True, ncpus=None):
    """
    Clusters a set of sequence motifs. Required arg 'motifs' is a file containing
    positional frequency matrices or a list with motifs.

    Optional args:

    'match', 'metric' and 'combine' specify the method used to compare and score
    the motifs. By default the WIC score is used (metric='wic'), using the
    score over the whole alignment (match='total'), with the total motif score
    calculated as the mean score of all positions (combine='mean').
    'match' can be either 'total' for the total alignment or 'subtotal' for the
    maximum scoring subsequence of the alignment.
    'metric' can be any metric defined in MotifComparer, currently: 'pcc', 'ed',
    'distance', 'wic' or 'chisq'
    'combine' determines how the total score is calculated from the score of
    individual positions and can be either 'sum' or 'mean'

    'pval' can be True or False and determines if the score should be converted to
    an empirical p-value

    'threshold' determines the score (or p-value) cutoff

    If 'trim_edges' is set to True, all motif edges with an IC below
    'edge_ic_cutoff' will be removed before clustering

    When computing the average of two motifs 'include_bg' determines if, at a
    position only present in one motif, the information in that motif should
    be kept, or if it should be averaged with background frequencies. Should
    probably be left set to True.

    If 'progress' is true, progress messages are written to stderr.

    'ncpus' is passed to every MotifComparer.get_all_scores call.

    Returns the root MotifTree node (the last node that was created).
    """
    # First read pfm or pfm formatted motiffile
    if not isinstance(motifs, list):
        motifs = read_motifs(motifs, fmt="pwm")

    mc = MotifComparer()

    # Trim edges with low information content
    if trim_edges:
        for motif in motifs:
            motif.trim(edge_ic_cutoff)

    # Make a MotifTree node for every motif
    nodes = [MotifTree(m) for m in motifs]

    # Determine all pairwise scores and maxscore per motif
    scores = {}
    motif_nodes = {n.motif.id: n for n in nodes}
    motifs = [n.motif for n in nodes]

    if progress:
        sys.stderr.write("Calculating initial scores\n")
    result = mc.get_all_scores(motifs, motifs, match, metric, combine, pval, parallel=True, ncpus=ncpus)

    for m1, other_motifs in result.items():
        for m2, score in other_motifs.items():
            if m1 == m2:
                # Self-comparison: the best score this motif can reach
                if pval:
                    motif_nodes[m1].maxscore = 1 - score[0]
                else:
                    motif_nodes[m1].maxscore = score[0]
            else:
                # p-values are stored as 1 - p so that higher is always better
                if pval:
                    score = [1 - score[0]] + score[1:]
                scores[(motif_nodes[m1], motif_nodes[m2])] = score

    cluster_nodes = list(nodes)
    ave_count = 1

    total = len(cluster_nodes)

    while len(cluster_nodes) > 1:
        # Best-scoring pair whose members are both still unmerged
        ranked = sorted(scores.keys(), key=lambda x: scores[x][0])
        i = -1
        (n1, n2) = ranked[i]
        while n1 not in cluster_nodes or n2 not in cluster_nodes:
            i -= 1
            (n1, n2) = ranked[i]

        # NOTE(review): if either motif is empty nothing is merged and
        # cluster_nodes does not shrink, so this loop would not terminate;
        # confirm that empty motifs cannot reach this point.
        if len(n1.motif) > 0 and len(n2.motif) > 0:
            (score, pos, orientation) = scores[(n1, n2)]
            ave_motif = n1.motif.average_motifs(n2.motif, pos, orientation, include_bg=include_bg)

            ave_motif.trim(edge_ic_cutoff)

            # Check if the motif is not empty
            if len(ave_motif) == 0:
                ave_motif = Motif([[0.25, 0.25, 0.25, 0.25]])

            ave_motif.id = "Average_%s" % ave_count
            ave_count += 1

            new_node = MotifTree(ave_motif)
            self_score = mc.compare_motifs(new_node.motif, new_node.motif, match, metric, combine, pval)[0]
            new_node.maxscore = 1 - self_score if pval else self_score

            new_node.mergescore = score
            n1.parent = new_node
            n2.parent = new_node
            new_node.left = n1
            new_node.right = n2

            # Remaining unmerged nodes (new_node is not in `nodes` yet)
            cmp_nodes = {node.motif: node for node in nodes if not node.parent}

            if progress:
                # Separate name: do not overwrite the `progress` flag
                pct = (1 - len(cmp_nodes) / float(total)) * 100
                sys.stderr.write('\rClustering [{0}{1}] {2}%'.format(
                    '#' * (int(pct) // 10),
                    " " * (10 - int(pct) // 10),
                    int(pct)))

            # Score the new average motif against all remaining nodes,
            # honouring the caller's ncpus like the initial scoring does.
            result = mc.get_all_scores(
                    [new_node.motif],
                    list(cmp_nodes.keys()),
                    match,
                    metric,
                    combine,
                    pval,
                    parallel=True,
                    ncpus=ncpus)

            for motif, n in cmp_nodes.items():
                x = result[new_node.motif.id][motif.id]
                if pval:
                    x = [1 - x[0]] + x[1:]
                scores[(new_node, n)] = x

            nodes.append(new_node)

        cluster_nodes = [node for node in nodes if not node.parent]

    if progress:
        sys.stderr.write("\n")
    root = nodes[-1]
    for node in [node for node in nodes if not node.left]:
        # With a single input motif the only leaf is the root and has no
        # parent to check (assumes MotifTree.parent is None until assigned).
        if node.parent is not None:
            node.parent.checkMerge(root, threshold)

    return root
示例#18
0
文件: motifs.py 项目: loosolab/TOBIAS
    def create_consensus(self, metric="pcc"):
        """ Create consensus motif from MotifList

        The two motifs picked by find_best_pair are merged repeatedly until a
        single motif is left. Its pwm is rounded to 5 decimals, converted back
        with gimmemotif_to_onemotif and named after the first three member
        names (followed by "(...)" when there are more).
        """

        from gimmemotifs.motif import Motif
        from gimmemotifs.comparison import MotifComparer

        # Fill in gimme_obj where it is missing
        members = []
        for entry in self:
            if entry.gimme_obj is None:
                members.append(entry.get_gimmemotif())
            else:
                members.append(entry)

        # gimmemotifs objects that are merged pairwise below
        motif_list = [member.gimme_obj for member in members]

        if len(motif_list) > 1:
            mc = MotifComparer()

            # Initial all-against-all scores
            score_dict = mc.get_all_scores(
                motif_list, motif_list,
                match="total", metric=metric, combine="mean")

            while True:
                # Indices of the pair to merge, lowest first
                pair = sorted(find_best_pair(motif_list, score_dict))
                keep, drop = pair[0], pair[1]

                merged = merge_motifs(
                    motif_list[keep], motif_list[drop], metric=metric)

                # Deleting the higher index first keeps `keep` valid
                del motif_list[drop]
                motif_list[keep] = merged

                if len(motif_list) == 1:
                    break

                # Score the merged motif against everything that is left
                score_dict[merged.id] = score_dict.get(merged.id, {})
                for other in motif_list:
                    score_dict[merged.id][other.id] = mc.compare_motifs(
                        merged, other, metric=metric)
                    score_dict[other.id][merged.id] = mc.compare_motifs(
                        other, merged, metric=metric)

        # New gimmemotifs object holding the rounded pwm values
        survivor = motif_list[0]
        survivor_id = survivor.id
        rounded_rows = [[round(value, 5) for value in row]
                        for row in survivor.pwm]
        gimmemotif_consensus = Motif(rounded_rows)
        gimmemotif_consensus.id = survivor_id

        # Convert back to OneMotif obj
        onemotif_consensus = gimmemotif_to_onemotif(gimmemotif_consensus)
        onemotif_consensus.gimme_obj = gimmemotif_consensus

        # Name: first three member names, "(...)" appended if there are more
        member_names = [member.name for member in members]
        label = ",".join(member_names[:3])
        if len(member_names) > 3:
            label += "(...)"
        onemotif_consensus.name = label

        return onemotif_consensus
示例#19
0
def cluster_motifs(motifs,
                   match="total",
                   metric="wic",
                   combine="mean",
                   pval=True,
                   threshold=0.95,
                   trim_edges=False,
                   edge_ic_cutoff=0.2,
                   include_bg=True,
                   progress=True,
                   ncpus=None):
    """
    Clusters a set of sequence motifs. Required arg 'motifs' is a file containing
    positional frequency matrices or a list with motifs.

    Optional args:

    'match', 'metric' and 'combine' specify the method used to compare and score
    the motifs. By default the WIC score is used (metric='wic'), using the
    score over the whole alignment (match='total'), with the total motif score
    calculated as the mean score of all positions (combine='mean').
    'match' can be either 'total' for the total alignment or 'subtotal' for the
    maximum scoring subsequence of the alignment.
    'metric' can be any metric defined in MotifComparer, currently: 'pcc', 'ed',
    'distance', 'wic' or 'chisq'
    'combine' determines how the total score is calculated from the score of
    individual positions and can be either 'sum' or 'mean'

    'pval' can be True or False and determines if the score should be converted to
    an empirical p-value

    'threshold' determines the score (or p-value) cutoff

    If 'trim_edges' is set to True, all motif edges with an IC below
    'edge_ic_cutoff' will be removed before clustering

    When computing the average of two motifs 'include_bg' determines if, at a
    position only present in one motif, the information in that motif should
    be kept, or if it should be averaged with background frequencies. Should
    probably be left set to True.

    If 'progress' is true, progress messages are written to stderr.

    'ncpus' is passed to every MotifComparer.get_all_scores call.

    Returns the root MotifTree node (the last node that was created).
    """

    # First read pfm or pfm formatted motiffile
    if not isinstance(motifs, list):
        motifs = read_motifs(motifs, fmt="pwm")

    mc = MotifComparer()

    # Trim edges with low information content
    if trim_edges:
        for motif in motifs:
            motif.trim(edge_ic_cutoff)

    # Make a MotifTree node for every motif
    nodes = [MotifTree(m) for m in motifs]

    # Determine all pairwise scores and maxscore per motif
    scores = {}
    motif_nodes = {n.motif.id: n for n in nodes}
    motifs = [n.motif for n in nodes]

    if progress:
        sys.stderr.write("Calculating initial scores\n")
    result = mc.get_all_scores(motifs,
                               motifs,
                               match,
                               metric,
                               combine,
                               pval,
                               parallel=True,
                               ncpus=ncpus)

    for m1, other_motifs in result.items():
        for m2, score in other_motifs.items():
            if m1 == m2:
                # Self-comparison: the best score this motif can reach
                if pval:
                    motif_nodes[m1].maxscore = 1 - score[0]
                else:
                    motif_nodes[m1].maxscore = score[0]
            else:
                # p-values are stored as 1 - p so that higher is always better
                if pval:
                    score = [1 - score[0]] + score[1:]
                scores[(motif_nodes[m1], motif_nodes[m2])] = score

    cluster_nodes = list(nodes)
    ave_count = 1

    total = len(cluster_nodes)

    while len(cluster_nodes) > 1:
        # Best-scoring pair whose members are both still unmerged
        ranked = sorted(scores.keys(), key=lambda x: scores[x][0])
        i = -1
        (n1, n2) = ranked[i]
        while n1 not in cluster_nodes or n2 not in cluster_nodes:
            i -= 1
            (n1, n2) = ranked[i]

        # NOTE(review): if either motif is empty nothing is merged and
        # cluster_nodes does not shrink, so this loop would not terminate;
        # confirm that empty motifs cannot reach this point.
        if len(n1.motif) > 0 and len(n2.motif) > 0:
            (score, pos, orientation) = scores[(n1, n2)]
            ave_motif = n1.motif.average_motifs(n2.motif,
                                                pos,
                                                orientation,
                                                include_bg=include_bg)

            ave_motif.trim(edge_ic_cutoff)

            # Check if the motif is not empty
            if len(ave_motif) == 0:
                ave_motif = Motif([[0.25, 0.25, 0.25, 0.25]])

            ave_motif.id = "Average_%s" % ave_count
            ave_count += 1

            new_node = MotifTree(ave_motif)
            self_score = mc.compare_motifs(new_node.motif, new_node.motif,
                                           match, metric, combine, pval)[0]
            new_node.maxscore = 1 - self_score if pval else self_score

            new_node.mergescore = score
            n1.parent = new_node
            n2.parent = new_node
            new_node.left = n1
            new_node.right = n2

            # Remaining unmerged nodes (new_node is not in `nodes` yet)
            cmp_nodes = {node.motif: node
                         for node in nodes
                         if not node.parent}

            if progress:
                # Separate name: do not overwrite the `progress` flag
                pct = (1 - len(cmp_nodes) / float(total)) * 100
                sys.stderr.write('\rClustering [{0}{1}] {2}%'.format(
                    '#' * (int(pct) // 10),
                    " " * (10 - int(pct) // 10), int(pct)))

            # Score the new average motif against all remaining nodes,
            # honouring the caller's ncpus like the initial scoring does.
            result = mc.get_all_scores([new_node.motif],
                                       list(cmp_nodes.keys()),
                                       match,
                                       metric,
                                       combine,
                                       pval,
                                       parallel=True,
                                       ncpus=ncpus)

            for motif, n in cmp_nodes.items():
                x = result[new_node.motif.id][motif.id]
                if pval:
                    x = [1 - x[0]] + x[1:]
                scores[(new_node, n)] = x

            nodes.append(new_node)

        cluster_nodes = [node for node in nodes if not node.parent]

    if progress:
        sys.stderr.write("\n")
    root = nodes[-1]
    for node in [node for node in nodes if not node.left]:
        # With a single input motif the only leaf is the root and has no
        # parent to check (assumes MotifTree.parent is None until assigned).
        if node.parent is not None:
            node.parent.checkMerge(root, threshold)

    return root