Пример #1
0
    def parse(self, fo):
        """
        Convert ChIPMunk output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing ChIPMunk output.

        Returns
        -------
        motifs : list
            List of Motif instances. Empty when the output contains no
            line starting with ``A|``.
        """
        # Skip ahead to the first matrix row. readline() returns "" at EOF,
        # so stop there instead of looping forever on output without a
        # matrix (e.g. a failed or truncated run).
        line = fo.readline()
        while line and not line.startswith("A|"):
            line = fo.readline()
        if not line:
            return []

        # The "A|" row and the three rows that follow it hold the values
        # per position, one row per nucleotide.
        rows = []
        for _ in range(4):
            rows.append(
                [float(x) for x in line.strip().split("|")[1].split(" ")])
            line = fo.readline()

        # Transpose from one row per nucleotide to one row per position.
        matrix = [[rows[nuc][pos] for nuc in range(4)]
                  for pos in range(len(rows[0]))]
        m = Motif(matrix)
        m.id = "ChIPMunk_w%s" % len(m)
        return [m]
Пример #2
0
    def parse_out(self, fo):
        """
        Build motifs from the aligned sites listed in a tab-separated file.

        Lines starting with "#" and empty lines are skipped. For every other
        line the ninth tab-separated column is read as two ``key "value"``
        pairs separated by ";": the motif id and the site sequence. Sites
        containing "N" are ignored. The remaining sites are grouped per id
        and counted into a frequency matrix.

        Returns a list of Motif instances with align, pwm, pfm and id set.
        """
        nucs = {"A": 0, "C": 1, "G": 2, "T": 3}
        pseudo = 0.0  # Should be 1/sqrt(# of seqs)

        aligns = {}
        for line in fo.readlines():
            if line.startswith("#") or len(line) <= 1:
                continue
            attributes = line.strip().split("\t")[8]
            motif_id, site = [
                field.strip().split(" ")[1].replace('"', "")
                for field in attributes.split(";")
                if field
            ]
            if "N" not in site.upper():
                aligns.setdefault(motif_id, []).append(site)

        motifs = []
        for motif_id, sites in aligns.items():
            # One [A, C, G, T] count row per position of the first site.
            pfm = [[0] * 4 for _ in range(len(sites[0]))]
            for site in sites:
                for pos, nuc in enumerate(site):
                    pfm[pos][nucs[nuc]] += 1
            total = float(len(sites))
            pwm = [[(count + pseudo / 4) / total + (pseudo) for count in row]
                   for row in pfm]

            m = Motif()
            m.align = sites[:]
            m.pwm = pwm[:]
            m.pfm = pfm[:]
            m.id = motif_id
            motifs.append(m)
        return motifs
Пример #3
0
def match(args):
    """
    Report, and optionally plot, the closest database match of each motif.

    Query motifs are read from ``args.pwmfile`` and database motifs from
    ``args.dbpwmfile``. A tab-separated table (motif, match, score, p-value)
    is printed to stdout. When ``args.img`` is set, the aligned motif pairs
    are passed to ``match_plot`` once, after all pairs have been collected.
    """
    sample = {m.id: m for m in pwmfile_to_motifs(args.pwmfile)}
    db = {m.id: m for m in pwmfile_to_motifs(args.dbpwmfile)}

    mc = MotifComparer()
    result = mc.get_closest_match(sample.values(), db.values(), "partial", "wic", "mean")

    plotdata = []
    # Single-argument print(...) behaves the same as a statement on
    # Python 2 and as a function call on Python 3.
    print("Motif\tMatch\tScore\tP-value")
    for query, match in result.items():
        motif = sample[query]
        dbmotif = db[match[0]]
        # One comparison serves both the table row and the plot alignment.
        pval, pos, orient = mc.compare_motifs(motif, dbmotif, "partial", "wic", "mean", pval=True)
        print("%s\t%s\t%0.2f\t%0.3e" % (query, match[0], match[1][0], pval))

        if not args.img:
            continue

        if orient == -1:
            # Plot the reverse complement but keep the database id.
            tmp = dbmotif.id
            dbmotif = dbmotif.rc()
            dbmotif.id = tmp

        # Left-pad whichever motif starts later with uniform positions.
        if pos < 0:
            tmp = motif.id
            motif = Motif([[0.25, 0.25, 0.25, 0.25]] * -pos + motif.pwm)
            motif.id = tmp
        elif pos > 0:
            tmp = dbmotif.id
            dbmotif = Motif([[0.25, 0.25, 0.25, 0.25]] * pos + dbmotif.pwm)
            dbmotif.id = tmp

        plotdata.append((motif, dbmotif, pval))

    # Plot once with the complete data set instead of re-plotting after
    # every pair; skip the call when there is nothing to plot.
    if args.img and plotdata:
        match_plot(plotdata, args.img)
Пример #4
0
    def test6_pcc(self):
        # Two motifs built from identical four-position matrices are
        # expected to give a max_pcc score of 4.
        counts = [[5, 0, 0, 0], [0, 5, 0, 0], [0, 5, 0, 0], [0, 0, 0, 5]]

        first = Motif(counts)
        second = Motif([list(row) for row in counts])

        score = first.max_pcc(second)[0]
        self.assertEqual(4, score)
Пример #5
0
    def parse(self, fo):
        """
        Convert MDmodule output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing MDmodule output.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        nucs = {"A": 0, "C": 1, "G": 2, "T": 3}
        # A position index followed by four decimal numbers.
        freq_row = re.compile(
            r"(\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)")
        # A ">" line ending in the site sequence.
        site_row = re.compile(r">.+\s+[bf]\d+\s+(\w+)")

        motifs = []
        m_id = ""
        pwm, pfm, align = [], [], []
        for line in fo.readlines():
            if line.startswith("Motif"):
                # A new header closes the motif collected so far.
                if m_id:
                    motif = Motif()
                    motif.id = m_id
                    motif.pwm = pwm
                    motif.pfm = pfm
                    motif.align = align
                    motifs.append(motif)
                    pwm, pfm, align = [], [], []
                m_id = line.split("\t")[0]
                continue

            hit = freq_row.search(line)
            if hit:
                # Values are divided by 100 before being stored.
                pwm.append([float(hit.group(col)) / 100 for col in (2, 3, 4, 5)])

            hit = site_row.search(line)
            if hit:
                site = hit.group(1)
                if not pfm:
                    pfm = [[0] * 4 for _ in site]
                for pos, nuc in enumerate(site):
                    pfm[pos][nucs[nuc]] += 1
                align.append(site)

        if pwm:
            motif = Motif()
            motif.id = m_id
            motif.pwm = pwm
            motif.pfm = pfm
            motif.align = align
            motifs.append(motif)

        return motifs
Пример #6
0
 def test5_motif_to_img(self):
     """ Motif to img """
     seqlogo = which("seqlogo")
     if not seqlogo:
         # Nothing to check when the seqlogo executable is unavailable.
         print("seqlogo not found, skipping.")
         return

     target = "test/test.png"
     Motif(self.pfm).to_img(target, fmt="png", seqlogo=seqlogo)
     self.assertTrue(os.path.exists(target))
     # Clean up the generated image.
     os.unlink(target)
Пример #7
0
    def test8_pwm_to_str(self):
        # _pwm_to_str should give tab-separated rows at the requested
        # number of decimals.
        m = Motif([[0.01, 0.01, 0.01, 0.97], [0.123, 0.456, 0.222, 0.199]])

        expected = {
            2: "0.01\t0.01\t0.01\t0.97\n0.12\t0.46\t0.22\t0.20",
            3: "0.010\t0.010\t0.010\t0.970\n0.123\t0.456\t0.222\t0.199",
        }
        for precision in (2, 3):
            self.assertEqual(
                expected[precision], m._pwm_to_str(precision=precision))
Пример #8
0
    def parse(self, fo):
        """
        Convert MotifSampler output to motifs

        Lines starting with "#" of the form ``#key = value`` are collected
        as motif metadata, non-empty lines are read as tab-separated matrix
        rows, and an empty line closes the current motif. A matrix that is
        still open at the end of the input is closed as well, so the last
        motif is not lost when the file lacks a trailing blank line.

        Parameters
        ----------
        fo : file-like
            File object containing MotifSampler output.

        Returns
        -------
        motifs : list
            List of Motif instances.

        Raises
        ------
        KeyError
            If a motif is closed before "Consensus", "W" and "ID" metadata
            lines have been seen.
        """
        motifs = []

        pwm = []
        info = {}
        for line in fo.readlines():
            if line.startswith("#"):
                vals = line.strip()[1:].split(" = ")
                if len(vals) > 1:
                    info[vals[0]] = vals[1]
            elif len(line) > 1:
                pwm.append([float(x) for x in line.strip().split("\t")])
            else:
                motifs.append(Motif())
                motifs[-1].consensus = info["Consensus"]
                motifs[-1].width = info["W"]  # kept as the string from the file
                motifs[-1].id = info["ID"]
                motifs[-1].pwm = pwm[:]
                pwm = []

        # Flush rows that were not terminated by a blank line.
        if pwm:
            motifs.append(Motif())
            motifs[-1].consensus = info["Consensus"]
            motifs[-1].width = info["W"]
            motifs[-1].id = info["ID"]
            motifs[-1].pwm = pwm[:]

        return motifs
Пример #9
0
    def test11_slice_motif(self):
        # Slicing a motif should slice its consensus and its pfm alike.
        pfm = [
            [120, 0, 0, 0],
            [120, 0, 0, 0],
            [0, 60, 60, 0],
            [0, 0, 0, 120],
            [0, 0, 0, 120],
        ]
        m = Motif(pfm)
        m.to_consensus()

        # Drop the first and the last position.
        inner = m[1:-1]

        expected_pfm = pfm[1:-1]
        self.assertEqual("AST", inner.consensus.upper())
        self.assertEqual(expected_pfm, inner.pfm)
Пример #10
0
    def get_gimmemotif(self):
        """Build a Motif from self.counts and store it on this object.

        Sets self.length to the length of the first row of self.counts and
        self.gimme_obj to the new Motif, whose id is self.id and self.name
        joined by a space. Returns self.
        """
        self.length = len(self.counts[0])

        # self.counts is indexed [letter][position]; the Motif gets one row
        # per position ( A C G T ).
        motif_rows = [
            [self.counts[letter][pos_id] for letter in range(4)]
            for pos_id in range(self.length)
        ]

        gimme_obj = Motif(motif_rows)
        self.gimme_obj = gimme_obj
        self.gimme_obj.id = self.id + " " + self.name

        return self
Пример #11
0
    def parse(self, fo):
        """
        Convert motif finder output to a list of Motif instances.

        A line starting with "Motif" opens a new motif; its id is the text
        before the first tab. Rows holding an index and four decimal numbers
        become pwm rows (divided by 100). Lines starting with ">" contribute
        their trailing site sequence to align and to the pfm counts.
        """
        nucs = {"A": 0, "C": 1, "G": 2, "T": 3}
        number_row = re.compile(
            r'(\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)')
        sequence_row = re.compile(r'>.+\s+[bf]\d+\s+(\w+)')

        motifs = []
        current_id = ""
        pwm = []
        pfm = []
        align = []
        for line in fo.readlines():
            if not line.startswith("Motif"):
                found = number_row.search(line)
                if found:
                    pwm.append(
                        [float(found.group(col)) / 100 for col in (2, 3, 4, 5)])

                found = sequence_row.search(line)
                if found:
                    site = found.group(1)
                    if not pfm:
                        pfm = [[0 for _ in range(4)] for _ in range(len(site))]
                    for pos, nuc in enumerate(site):
                        pfm[pos][nucs[nuc]] += 1
                    align.append(site)
                continue

            # Header line: store the finished motif, then start a new one.
            if current_id:
                finished = Motif()
                finished.id = current_id
                finished.pwm = pwm
                finished.pfm = pfm
                finished.align = align
                motifs.append(finished)
                pwm = []
                pfm = []
                align = []
            current_id = line.split("\t")[0]

        if pwm:
            finished = Motif()
            finished.id = current_id
            finished.pwm = pwm
            finished.pfm = pfm
            finished.align = align
            motifs.append(finished)

        return motifs
Пример #12
0
    def parse(self, fo):
        """
        Convert BioProspector-style output to a list of Motif instances.

        A "Motif #<id>:" line opens a motif. Rows that start with an integer
        followed by four decimal numbers are stored as matrix rows, divided
        by 100. Motifs without any matrix row are not returned.
        """
        row_pattern = re.compile(
            r'^\d+\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)')

        motifs = []
        rows = []
        motif_id = ""
        for line in fo.readlines():
            if not line.startswith("Motif #"):
                hit = row_pattern.search(line)
                if hit:
                    rows.append([float(value) / 100.0 for value in hit.groups()])
                continue

            # New header: emit the motif collected so far, if any.
            if rows:
                motif = Motif(rows)
                motif.id = "BioProspector_w%s_%s" % (len(motif), motif_id)
                motifs.append(motif)
            motif_id = line.split("#")[1].split(":")[0]
            rows = []

        if rows:
            motif = Motif(rows)
            motif.id = "BioProspector_w%s_%s" % (len(motif), motif_id)
            motifs.append(motif)
        return motifs
Пример #13
0
    def test9_logodds_matrix(self):
        # Motif.logodds should match the reference values to five decimals.
        m = Motif([[0.5, 0.4, 0.1, 0.0], [0.25, 0.25, 0.25, 0.25]])

        expected = np.array(
            [
                [0.69813, 0.47623, -0.89160, -4.60517],
                [0.00995, 0.00995, 0.00995, 0.00995],
            ]
        )
        observed = np.array(m.logodds)
        np.testing.assert_almost_equal(expected, observed, decimal=5)
Пример #14
0
    def parse(self, fo):
        """
        Read motifs stored as consecutive six-line records.

        In every record, lines three to six hold tab-separated values, one
        line per nucleotide; they are transposed into one row per position.
        All motifs are finally renamed "<self.name>_<n>" (n counted from 1)
        and trimmed with a cutoff of 0.25.
        """
        motifs = []
        while True:
            record = [fo.readline() for _ in range(6)]
            if not record[0]:
                break
            by_nuc = [[float(v) for v in row.strip().split("\t")] for row in record[2:]]
            m = Motif([[by_nuc[n][pos] for n in range(4)] for pos in range(len(by_nuc[0]))])
            # Provisional id taken from the record header; replaced below.
            m.id = record[0].strip().split(" ")[-1]
            motifs.append(m)

        for number, motif in enumerate(motifs, 1):
            motif.id = "%s_%s" % (self.name, number)
            motif.trim(0.25)

        return motifs
Пример #15
0
def match(args):
    """
    Report, and optionally plot, the best database matches of each motif.

    Query motifs are read from ``args.pfmfile`` and database motifs from
    ``args.dbpfmfile``; ``args.nmatches`` is passed to
    ``MotifComparer.get_best_matches``. A tab-separated table (motif, match,
    score, p-value) is printed to stdout. When ``args.img`` is set, every
    pair is aligned, padded to equal length and handed to ``match_plot``.
    Padded motifs keep the id of the motif they were built from.
    """
    sample = {m.id: m for m in read_motifs(args.pfmfile)}
    db = {m.id: m for m in read_motifs(args.dbpfmfile)}

    mc = MotifComparer()
    result = mc.get_best_matches(
        sample.values(), args.nmatches, db.values(), "partial", "seqcor", "mean"
    )

    plotdata = []
    print("Motif\tMatch\tScore\tP-value")
    for motif_name, matches in result.items():
        for match in matches:
            motif = sample[motif_name]
            dbmotif = db[match[0]]

            pval, pos, orient = mc.compare_motifs(
                motif, dbmotif, "partial", "seqcor", "mean", pval=True
            )
            print("%s\t%s\t%0.2f\t%0.3e" % (motif_name, match[0], match[1][0], pval))

            if not args.img:
                continue

            # Remember the ids: rc() and the Motif(...) rebuilds below
            # return new objects.
            motif_id = motif.id
            dbmotif_id = dbmotif.id

            if orient == -1:
                dbmotif = dbmotif.rc()

            # Left-pad whichever motif starts later with uniform positions.
            if pos < 0:
                motif = Motif([[0.25, 0.25, 0.25, 0.25]] * -pos + motif.pwm)
            elif pos > 0:
                dbmotif = Motif([[0.25, 0.25, 0.25, 0.25]] * pos + dbmotif.pwm)

            # Right-pad the shorter motif so both have the same length;
            # nothing to do when they already match.
            diff = len(motif) - len(dbmotif)
            if diff > 0:
                dbmotif = Motif(dbmotif.pwm + [[0.25, 0.25, 0.25, 0.25]] * diff)
            elif diff < 0:
                motif = Motif(motif.pwm + [[0.25, 0.25, 0.25, 0.25]] * -diff)

            motif.id = motif_id
            dbmotif.id = dbmotif_id

            plotdata.append((motif, dbmotif, pval))
    if args.img:
        match_plot(plotdata, args.img)
Пример #16
0
    def parse(self, fo):
        """
        Read one motif from four lines of space-separated numbers.

        The four lines are read as one row per nucleotide and transposed
        into one row per position. Returns a single-element list holding a
        Motif whose id is self.name.
        """
        rows = []
        for _ in range(4):
            rows.append([float(x) for x in fo.readline().strip().split(" ")])

        # The width is taken from the first row.
        matrix = [[row[pos] for row in rows] for pos in range(len(rows[0]))]

        motif = Motif(matrix)
        motif.id = self.name
        return [motif]
Пример #17
0
    def parse(self, fo):
        """
        Convert six-line motif records to a list of Motif instances.

        Lines three to six of each record are tab-separated numbers, one
        line per nucleotide. After reading, every motif is named
        "<self.name>_<n>" (n starting at 1) and trimmed at 0.25.
        """

        def next_record():
            # readline() gives "" at EOF, so a record may be partly empty.
            return [fo.readline() for _ in range(6)]

        motifs = []
        record = next_record()
        while record[0]:
            per_nucleotide = [
                [float(cell) for cell in row.strip().split("\t")]
                for row in record[2:]
            ]
            width = len(per_nucleotide[0])
            per_position = [
                [per_nucleotide[nuc][pos] for nuc in range(4)]
                for pos in range(width)
            ]
            m = Motif(per_position)
            m.id = record[0].strip().split(" ")[-1]
            motifs.append(m)
            record = next_record()

        index = 0
        for motif in motifs:
            index += 1
            motif.id = "%s_%s" % (self.name, index)
            motif.trim(0.25)

        return motifs
Пример #18
0
    def parse(self, fo):
        """
        Convert GADEM output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing GADEM output.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        nucs = {"A": 0, "C": 1, "G": 2, "T": 3}
        motifs = []

        lines = fo.readlines()
        # Records are five lines long: a header and four matrix lines.
        for start in range(0, len(lines), 5):
            m_id = lines[start].strip()[1:]
            number = m_id.split("_")[0][1:]
            seq_file = "%s.seq" % number

            align = []
            pfm = []
            # Sites are read from "<number>.seq" when that file exists.
            if os.path.exists(seq_file):
                with open(seq_file) as f:
                    for raw in f:
                        if "x" in raw or "n" in raw:
                            continue
                        site = raw.strip().upper()
                        align.append(site)
                        if not pfm:
                            pfm = [[0] * 4 for _ in site]
                        for pos, nuc in enumerate(site):
                            pfm[pos][nucs[nuc]] += 1

            cells = [
                row.strip().split(" ")[1].split("\t")
                for row in lines[start + 1:start + 5]
            ]
            # Transpose to one row per position.
            pwm = [[float(cells[nuc][pos]) for nuc in range(4)]
                   for pos in range(len(cells[0]))]

            motif = Motif(pwm)
            motif.id = "{}_{}".format(self.name, m_id)
            if align:
                motif.pfm = pfm
                motif.align = align
            motifs.append(motif)

        return motifs
Пример #19
0
 def parse(self, fo):
     """
     Convert ChIPMunk output to a single-element list of Motif instances.

     Example of the lines that are read:

     KDIC|6.124756232026243
     A|517.9999999999999 42.99999999999999 345.99999999999994 25.999999999999996 602.9999999999999 155.99999999999997 2.9999999999999996 91.99999999999999
     C|5.999999999999999 4.999999999999999 2.9999999999999996 956.9999999999999 91.99999999999999 17.999999999999996 22.999999999999996 275.99999999999994
     G|340.99999999999994 943.9999999999999 630.9999999999999 6.999999999999999 16.999999999999996 48.99999999999999 960.9999999999999 14.999999999999998
     T|134.99999999999997 7.999999999999999 19.999999999999996 9.999999999999998 287.99999999999994 776.9999999999999 12.999999999999998 616.9999999999999
     N|999.9999999999998

     Returns an empty list when the input has no line starting with "A|".
     """
     # readline() returns "" at EOF; stop there instead of looping forever
     # on empty or truncated output.
     line = fo.readline()
     while line and not line.startswith("A|"):
         line = fo.readline()
     if not line:
         return []

     # Read the "A|" row and the three rows after it.
     rows = []
     for _ in range(4):
         rows.append([float(x) for x in line.strip().split("|")[1].split(" ")])
         line = fo.readline()

     # Transpose from one row per nucleotide to one row per position.
     matrix = [[rows[nuc][pos] for nuc in range(4)] for pos in range(len(rows[0]))]
     m = Motif(matrix)
     m.id = "ChIPMunk_w%s" % len(m)
     return [m]
Пример #20
0
def match(args):
    """
    Report, and optionally plot, the closest database match of each motif.

    Query motifs are read from ``args.pwmfile`` and database motifs from
    ``args.dbpwmfile``. A tab-separated table (motif, match, score, p-value)
    is printed to stdout. When ``args.img`` is set, the aligned motif pairs
    are passed to ``match_plot`` once, after all pairs have been collected.
    """
    sample = {m.id: m for m in pwmfile_to_motifs(args.pwmfile)}
    db = {m.id: m for m in pwmfile_to_motifs(args.dbpwmfile)}

    mc = MotifComparer()
    result = mc.get_closest_match(sample.values(), db.values(), "partial", "wic", "mean")

    plotdata = []
    print("Motif\tMatch\tScore\tP-value")
    for query, match in result.items():
        motif = sample[query]
        dbmotif = db[match[0]]
        # One comparison serves both the table row and the plot alignment.
        pval, pos, orient = mc.compare_motifs(motif, dbmotif, "partial", "wic", "mean", pval=True)
        print("%s\t%s\t%0.2f\t%0.3e" % (query, match[0], match[1][0], pval))

        if not args.img:
            continue

        if orient == -1:
            # Plot the reverse complement but keep the database id.
            tmp = dbmotif.id
            dbmotif = dbmotif.rc()
            dbmotif.id = tmp

        # Left-pad whichever motif starts later with uniform positions.
        if pos < 0:
            tmp = motif.id
            motif = Motif([[0.25, 0.25, 0.25, 0.25]] * -pos + motif.pwm)
            motif.id = tmp
        elif pos > 0:
            tmp = dbmotif.id
            dbmotif = Motif([[0.25, 0.25, 0.25, 0.25]] * pos + dbmotif.pwm)
            dbmotif.id = tmp

        plotdata.append((motif, dbmotif, pval))

    # Plot once with the complete data set instead of re-plotting after
    # every pair; skip the call when there is nothing to plot.
    if args.img and plotdata:
        match_plot(plotdata, args.img)
Пример #21
0
    def test_motif_export_import(self):
        # A motif written by each exporter and read back with read_motifs
        # should keep its consensus and its id.
        pfm = [
            [120, 0, 0, 0],
            [120, 0, 0, 0],
            [0, 60, 60, 0],
            [0, 0, 0, 120],
            [0, 0, 0, 120],
        ]
        motif = Motif(pfm)
        motif.id = "test_motif"

        # (exporter, format passed to read_motifs); the motevo export is
        # read back with fmt="transfac".
        round_trips = [
            (motif.to_transfac, "transfac"),
            (motif.to_meme, "meme"),
            (motif.to_motevo, "transfac"),
        ]
        for export, fmt in round_trips:
            handle = StringIO(export())
            restored = read_motifs(handle, fmt=fmt)[0]
            self.assertEqual("AASTT", restored.to_consensus().upper())
            self.assertEqual("test_motif", restored.id)
Пример #22
0
    def parse(self, fo):
        """
        Convert MEME output to motifs

        For every "MOTIF ... MEME-<n> width = <w> sites = <s>" line, the
        following lines up to the next line starting with "//" are scanned
        for site sequences, which are counted into a frequency matrix. A
        character that is not A, C, G or T adds 0.25 to all four counts of
        its position. Scanning also stops at the end of the input, and a
        motif for which no site was found is skipped.

        Parameters
        ----------
        fo : file-like
            File object containing MEME output.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        motifs = []
        nucs = {"A": 0, "C": 1, "G": 2, "T": 3}

        p = re.compile(
            r"MOTIF.+MEME-(\d+)\s*width\s*=\s*(\d+)\s+sites\s*=\s*(\d+)")
        pa = re.compile(r"\)\s+([A-Z]+)")
        line = fo.readline()
        while line:
            m = p.search(line)
            if m:
                motif_id = "%s_%s_w%s" % (self.name, m.group(1), m.group(2))
                align = []
                pfm = None
                # readline() returns "" at EOF; without the `line` check a
                # truncated file (no closing "//") would loop forever.
                while line and not line.startswith("//"):
                    ma = pa.search(line)
                    if ma:
                        site = ma.group(1)
                        align.append(site)
                        if not pfm:
                            pfm = [[0 for _ in range(4)]
                                   for _ in range(len(site))]
                        for pos, nuc in enumerate(site):
                            if nuc in nucs:
                                pfm[pos][nucs[nuc]] += 1
                            else:
                                for i in range(4):
                                    pfm[pos][i] += 0.25

                    line = fo.readline()

                # Without any site there is no matrix to build a motif from.
                if pfm:
                    motifs.append(Motif(pfm[:]))
                    motifs[-1].id = motif_id
                    motifs[-1].align = align[:]
            line = fo.readline()

        return motifs
Пример #23
0
 def parse(self, fo):
     """
     Convert ChIPMunk output to a single-element list of Motif instances.

     Example of the lines that are read:

     KDIC|6.124756232026243
     A|517.9999999999999 42.99999999999999 345.99999999999994 25.999999999999996 602.9999999999999 155.99999999999997 2.9999999999999996 91.99999999999999
     C|5.999999999999999 4.999999999999999 2.9999999999999996 956.9999999999999 91.99999999999999 17.999999999999996 22.999999999999996 275.99999999999994
     G|340.99999999999994 943.9999999999999 630.9999999999999 6.999999999999999 16.999999999999996 48.99999999999999 960.9999999999999 14.999999999999998
     T|134.99999999999997 7.999999999999999 19.999999999999996 9.999999999999998 287.99999999999994 776.9999999999999 12.999999999999998 616.9999999999999
     N|999.9999999999998

     Returns an empty list when the input has no line starting with "A|".
     """
     line = fo.readline()
     # An empty string from readline() means EOF: give up rather than
     # spin forever when the matrix is missing.
     while line and not line.startswith("A|"):
         line = fo.readline()
     if not line:
         return []

     # Read the "A|" row and the three rows after it.
     rows = []
     for _ in range(4):
         rows.append(
             [float(x) for x in line.strip().split("|")[1].split(" ")])
         line = fo.readline()

     # Transpose from one row per nucleotide to one row per position.
     matrix = [[rows[nuc][pos] for nuc in range(4)]
               for pos in range(len(rows[0]))]
     m = Motif(matrix)
     m.id = "ChIPMunk_w%s" % len(m)
     return [m]
Пример #24
0
    def parse(self, fo):
        """
        Convert motif finder output to a list of Motif instances.

        A line that starts with "Motif" and ends with ":" opens a motif
        named "<self.name>_<text before the colon>". Lines holding an
        integer followed by four numbers are stored as matrix rows. Motifs
        without any matrix row are skipped, so empty input gives an empty
        list instead of a single empty motif.
        """
        motifs = []

        # An integer followed by four numeric columns.
        p = re.compile(r'\d+\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)')
        wm = []
        name = ""
        for line in fo.readlines():
            if line.startswith("Motif") and line.strip().endswith(":"):
                if name:
                    if wm:
                        motifs.append(Motif(wm))
                        motifs[-1].id = name
                    wm = []
                name = "%s_%s" % (self.name, line.split(":")[0])
            else:
                m = p.search(line)
                if m:
                    wm.append([float(m.group(x)) for x in range(1, 5)])

        # Emit the last motif only when rows were actually collected.
        if wm:
            motifs.append(Motif(wm))
            motifs[-1].id = name

        return motifs
Пример #25
0
    def parse(self, fo):
        """
        Convert AMD output to motifs

        A line that starts with "Motif" and ends with ":" opens a motif
        named "<self.name>_<text before the colon>". Lines holding an
        integer followed by four numbers are stored as matrix rows. Motifs
        without any matrix row are skipped, so empty input gives an empty
        list instead of a single empty motif.

        Parameters
        ----------
        fo : file-like
            File object containing AMD output.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        motifs = []

        # An integer followed by four numeric columns.
        p = re.compile(r"\d+\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)")
        wm = []
        name = ""
        for line in fo.readlines():
            if line.startswith("Motif") and line.strip().endswith(":"):
                if name:
                    if wm:
                        motifs.append(Motif(wm))
                        motifs[-1].id = name
                    wm = []
                name = "%s_%s" % (self.name, line.split(":")[0])
            else:
                m = p.search(line)
                if m:
                    wm.append([float(m.group(x)) for x in range(1, 5)])

        # Emit the last motif only when rows were actually collected.
        if wm:
            motifs.append(Motif(wm))
            motifs[-1].id = name

        return motifs
Пример #26
0
    def parse(self, fo):
        """
        Convert BioProspector-style output to a list of Motif instances.

        Each "Motif #<id>:" header starts a new motif; rows beginning with
        an integer and four decimal numbers are collected as matrix rows
        (divided by 100). Headers without matrix rows give no motif.
        """
        matrix_row = re.compile(r'^\d+\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)')

        motifs = []
        collected = []
        motif_id = ""
        for line in fo.readlines():
            is_header = line.startswith("Motif #")
            if is_header and collected:
                finished = Motif(collected)
                finished.id = "BioProspector_w%s_%s" % (len(finished), motif_id)
                motifs.append(finished)
            if is_header:
                motif_id = line.split("#")[1].split(":")[0]
                collected = []
                continue

            found = matrix_row.search(line)
            if found:
                collected.append([float(found.group(col)) / 100.0 for col in (1, 2, 3, 4)])

        # The last motif has no header after it to trigger its emission.
        if collected:
            finished = Motif(collected)
            finished.id = "BioProspector_w%s_%s" % (len(finished), motif_id)
            motifs.append(finished)
        return motifs
Пример #27
0
    def parse(self, fo, width, seed=None):
        """
        Convert Posmo output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing Posmo output.
        width : object
            Value inserted into every motif id after "_w".
        seed : object, optional
            When truthy, it is added to the motif id after the width.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        motifs = []
        while True:
            # Records are six lines long; "" in the first slot means EOF.
            record = [fo.readline() for _ in range(6)]
            if not record[0]:
                break
            per_nucleotide = [
                [float(cell) for cell in row.strip().split("\t")]
                for row in record[2:]
            ]
            per_position = [
                [per_nucleotide[nuc][pos] for nuc in range(4)]
                for pos in range(len(per_nucleotide[0]))
            ]
            m = Motif(per_position)
            m.trim(0.1)
            m.id = record[0].strip().split(" ")[-1]
            motifs.append(m)

        for number, motif in enumerate(motifs, 1):
            if seed:
                new_id = "%s_w%s.%s_%s" % (self.name, width, seed, number)
            else:
                new_id = "%s_w%s_%s" % (self.name, width, number)
            motif.id = new_id
            motif.trim(0.25)

        return motifs
Пример #28
0
    def test8_pwm_to_str_hash(self):
        # The same matrix written with and without trailing zeros should
        # give the same hash.
        h = "1f260320cac8c26a"
        matrices = (
            [[0.01, 0.01, 0.01, 0.97], [0.123, 0.456, 0.222, 0.199]],
            [
                [0.010000, 0.010000, 0.010000, 0.970000],
                [0.12300, 0.45600, 0.22200, 0.19900],
            ],
        )
        for pwm in matrices:
            self.assertEqual(h, Motif(pwm).hash())
Пример #29
0
    def parse_out(self, fo):
        """
        Convert MotifSampler output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing MotifSampler output.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        nucs = {"A": 0, "C": 1, "G": 2, "T": 3}
        pseudo = 0.0  # Should be 1/sqrt(# of seqs)

        # Group the site sequences by motif id.
        aligns = {}
        for line in fo.readlines():
            if line.startswith("#") or not len(line) > 1:
                continue
            columns = line.strip().split("\t")
            pairs = []
            for field in columns[8].split(";"):
                if field:
                    pairs.append(field.strip().split(" ")[1].replace('"', ""))
            m_id, site = pairs
            # Sites containing N are left out.
            if site.upper().find("N") != -1:
                continue
            aligns.setdefault(m_id, []).append(site)

        motifs = []
        for m_id, align in aligns.items():
            pfm = [[0, 0, 0, 0] for _ in range(len(align[0]))]
            for site in align:
                for pos, nuc in enumerate(site):
                    pfm[pos][nucs[nuc]] += 1
            total = float(len(align))
            pwm = [[(x + pseudo / 4) / total + (pseudo) for x in row] for row in pfm]

            motif = Motif()
            motif.align = align[:]
            motif.pwm = pwm[:]
            motif.pfm = pfm[:]
            motif.id = m_id
            motifs.append(motif)
        return motifs
Пример #30
0
    def parse(self, fo):
        """
        Convert matrix output with "#key = value" headers to Motif instances.

        Lines starting with "#" of the form ``#key = value`` are collected
        as metadata, non-empty lines are read as tab-separated matrix rows,
        and an empty line closes the current motif, which takes its
        consensus, width and id from the "Consensus", "W" and "ID" metadata
        (KeyError if one is missing). A matrix still open at the end of the
        input is closed too, so the last motif is not lost when the file
        lacks a trailing blank line.
        """
        motifs = []

        pwm = []
        info = {}
        for line in fo.readlines():
            if line.startswith("#"):
                vals = line.strip()[1:].split(" = ")
                if len(vals) > 1:
                    info[vals[0]] = vals[1]
            elif len(line) > 1:
                pwm.append([float(x) for x in line.strip().split("\t")])
            else:
                motifs.append(Motif())
                motifs[-1].consensus = info["Consensus"]
                motifs[-1].width = info["W"]  # kept as the string from the file
                motifs[-1].id = info["ID"]
                motifs[-1].pwm = pwm[:]
                pwm = []

        # Flush rows that were not terminated by a blank line.
        if pwm:
            motifs.append(Motif())
            motifs[-1].consensus = info["Consensus"]
            motifs[-1].width = info["W"]
            motifs[-1].id = info["ID"]
            motifs[-1].pwm = pwm[:]

        return motifs
Пример #31
0
    def parse(self, fo):
        """
        Convert HMS output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing HMS output.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        # Four lines of space-separated numbers, one line per nucleotide.
        by_nucleotide = []
        for _ in range(4):
            fields = fo.readline().strip().split(" ")
            by_nucleotide.append([float(value) for value in fields])

        # Transpose to one row per position; width comes from the first line.
        width = len(by_nucleotide[0])
        matrix = [[row[pos] for row in by_nucleotide] for pos in range(width)]

        motif = Motif(matrix)
        motif.id = self.name
        return [motif]
Пример #32
0
    def parse(self, fo):
        """
        Convert five-line motif records to a list of Motif instances.

        Each record is a header line followed by four matrix lines. The
        motif id is the header without its first character. When a file
        named "<number>.seq" exists (number taken from the id), its lines
        that contain neither "x" nor "n" are used as aligned sites and
        counted into a frequency matrix.
        """
        motifs = []
        nucs = {"A": 0, "C": 1, "G": 2, "T": 3}

        lines = fo.readlines()
        for i in range(0, len(lines), 5):
            align = []
            pfm = []
            motif_id = lines[i].strip()[1:]
            number = motif_id.split("_")[0][1:]
            seq_file = "%s.seq" % number
            if os.path.exists(seq_file):
                # Use a context manager so the file handle is always closed.
                with open(seq_file) as handle:
                    for seq_line in handle:
                        if "x" in seq_line or "n" in seq_line:
                            continue
                        site = seq_line.strip().upper()
                        align.append(site)
                        if not pfm:
                            pfm = [[0 for _ in range(4)]
                                   for _ in range(len(site))]
                        for pos, nuc in enumerate(site):
                            pfm[pos][nucs[nuc]] += 1

            m = [
                row.strip().split(" ")[1].split("\t")
                for row in lines[i + 1:i + 5]
            ]

            # Transpose to one row per position.
            pwm = [[float(m[x][y]) for x in range(4)]
                   for y in range(len(m[0]))]

            motifs.append(Motif(pwm))
            motifs[-1].id = motif_id
            if align:
                motifs[-1].pfm = pfm
                motifs[-1].align = align

        return motifs
Пример #33
0
    def parse(self, fo):
        """
        Convert aligned-site output to a list of Motif instances.

        Lines matching the site pattern contribute their [ACGT]+ sequence
        to the current alignment. A line starting with "====" closes the
        alignment: its sites are counted into a frequency matrix (pfm) and
        normalised by the number of sites (pwm). Motifs are named
        "Weeder_<n>", counted from 1. A "====" line with no sites collected
        since the previous one is ignored.
        """
        motifs = []
        nucs = {"A": 0, "C": 1, "G": 2, "T": 3}

        pa = re.compile(r'(\s*\d+\s+.\s+([ACGT]+)\s+.+\))')
        align = []
        c = 1
        for line in fo.readlines():
            m = pa.search(line)
            if m:
                align.append(m.group(2))

            elif line.startswith("===="):
                # Nothing to build a matrix from; align[0] would fail.
                if not align:
                    continue

                width = len(align[0])
                pfm = [[0 for _ in range(4)] for _ in range(width)]
                for row in align:
                    for i, nuc in enumerate(row):
                        pfm[i][nucs[nuc]] += 1
                total = float(len(align))
                pwm = [[count / total for count in row] for row in pfm]

                motifs.append(Motif())
                motifs[-1].id = "Weeder_%s" % c
                motifs[-1].align = align[:]
                motifs[-1].pwm = pwm
                motifs[-1].pfm = pfm

                align = []
                c += 1

        return motifs
Пример #34
0
    def parse(self, fo):
        """
        Convert Improbizer output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing Improbizer output.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        header = re.compile(r"\d+\s+@\s+\d+\.\d+\s+sd\s+\d+\.\d+\s+(\w+)$")
        motifs = []

        line = fo.readline()
        # Reading stops at EOF or at the first line containing "Color".
        while line and "Color" not in line:
            hit = header.search(line)
            if hit:
                # The four lines after a header each start with a letter
                # followed by the values of that letter.
                by_nuc = {}
                for _ in range(4):
                    fields = [
                        token.strip()
                        for token in fo.readline().strip().split(" ")
                        if token
                    ]
                    by_nuc[fields[0].upper()] = fields[1:]

                pwm = [
                    [float(by_nuc[nuc][pos]) for nuc in ["A", "C", "G", "T"]]
                    for pos in range(len(by_nuc["A"]))
                ]
                motif = Motif(pwm)
                motif.id = "%s_%s" % (self.name, hit.group(1))
                motifs.append(motif)
            line = fo.readline()

        return motifs
Пример #35
0
    def parse(self, fo):
        """
        Convert BioProspector output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing BioProspector output.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        # Matrix rows: an integer followed by four decimal numbers.
        row_re = re.compile(
            r"^\d+\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)")

        motifs = []
        motif_id = ""
        rows = []
        for line in fo.readlines():
            if line.startswith("Motif #"):
                # A header closes the previous motif when it has rows.
                if rows:
                    done = Motif(rows)
                    done.id = "BioProspector_w%s_%s" % (len(done), motif_id)
                    motifs.append(done)
                rows = []
                motif_id = line.split("#")[1].split(":")[0]
                continue

            hit = row_re.search(line)
            if not hit:
                continue
            # Values are divided by 100 before being stored.
            rows.append([float(text) / 100.0 for text in hit.group(1, 2, 3, 4)])

        if rows:
            done = Motif(rows)
            done.id = "BioProspector_w%s_%s" % (len(done), motif_id)
            motifs.append(done)
        return motifs
Пример #36
0
def cluster_motifs(
    motifs,
    match="total",
    metric="wic",
    combine="mean",
    pval=True,
    threshold=0.95,
    trim_edges=False,
    edge_ic_cutoff=0.2,
    include_bg=True,
    progress=True,
    ncpus=None,
):
    """
    Cluster a set of sequence motifs.

    Starting from one tree node per motif, the two best-scoring nodes are
    repeatedly replaced by a node holding their average motif until a
    single root remains.

    Parameters
    ----------
    motifs : list or other
        A list of motifs. Anything that is not a list is handed to
        ``read_motifs`` (with ``fmt="pwm"``), i.e. it is expected to be a
        file containing positional frequency matrices.
    match : str, optional
        'match', 'metric' and 'combine' specify the method used to compare
        and score the motifs. 'match' can be either 'total' for the total
        alignment or 'subtotal' for the maximum scoring subsequence of the
        alignment. Default is 'total'.
    metric : str, optional
        Can be any metric defined in MotifComparer, currently: 'pcc', 'ed',
        'distance', 'wic' or 'chisq'. By default the WIC score is used.
    combine : str, optional
        Determines how the total score is calculated from the score of
        individual positions and can be either 'sum' or 'mean'. Default is
        'mean'.
    pval : bool, optional
        Determines if the score should be converted to an empirical
        p-value. When True, the first element of every score is stored as
        ``1 - value`` so that a higher number always means more similar.
    threshold : float, optional
        The score (or p-value) cutoff; it is passed to ``checkMerge`` on
        the parent of every leaf once the tree is complete.
    trim_edges : bool, optional
        If True, all motif edges with an IC below 'edge_ic_cutoff' are
        removed before clustering. Note that averaged motifs are always
        trimmed with 'edge_ic_cutoff', regardless of this flag.
    edge_ic_cutoff : float, optional
        Information content cutoff used when trimming.
    include_bg : bool, optional
        When computing the average of two motifs this determines if, at a
        position only present in one motif, the information in that motif
        should be kept, or if it should be averaged with background
        frequencies. Should probably be left set to True.
    progress : bool, optional
        Write progress information to stderr.
    ncpus : int, optional
        Forwarded to ``MotifComparer.get_all_scores`` for every scoring
        call.

    Returns
    -------
    root : MotifTree
        Root node of the cluster tree.

    Raises
    ------
    ValueError
        If the best remaining pair of nodes contains an empty motif. Such
        a pair can never be merged, so clustering cannot make progress.
    """
    # First read pfm or pfm formatted motiffile
    if not isinstance(motifs, list):
        motifs = read_motifs(motifs, fmt="pwm")

    mc = MotifComparer()

    # Trim edges with low information content
    if trim_edges:
        for motif in motifs:
            motif.trim(edge_ic_cutoff)

    # Make a MotifTree node for every motif
    nodes = [MotifTree(m) for m in motifs]

    # Determine all pairwise scores and maxscore per motif
    scores = {}
    motif_nodes = {n.motif.id: n for n in nodes}
    motifs = [n.motif for n in nodes]

    if progress:
        sys.stderr.write("Calculating initial scores\n")
    result = mc.get_all_scores(
        motifs, motifs, match, metric, combine, pval,
        parallel=True, ncpus=ncpus)

    for m1, other_motifs in result.items():
        for m2, score in other_motifs.items():
            if m1 == m2:
                # The self-comparison gives the maximum attainable score.
                if pval:
                    motif_nodes[m1].maxscore = 1 - score[0]
                else:
                    motif_nodes[m1].maxscore = score[0]
            else:
                if pval:
                    score = [1 - score[0]] + score[1:]
                scores[(motif_nodes[m1], motif_nodes[m2])] = score

    cluster_nodes = list(nodes)
    ave_count = 1

    total = len(cluster_nodes)

    while len(cluster_nodes) > 1:
        # Scores of already merged nodes are never removed, so walk down
        # from the best pair until both members are still unmerged.
        ranked = sorted(scores.keys(), key=lambda x: scores[x][0])
        i = -1
        (n1, n2) = ranked[i]
        while n1 not in cluster_nodes or n2 not in cluster_nodes:
            i -= 1
            (n1, n2) = ranked[i]

        if len(n1.motif) == 0 or len(n2.motif) == 0:
            # Skipping the merge would leave every data structure untouched
            # and select the very same pair again, looping forever.
            raise ValueError(
                "cannot cluster empty motif (%s, %s)"
                % (n1.motif.id, n2.motif.id))

        (score, pos, orientation) = scores[(n1, n2)]
        ave_motif = n1.motif.average_motifs(
            n2.motif, pos, orientation, include_bg=include_bg)

        ave_motif.trim(edge_ic_cutoff)

        # Check if the motif is not empty
        if len(ave_motif) == 0:
            ave_motif = Motif([[0.25, 0.25, 0.25, 0.25]])

        ave_motif.id = "Average_%s" % ave_count
        ave_count += 1

        new_node = MotifTree(ave_motif)
        self_score = mc.compare_motifs(
            new_node.motif, new_node.motif, match, metric, combine, pval)[0]
        new_node.maxscore = 1 - self_score if pval else self_score

        new_node.mergescore = score
        n1.parent = new_node
        n2.parent = new_node
        new_node.left = n1
        new_node.right = n2

        # new_node is not in `nodes` yet, so this holds every other
        # unmerged node it has to be scored against.
        cmp_nodes = {node.motif: node for node in nodes if not node.parent}

        if progress:
            # Use a separate name: rebinding `progress` would replace the
            # caller's flag with a number.
            pct = (1 - len(cmp_nodes) / float(total)) * 100
            sys.stderr.write('\rClustering [{0}{1}] {2}%'.format(
                '#' * (int(pct) // 10),
                " " * (10 - int(pct) // 10),
                int(pct)))

        result = mc.get_all_scores(
            [new_node.motif],
            list(cmp_nodes.keys()),
            match,
            metric,
            combine,
            pval,
            parallel=True,
            ncpus=ncpus)

        for motif, n in cmp_nodes.items():
            x = result[new_node.motif.id][motif.id]
            if pval:
                x = [1 - x[0]] + x[1:]
            scores[(new_node, n)] = x

        nodes.append(new_node)

        cluster_nodes = [node for node in nodes if not node.parent]

    if progress:
        sys.stderr.write("\n")
    root = nodes[-1]
    leaves = [node for node in nodes if not node.left]
    for node in leaves:
        # With a single input motif the only leaf is the root itself and
        # has no parent to check.
        if node.parent:
            node.parent.checkMerge(root, threshold)

    return root