def parse(self, fo):
    """
    Convert BioProspector output to motifs

    Parameters
    ----------
    fo : file-like
        File object containing BioProspector output.

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    row_pattern = re.compile(
        r'^\d+\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)')
    collected = []

    def store(rows, label):
        # Only blocks that produced at least one matrix row become motifs.
        if rows:
            motif = Motif(rows)
            motif.id = "BioProspector_w%s_%s" % (len(motif), label)
            collected.append(motif)

    rows = []
    label = ""
    for line in fo.readlines():
        if line.startswith("Motif #"):
            # A new header closes the block collected so far.
            store(rows, label)
            label = line.split("#")[1].split(":")[0]
            rows = []
            continue
        hit = row_pattern.search(line)
        if hit:
            # The four captured values are divided by 100.
            rows.append([float(value) / 100.0 for value in hit.groups()])
    store(rows, label)
    return collected
def match(args):
    """
    Print the closest database match for every motif, optionally plot them.

    Parameters
    ----------
    args : object
        Must provide ``pwmfile``, ``dbpwmfile`` and ``img``. When ``img``
        is truthy, the aligned motif pairs are passed to ``match_plot``.
    """
    sample = {m.id: m for m in pwmfile_to_motifs(args.pwmfile)}
    db = {m.id: m for m in pwmfile_to_motifs(args.dbpwmfile)}

    mc = MotifComparer()
    result = mc.get_closest_match(
        sample.values(), db.values(), "partial", "wic", "mean")

    print("Motif\tMatch\tScore\tP-value")
    for query_id, hit in result.items():
        target_id = hit[0]
        pval, pos, orient = mc.compare_motifs(
            sample[query_id], db[target_id], "partial", "wic", "mean",
            pval=True)
        print("%s\t%s\t%0.2f\t%0.3e" % (query_id, target_id, hit[1][0], pval))

    if not args.img:
        return

    plotdata = []
    for query_id, hit in result.items():
        motif = sample[query_id]
        dbmotif = db[hit[0]]
        pval, pos, orient = mc.compare_motifs(
            motif, dbmotif, "partial", "wic", "mean", pval=True)

        if orient == -1:
            # rc() returns a new motif; carry the id over.
            kept_id = dbmotif.id
            dbmotif = dbmotif.rc()
            dbmotif.id = kept_id

        # Left-pad one of the two motifs with uniform columns according to
        # the sign of the reported position.
        if pos < 0:
            kept_id = motif.id
            motif = Motif([[0.25, 0.25, 0.25, 0.25]] * -pos + motif.pwm)
            motif.id = kept_id
        elif pos > 0:
            kept_id = dbmotif.id
            dbmotif = Motif([[0.25, 0.25, 0.25, 0.25]] * pos + dbmotif.pwm)
            dbmotif.id = kept_id

        plotdata.append((motif, dbmotif, pval))

    match_plot(plotdata, args.img)
def test6_pcc(self):
    """max_pcc of two motifs built from identical matrices reports 4."""
    first = Motif([[5, 0, 0, 0], [0, 5, 0, 0], [0, 5, 0, 0], [0, 0, 0, 5]])
    second = Motif([[5, 0, 0, 0], [0, 5, 0, 0], [0, 5, 0, 0], [0, 0, 0, 5]])

    best = first.max_pcc(second)
    self.assertEqual(4, best[0])
def parse(self, fo):
    """
    Convert MDmodule output to motifs

    Parameters
    ----------
    fo : file-like
        File object containing MDmodule output.

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    nucs = {"A": 0, "C": 1, "G": 2, "T": 3}
    pwm_row = re.compile(
        r"(\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)")
    site_row = re.compile(r">.+\s+[bf]\d+\s+(\w+)")
    motifs = []

    def store(label, pwm, pfm, align):
        # Build an empty Motif and attach everything gathered for one block.
        motif = Motif()
        motif.id = label
        motif.pwm = pwm
        motif.pfm = pfm
        motif.align = align
        motifs.append(motif)

    pwm, pfm, align = [], [], []
    m_id = ""
    for line in fo.readlines():
        if line.startswith("Motif"):
            # A header closes the previous block, if one was started.
            if m_id:
                store(m_id, pwm, pfm, align)
                pwm, pfm, align = [], [], []
            m_id = line.split("\t")[0]
            continue

        hit = pwm_row.search(line)
        if hit:
            # Groups 2-5 hold the four values; they are divided by 100.
            pwm.append([float(hit.group(x)) / 100 for x in (2, 3, 4, 5)])

        hit = site_row.search(line)
        if hit:
            site = hit.group(1)
            if not pfm:
                pfm = [[0] * 4 for _ in range(len(site))]
            for i, nuc in enumerate(site):
                pfm[i][nucs[nuc]] += 1
            align.append(site)

    # The last block is only kept when it produced matrix rows.
    if pwm:
        store(m_id, pwm, pfm, align)
    return motifs
def test8_pwm_to_str_hash(self):
    """The same matrix written with extra trailing zeros hashes identically."""
    expected = "1f260320cac8c26a"
    compact = [[0.01, 0.01, 0.01, 0.97], [0.123, 0.456, 0.222, 0.199]]
    padded = [
        [0.010000, 0.010000, 0.010000, 0.970000],
        [0.12300, 0.45600, 0.22200, 0.19900],
    ]

    for matrix in (compact, padded):
        self.assertEqual(expected, Motif(matrix).hash())
def test_motif_export_import(self):
    """Export a motif to text and read it back with read_motifs."""
    pfm = [
        [120, 0, 0, 0],
        [120, 0, 0, 0],
        [0, 60, 60, 0],
        [0, 0, 0, 120],
        [0, 0, 0, 120],
    ]
    motif = Motif(pfm)
    motif.id = "test_motif"

    # (exporter, fmt given to read_motifs). The to_motevo() text is read
    # back with fmt="transfac".
    round_trips = (
        (motif.to_transfac, "transfac"),
        (motif.to_meme, "meme"),
        (motif.to_motevo, "transfac"),
    )
    for export, fmt in round_trips:
        handle = StringIO(export())
        parsed = read_motifs(handle, fmt=fmt)[0]
        self.assertEqual("AASTT", parsed.to_consensus().upper())
        self.assertEqual("test_motif", parsed.id)
def parse(self, fo):
    """
    Convert ChIPMunk output to motifs

    Parameters
    ----------
    fo : file-like
        File object containing ChIPMunk output.

    Returns
    -------
    motifs : list
        List with one Motif instance, or an empty list when the input
        contains no line starting with "A|".
    """
    line = fo.readline()
    # readline() returns "" at EOF and keeps doing so; checking only for the
    # "A|" prefix would therefore never terminate on input without a matrix.
    while line and not line.startswith("A|"):
        line = fo.readline()
    if not line:
        return []

    # The "A|" line and the three lines after it each hold one matrix row.
    rows = []
    for _ in range(4):
        rows.append([float(x) for x in line.strip().split("|")[1].split()])
        line = fo.readline()

    # Transpose so that each entry holds the four values of one position.
    matrix = [[rows[x][y] for x in range(4)] for y in range(len(rows[0]))]
    m = Motif(matrix)
    m.id = "ChIPMunk_w%s" % len(m)
    return [m]
def parse(self, fo):
    """
    Convert MotifSampler output to motifs

    Parameters
    ----------
    fo : file-like
        File object containing MotifSampler output.

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    result = []
    rows = []
    header = {}
    for line in fo.readlines():
        if line.startswith("#"):
            # "#key = value" lines fill the header dictionary.
            fields = line.strip()[1:].split(" = ")
            if len(fields) > 1:
                header[fields[0]] = fields[1]
            continue

        if len(line) > 1:
            rows.append([float(x) for x in line.strip().split("\t")])
            continue

        # A line of at most one character closes the current matrix.
        motif = Motif()
        motif.consensus = header["Consensus"]
        motif.width = header["W"]
        motif.id = header["ID"]
        motif.pwm = rows[:]
        result.append(motif)
        rows = []
    return result
def parse(self, fo):
    """
    Convert tool output to motifs

    Blocks start at lines beginning with "Motif"; the text before the
    first tab of such a line becomes the motif id.

    Parameters
    ----------
    fo : file-like
        File object containing the tool output.

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    nucs = {"A": 0, "C": 1, "G": 2, "T": 3}
    value_re = re.compile(
        r'(\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)')
    site_re = re.compile(r'>.+\s+[bf]\d+\s+(\w+)')

    motifs = []
    pwm = []
    pfm = []
    align = []
    label = ""
    for line in fo.readlines():
        if not line.startswith("Motif"):
            found = value_re.search(line)
            if found:
                # Groups 2-5 are the four values, divided by 100.
                pwm.append(
                    [float(found.group(x)) / 100 for x in (2, 3, 4, 5)])
            found = site_re.search(line)
            if found:
                site = found.group(1)
                if not pfm:
                    pfm = [[0] * 4 for _ in range(len(site))]
                for column, letter in enumerate(site):
                    pfm[column][nucs[letter]] += 1
                align.append(site)
            continue

        if label:
            # Close the block that was open before this header.
            motif = Motif()
            for attr, value in (("id", label), ("pwm", pwm),
                                ("pfm", pfm), ("align", align)):
                setattr(motif, attr, value)
            motifs.append(motif)
            pwm = []
            pfm = []
            align = []
        label = line.split("\t")[0]

    if pwm:
        motif = Motif()
        for attr, value in (("id", label), ("pwm", pwm),
                            ("pfm", pfm), ("align", align)):
            setattr(motif, attr, value)
        motifs.append(motif)
    return motifs
def test9_logodds_matrix(self):
    """Motif.logodds matches the reference values to five decimals."""
    expected = np.array([
        [0.69813, 0.47623, -0.89160, -4.60517],
        [0.00995, 0.00995, 0.00995, 0.00995],
    ])
    motif = Motif([[0.5, 0.4, 0.1, 0.0], [0.25, 0.25, 0.25, 0.25]])

    observed = np.array(motif.logodds)
    np.testing.assert_almost_equal(expected, observed, decimal=5)
def match(args):
    """
    Print the best database matches per motif and optionally plot them.

    Parameters
    ----------
    args : object
        Must provide ``pfmfile``, ``dbpfmfile``, ``nmatches`` and ``img``.
        When ``img`` is truthy, every reported pair is aligned, padded to
        equal length and passed to ``match_plot``.
    """
    sample = {m.id: m for m in read_motifs(args.pfmfile)}
    db = {m.id: m for m in read_motifs(args.dbpfmfile)}

    mc = MotifComparer()
    result = mc.get_best_matches(
        sample.values(), args.nmatches, db.values(), "partial", "seqcor", "mean"
    )

    plotdata = []
    print("Motif\tMatch\tScore\tP-value")
    for motif_name, matches in result.items():
        for hit in matches:
            pval, pos, orient = mc.compare_motifs(
                sample[motif_name], db[hit[0]], "partial", "seqcor", "mean",
                pval=True
            )
            print("%s\t%s\t%0.2f\t%0.3e" % (motif_name, hit[0], hit[1][0], pval))

            if not args.img:
                continue

            motif = sample[motif_name]
            dbmotif = db[hit[0]]
            # Each step below may replace a motif by a freshly built object.
            # Remember both ids once and restore them at the end, so the
            # length-padding step no longer drops them.
            motif_id = motif.id
            dbmotif_id = dbmotif.id

            if orient == -1:
                dbmotif = dbmotif.rc()

            # Left-pad with uniform columns according to the sign of pos.
            if pos < 0:
                motif = Motif([[0.25, 0.25, 0.25, 0.25]] * -pos + motif.pwm)
            elif pos > 0:
                dbmotif = Motif([[0.25, 0.25, 0.25, 0.25]] * pos + dbmotif.pwm)

            # Right-pad the shorter motif so both have the same length.
            diff = len(motif) - len(dbmotif)
            if diff > 0:
                dbmotif = Motif(dbmotif.pwm + [[0.25, 0.25, 0.25, 0.25]] * diff)
            else:
                motif = Motif(motif.pwm + [[0.25, 0.25, 0.25, 0.25]] * -diff)

            motif.id = motif_id
            dbmotif.id = dbmotif_id
            plotdata.append((motif, dbmotif, pval))

    if args.img:
        match_plot(plotdata, args.img)
def parse(self, fo):
    """
    Convert a four-row matrix to a single motif

    Parameters
    ----------
    fo : file-like
        File object; its first four lines each hold space-separated numbers.

    Returns
    -------
    motifs : list
        List with one Motif instance whose id is ``self.name``.
    """
    rows = []
    for _ in range(4):
        fields = fo.readline().strip().split(" ")
        rows.append([float(x) for x in fields])

    # Transpose: one four-value entry per column of the input rows.
    matrix = [
        [rows[0][i], rows[1][i], rows[2][i], rows[3][i]]
        for i in range(len(rows[0]))
    ]
    motif = Motif(matrix)
    motif.id = self.name
    return [motif]
def test5_motif_to_img(self):
    """ Motif to img """
    seqlogo = which("seqlogo")
    if not seqlogo:
        # Without the external program there is nothing to check.
        print("seqlogo not found, skipping.")
        return

    out_png = "test/test.png"
    m = Motif(self.pfm)
    m.to_img(out_png, fmt="png", seqlogo=seqlogo)
    self.assertTrue(os.path.exists(out_png))
    os.unlink(out_png)
def test8_pwm_to_str(self):
    """_pwm_to_str renders the matrix with the requested precision."""
    motif = Motif([[0.01, 0.01, 0.01, 0.97], [0.123, 0.456, 0.222, 0.199]])

    # (precision, expected tab/newline separated text)
    cases = (
        (2, "0.01\t0.01\t0.01\t0.97\n0.12\t0.46\t0.22\t0.20"),
        (3, "0.010\t0.010\t0.010\t0.970\n0.123\t0.456\t0.222\t0.199"),
    )
    for precision, expected in cases:
        self.assertEqual(expected, motif._pwm_to_str(precision=precision))
def parse(self, fo):
    """
    Convert GADEM output to motifs

    Parameters
    ----------
    fo : file-like
        File object containing GADEM output.

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    nucs = {"A": 0, "C": 1, "G": 2, "T": 3}
    motifs = []
    lines = fo.readlines()

    # The input is read in records of five lines: a header and four rows.
    for start in range(0, len(lines), 5):
        m_id = lines[start].strip()[1:]
        number = m_id.split("_")[0][1:]
        seq_file = "%s.seq" % number

        align = []
        pfm = []
        if os.path.exists(seq_file):
            with open(seq_file) as handle:
                for site in handle:
                    # Lines containing "x" or "n" are ignored.
                    if "x" in site or "n" in site:
                        continue
                    site = site.strip().upper()
                    align.append(site)
                    if not pfm:
                        pfm = [[0] * 4 for _ in range(len(site))]
                    for column, letter in enumerate(site):
                        pfm[column][nucs[letter]] += 1

        fields = [
            row.strip().split(" ")[1].split("\t")
            for row in lines[start + 1:start + 5]
        ]
        # Transpose the four rows into one entry per position.
        pwm = [
            [float(fields[x][y]) for x in range(4)]
            for y in range(len(fields[0]))
        ]

        motif = Motif(pwm)
        motif.id = "{}_{}".format(self.name, m_id)
        if align:
            motif.pfm = pfm
            motif.align = align
        motifs.append(motif)
    return motifs
def parse(self, fname):
    """
    Convert RPMCMC output to motifs

    Parameters
    ----------
    fname : str
        File containing RPMCMC output.

    Returns
    -------
    motifs : list
        List of Motif instances. Empty when the file contains no matrix
        rows.
    """
    motifs = []
    pfm = []
    name = ""

    # The context manager closes the file once parsing is done.
    with open(fname) as handle:
        for line in handle:
            line = line.strip()
            if line.startswith("PFM"):
                continue
            if line.startswith("Motif"):
                # A new header closes the previous block, if it has rows.
                if pfm:
                    motif = Motif(pfm)
                    motif.id = name
                    motifs.append(motif)
                name = line
                pfm = []
            elif line != "A C G T":
                row = line.split(" ")
                if len(row) == 4:
                    pfm.append([float(x) for x in row])

    # Same guard as inside the loop: do not emit a motif without rows.
    if pfm:
        motif = Motif(pfm)
        motif.id = name
        motifs.append(motif)
    return motifs
def parse(self, fo):
    """
    Convert MEME output to motifs

    Parameters
    ----------
    fo : file-like
        File object containing MEME output.

    Returns
    -------
    motifs : list
        List of Motif instances. Motif sections without any site line
        are skipped.
    """
    motifs = []
    nucs = {"A": 0, "C": 1, "G": 2, "T": 3}

    p = re.compile(
        r"MOTIF.+MEME-(\d+)\s*width\s*=\s*(\d+)\s+sites\s*=\s*(\d+)")
    pa = re.compile(r"\)\s+([A-Z]+)")

    line = fo.readline()
    while line:
        m = p.search(line)
        if m:
            motif_id = "%s_%s_w%s" % (self.name, m.group(1), m.group(2))
            align = []
            pfm = None
            # Also stop at EOF: readline() returns "" there, so waiting only
            # for the "//" terminator would never end on truncated input.
            while line and not line.startswith("//"):
                ma = pa.search(line)
                if ma:
                    site = ma.group(1)
                    align.append(site)
                    if not pfm:
                        pfm = [[0 for _ in range(4)] for _ in range(len(site))]
                    for pos, letter in enumerate(site):
                        if letter in nucs:
                            pfm[pos][nucs[letter]] += 1
                        else:
                            # Letters other than A/C/G/T count a quarter
                            # towards each of the four columns.
                            for i in range(4):
                                pfm[pos][i] += 0.25
                line = fo.readline()

            # Without site lines there is no matrix to build a motif from.
            if pfm:
                motifs.append(Motif(pfm[:]))
                motifs[-1].id = motif_id
                motifs[-1].align = align[:]
        line = fo.readline()

    return motifs
def parse_out(self, fo):
    """
    Convert MotifSampler output to motifs

    Parameters
    ----------
    fo : file-like
        File object containing MotifSampler output.

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    nucs = {"A": 0, "C": 1, "G": 2, "T": 3}
    pseudo = 0.0  # Should be 1/sqrt(# of seqs)

    # Group the reported sites by motif id.
    sites_by_id = {}
    for line in fo.readlines():
        if line.startswith("#") or len(line) <= 1:
            continue
        columns = line.strip().split("\t")
        # The ninth column holds two ';'-separated items; the second
        # space-separated token of each, without double quotes, is used.
        m_id, site = [
            item.strip().split(" ")[1].replace('"', "")
            for item in columns[8].split(";")
            if item
        ]
        # Sites containing an N (in either case) are left out.
        if "N" not in site.upper():
            sites_by_id.setdefault(m_id, []).append(site)

    motifs = []
    for m_id, align in sites_by_id.items():
        pfm = [[0] * 4 for _ in range(len(align[0]))]
        for site in align:
            for column, letter in enumerate(site):
                pfm[column][nucs[letter]] += 1

        total = float(len(align))
        pwm = [[(x + pseudo / 4) / total + (pseudo) for x in row] for row in pfm]

        motif = Motif()
        motif.align = align[:]
        motif.pwm = pwm[:]
        motif.pfm = pfm[:]
        motif.id = m_id
        motifs.append(motif)
    return motifs
def parse(self, fo):
    """
    Convert BioProspector output to motifs

    Parameters
    ----------
    fo : file-like
        File object containing BioProspector output.

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    matrix_line = re.compile(
        r"^\d+\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)")

    # First pass: split the input into (motif id, matrix rows) blocks.
    blocks = []
    current_id = ""
    current_rows = []
    for line in fo.readlines():
        if line.startswith("Motif #"):
            blocks.append((current_id, current_rows))
            current_id = line.split("#")[1].split(":")[0]
            current_rows = []
        else:
            found = matrix_line.search(line)
            if found:
                # Values are divided by 100.
                current_rows.append(
                    [float(found.group(x)) / 100.0 for x in range(1, 5)])
    blocks.append((current_id, current_rows))

    # Second pass: every block with at least one row becomes a motif.
    motifs = []
    for motif_id, rows in blocks:
        if not rows:
            continue
        motif = Motif(rows)
        motif.id = "BioProspector_w%s_%s" % (len(motif), motif_id)
        motifs.append(motif)
    return motifs
def parse(self, fo):
    """
    Convert tool output to motifs

    Blocks start at lines that begin with "Motif" and end with ":"; motif
    ids are built from ``self.name`` and the header text before the colon.

    Parameters
    ----------
    fo : file-like
        File object containing the tool output.

    Returns
    -------
    motifs : list
        List of Motif instances. Empty when the input contains no matrix
        rows after the last header.
    """
    motifs = []

    #160: 112 CACGTGC 7.25 chr14:32308489-32308689
    p = re.compile(r'\d+\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)')
    wm = []
    name = ""
    for line in fo.readlines():
        if line.startswith("Motif") and line.strip().endswith(":"):
            if name:
                # Close the block opened by the previous header.
                motifs.append(Motif(wm))
                motifs[-1].id = name
                wm = []
            name = "%s_%s" % (self.name, line.split(":")[0])
        else:
            m = p.search(line)
            if m:
                wm.append([float(m.group(x)) for x in range(1, 5)])

    # Only emit the trailing block when it has rows; otherwise input
    # without any matrix would yield a motif built from an empty matrix.
    if wm:
        motifs.append(Motif(wm))
        motifs[-1].id = name

    return motifs
def parse(self, fo):
    """
    Convert AMD output to motifs

    Parameters
    ----------
    fo : file-like
        File object containing AMD output.

    Returns
    -------
    motifs : list
        List of Motif instances. Empty when the input contains no matrix
        rows after the last header.
    """
    motifs = []

    # 160: 112 CACGTGC 7.25 chr14:32308489-32308689
    p = re.compile(r"\d+\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)")
    wm = []
    name = ""
    for line in fo.readlines():
        is_header = line.startswith("Motif") and line.strip().endswith(":")
        if is_header:
            if name:
                # Close the block opened by the previous header.
                motifs.append(Motif(wm))
                motifs[-1].id = name
                wm = []
            name = "%s_%s" % (self.name, line.split(":")[0])
            continue
        m = p.search(line)
        if m:
            wm.append([float(m.group(x)) for x in range(1, 5)])

    # Only emit the trailing block when it has rows; otherwise input
    # without any matrix would yield a motif built from an empty matrix.
    if wm:
        motifs.append(Motif(wm))
        motifs[-1].id = name

    return motifs
def get_gimmemotif(self):
    """
    Get gimmemotif object for motif

    Reads counts from self.counts, stores the motif length in
    ``self.length`` and the resulting Motif in ``self.gimme_obj``.

    Returns
    -------
    self
        The instance itself.
    """
    from gimmemotifs.motif import Motif

    self.length = len(self.counts[0])

    # One row per motif position, holding the four values
    # self.counts[0..3][position] ( A C G T ).
    motif_rows = [
        [self.counts[letter][pos_id] for letter in range(4)]
        for pos_id in range(self.length)
    ]

    gimme_obj = Motif(motif_rows)
    self.gimme_obj = gimme_obj
    gimme_obj.id = self.id + " " + self.name
    return self
def test11_slice_motif(self):
    """Slicing a motif keeps the matching consensus and pfm rows."""
    pfm = [
        [120, 0, 0, 0],
        [120, 0, 0, 0],
        [0, 60, 60, 0],
        [0, 0, 0, 120],
        [0, 0, 0, 120],
    ]
    full = Motif(pfm)
    full.to_consensus()

    # Drop the first and the last position.
    inner = full[1:-1]

    self.assertEqual("AST", inner.consensus.upper())
    self.assertEqual(pfm[1:-1], inner.pfm)
def parse(self, fo):
    """
    Convert tool output to motifs

    The input is consumed in records of six lines; the last four lines of
    each record hold tab-separated numbers.

    Parameters
    ----------
    fo : file-like
        File object containing the tool output.

    Returns
    -------
    motifs : list
        List of Motif instances, named ``<self.name>_<n>`` and trimmed
        with ``trim(0.25)``.
    """
    def next_record():
        # readline() yields "" past EOF, so a short record is padded with "".
        return [fo.readline() for _ in range(6)]

    motifs = []
    record = next_record()
    while record[0]:
        rows = [
            [float(x) for x in text.strip().split("\t")]
            for text in record[2:]
        ]
        # Transpose the four rows into one entry per position.
        columns = [
            [rows[x][y] for x in range(4)] for y in range(len(rows[0]))
        ]
        motif = Motif(columns)
        motif.id = record[0].strip().split(" ")[-1]
        motifs.append(motif)
        record = next_record()

    # Ids are renumbered in file order, starting at 1.
    for number, motif in enumerate(motifs, start=1):
        motif.id = "%s_%s" % (self.name, number)
        motif.trim(0.25)

    return motifs
def parse(self, fo):
    """
    Convert tool output to motifs

    Lines starting with "#" carry "key = value" metadata, other non-blank
    lines are tab-separated matrix rows, and a blank line closes a motif.

    Parameters
    ----------
    fo : file-like
        File object containing the tool output.

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    motifs = []
    matrix = []
    meta = {}

    for raw in fo.readlines():
        if raw.startswith("#"):
            key_value = raw.strip()[1:].split(" = ")
            if len(key_value) >= 2:
                meta[key_value[0]] = key_value[1]
        elif len(raw) <= 1:
            # A line of at most one character closes the current matrix.
            motif = Motif()
            motif.consensus = meta["Consensus"]
            motif.width = meta["W"]
            motif.id = meta["ID"]
            motif.pwm = list(matrix)
            motifs.append(motif)
            matrix = []
        else:
            matrix.append([float(x) for x in raw.strip().split("\t")])

    return motifs
def parse(self, fo):
    """
    Convert HMS output to motifs

    Parameters
    ----------
    fo : file-like
        File object containing HMS output.

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    # The first four lines each hold one space-separated row of numbers.
    per_row = [
        [float(field) for field in fo.readline().strip().split(" ")]
        for _ in range(4)
    ]
    first, second, third, fourth = per_row

    # Transpose: one four-value entry per column of the input rows.
    matrix = []
    for i in range(len(first)):
        matrix.append([first[i], second[i], third[i], fourth[i]])

    motifs = [Motif(matrix)]
    motifs[-1].id = self.name
    return motifs
def parse(self, fo):
    """
    Convert tool output to motifs

    The input is read in records of five lines: a header followed by four
    matrix rows. If a file named ``<number>.seq`` exists (``<number>`` is
    derived from the header), its lines are used to build an alignment and
    a count matrix.

    Parameters
    ----------
    fo : file-like
        File object containing the tool output.

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    motifs = []
    nucs = {"A": 0, "C": 1, "G": 2, "T": 3}

    lines = fo.readlines()
    for i in range(0, len(lines), 5):
        align = []
        pfm = []
        motif_id = lines[i].strip()[1:]
        number = motif_id.split("_")[0][1:]
        seq_file = "%s.seq" % number
        if os.path.exists(seq_file):
            # The context manager closes the handle once it has been read.
            with open(seq_file) as handle:
                for site in handle:
                    # Lines containing "x" or "n" are ignored.
                    if "x" not in site and "n" not in site:
                        site = site.strip().upper()
                        align.append(site)
                        if not pfm:
                            pfm = [[0 for _ in range(4)] for _ in range(len(site))]
                        for pos, letter in enumerate(site):
                            pfm[pos][nucs[letter]] += 1

        m = [
            row.strip().split(" ")[1].split("\t")
            for row in lines[i + 1:i + 5]
        ]

        # Transpose the four rows into one entry per position.
        pwm = [[float(m[x][y]) for x in range(4)] for y in range(len(m[0]))]

        motifs.append(Motif(pwm))
        motifs[-1].id = motif_id
        # Counts and alignment are attached only when sites were read.
        if align:
            motifs[-1].pfm = pfm
            motifs[-1].align = align

    return motifs
def parse(self, fo):
    """
    Convert Weeder output to motifs

    Parameters
    ----------
    fo : file-like
        File object containing Weeder output.

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    nucs = {"A": 0, "C": 1, "G": 2, "T": 3}
    counts_row = re.compile(r'(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)')
    site_row = re.compile(r'(\s*\d+\s+.\s+([ACGT]+)\s+.+\))')

    motifs = []
    pwm = []
    align = []
    c = 1
    for line in fo.readlines():
        counts = counts_row.search(line)
        if counts:
            # These rows are replaced by the alignment-derived matrix below
            # before they are ever read.
            pwm.append([int(counts.group(x)) for x in (2, 3, 4, 5)])

        site = site_row.search(line)
        if site:
            align.append(site.group(2))
            continue
        if not line.startswith("===="):
            continue

        # A "====" line closes the motif collected so far.
        motif = Motif()
        motif.id = "Weeder_%s" % c
        motif.align = align[:]

        # Count matrix and frequencies are derived from the aligned sites.
        pfm = [[0] * 4 for _ in range(len(align[0]))]
        for seq in align:
            for i, letter in enumerate(seq):
                pfm[i][nucs[letter]] += 1
        total = float(len(align))
        pwm = [[x / total for x in row] for row in pfm]

        motif.pwm = pwm[:]
        motif.pfm = pfm[:]
        motifs.append(motif)

        align = []
        c += 1
        pwm = []

    return motifs
def parse(self, fo):
    """
    Convert ChIPMunk output to motifs

    Parameters
    ----------
    fo : file-like
        File object containing ChIPMunk output.

    Returns
    -------
    motifs : list
        List with one Motif instance, or an empty list when the input
        contains no line starting with "A|".
    """
    # Example of the part of the output that is parsed:
    #KDIC|6.124756232026243
    #A|517.9999999999999 42.99999999999999 345.99999999999994 25.999999999999996 602.9999999999999 155.99999999999997 2.9999999999999996 91.99999999999999
    #C|5.999999999999999 4.999999999999999 2.9999999999999996 956.9999999999999 91.99999999999999 17.999999999999996 22.999999999999996 275.99999999999994
    #G|340.99999999999994 943.9999999999999 630.9999999999999 6.999999999999999 16.999999999999996 48.99999999999999 960.9999999999999 14.999999999999998
    #T|134.99999999999997 7.999999999999999 19.999999999999996 9.999999999999998 287.99999999999994 776.9999999999999 12.999999999999998 616.9999999999999
    #N|999.9999999999998

    line = fo.readline()
    # readline() returns "" at EOF and keeps doing so; without the
    # truthiness check this loop never ends on input lacking an "A|" row.
    while line and not line.startswith("A|"):
        line = fo.readline()
    if not line:
        return []

    # The "A|" line and the three lines after it each hold one matrix row.
    matrix = []
    for _ in range(4):
        matrix.append(
            [float(x) for x in line.strip().split("|")[1].split()])
        line = fo.readline()

    # Transpose so that each entry holds the four values of one position.
    matrix = [[matrix[x][y] for x in range(4)] for y in range(len(matrix[0]))]
    m = Motif(matrix)
    m.id = "ChIPMunk_w%s" % len(m)
    return [m]
def parse(self, fo):
    """
    Convert Improbizer output to motifs

    Parameters
    ----------
    fo : file-like
        File object containing Improbizer output.

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    header = re.compile(r"\d+\s+@\s+\d+\.\d+\s+sd\s+\d+\.\d+\s+(\w+)$")
    bases = ("A", "C", "G", "T")
    motifs = []

    line = fo.readline()
    # Reading stops at EOF or at the first line containing "Color".
    while line and "Color" not in line:
        found = header.search(line)
        if found:
            # The four lines after a header each start with a label that is
            # upper-cased and used as key, followed by that row's values.
            table = {}
            for _ in range(4):
                tokens = [
                    tok.strip()
                    for tok in fo.readline().strip().split(" ")
                    if tok
                ]
                table[tokens[0].upper()] = tokens[1:]

            pwm = [
                [float(table[base][i]) for base in bases]
                for i in range(len(table["A"]))
            ]
            motif = Motif(pwm)
            motif.id = "%s_%s" % (self.name, found.group(1))
            motifs.append(motif)
        line = fo.readline()

    return motifs