def parse(handle, format):
    """Parse an output file of a motif finding program.

    The format name is matched case-insensitively. Supported formats:
     - AlignAce:      AlignAce output file format
     - MEME:          MEME output file motif
     - MAST:          MAST output file motif
     - TRANSFAC:      TRANSFAC database file format
     - pfm:           JASPAR-style position-frequency matrix
     - jaspar:        JASPAR-style multiple PFM format
     - sites:         JASPAR-style sites file

    As files in the pfm and sites formats contain only a single motif,
    it is easier to use Bio.motifs.read() instead of Bio.motifs.parse()
    for those.

    The return value is whatever the ``read`` function of the selected
    Bio.motifs submodule returns for ``handle``. A ValueError is raised
    if the format name is not one of those listed above.

    For example:

    >>> from Bio import motifs
    >>> for m in motifs.parse(open("Motif/alignace.out"), "AlignAce"):
    ...     print(m.consensus)
    TCTACGATTGAG
    CTGCAGCTAGCTACGAGTGAG
    GTGCTCTAAGCATAGTAGGCG
    GCCACTAGCAGAGCAGGGGGC
    CGACTCAGAGGTT
    CCACGCTAAGAGAGGTGCCGGAG
    GCGCGTCGCTGAGCA
    GTCCATCGCAAAGCGTGGGGC
    GGGATCAGAGGGCCG
    TGGAGGCGGGG
    GACCAGAGCTTCGCATGGGGG
    GGCGTGCGTG
    GCTGGTTGCTGTTCATTAGG
    GCCGGCGGCAGCTAAAAGGG
    GAGGCCGGGGAT
    CGACTCGTGCTTAGAAGG
    """
    fmt = format.lower()

    # Each parser submodule is imported only when its format is requested.
    if fmt == "alignace":
        from Bio.motifs import alignace
        return alignace.read(handle)

    if fmt == "meme":
        from Bio.motifs import meme
        return meme.read(handle)

    if fmt == "mast":
        from Bio.motifs import mast
        return mast.read(handle)

    if fmt == "transfac":
        from Bio.motifs import transfac
        return transfac.read(handle)

    if fmt in ('pfm', 'sites', 'jaspar'):
        # The JASPAR reader needs to know which of its flavours to expect.
        from Bio.motifs import jaspar
        return jaspar.read(handle, fmt)

    raise ValueError("Unknown format %s" % fmt)
def transfac2ic(args):
    """Compute information content for each filter motif (.transfac).

    Reads the following attributes of ``args``:
     - train:    path of a ``.npy`` file with the training samples, used
                 to estimate the background nucleotide frequencies
     - in_file:  path of the TRANSFAC file holding the filter motifs
     - out_file: path of the tab-separated output file

    For every motif one line ``<ID>\\t<mean IC>`` is appended to
    ``args.out_file``; an existing file is extended, not overwritten.
    The output file is opened once for the whole run, so it is created
    even if the input contains no motifs.
    """
    # Background nucleotide frequencies: average over axis 1, then axis 0.
    # NOTE(review): assumes train_samples is (n_samples, seq_len, 4) with the
    # last axis ordered A, C, G, T -- confirm against the data producer.
    train_samples = np.load(args.train, mmap_mode='r')
    probs = np.mean(np.mean(train_samples, axis=1), axis=0)
    bg = {'A': probs[0], 'C': probs[1], 'G': probs[2], 'T': probs[3]}

    # Create the output directory; a bare file name means the current one.
    # exist_ok avoids the race between an existence check and the creation.
    out_dir = os.path.dirname(args.out_file) or "."
    os.makedirs(out_dir, exist_ok=True)

    # Load all filter motifs.
    with open(args.in_file) as handle:
        records = transfac.read(handle)

    # Open the output once instead of re-opening it for every motif.
    with open(args.out_file, "a") as out_handle:
        for m in records:
            pwm = m.counts.normalize(pseudocounts=bg)
            pwm.background = bg
            ic = compute_mean_ic(pwm)
            out_handle.write(m.get("ID") + "\t" + str(ic) + "\n")
def parse(handle, format):
    """Parse an output file of a motif finding program.

    Currently supported formats (case is ignored):
     - AlignAce:      AlignAce output file format
     - MEME:          MEME output file motif
     - MAST:          MAST output file motif
     - TRANSFAC:      TRANSFAC database file format
     - pfm:           JASPAR-style position-frequency matrix
     - jaspar:        JASPAR-style multiple PFM format
     - sites:         JASPAR-style sites file

    As files in the pfm and sites formats contain only a single motif,
    it is easier to use Bio.motifs.read() instead of Bio.motifs.parse()
    for those.

    Returns the result of the ``read`` function of the Bio.motifs
    submodule that handles the requested format. Raises ValueError if
    the format name is not recognised.

    For example:

    >>> from Bio import motifs
    >>> with open("Motif/alignace.out") as handle:
    ...     for m in motifs.parse(handle, "AlignAce"):
    ...         print(m.consensus)
    ...
    TCTACGATTGAG
    CTGCAGCTAGCTACGAGTGAG
    GTGCTCTAAGCATAGTAGGCG
    GCCACTAGCAGAGCAGGGGGC
    CGACTCAGAGGTT
    CCACGCTAAGAGAGGTGCCGGAG
    GCGCGTCGCTGAGCA
    GTCCATCGCAAAGCGTGGGGC
    GGGATCAGAGGGCCG
    TGGAGGCGGGG
    GACCAGAGCTTCGCATGGGGG
    GGCGTGCGTG
    GCTGGTTGCTGTTCATTAGG
    GCCGGCGGCAGCTAAAAGGG
    GAGGCCGGGGAT
    CGACTCGTGCTTAGAAGG
    """
    format = format.lower()
    # The parser submodules are imported lazily, only for the chosen format.
    if format == "alignace":
        from Bio.motifs import alignace
        return alignace.read(handle)
    elif format == "meme":
        from Bio.motifs import meme
        return meme.read(handle)
    elif format == "mast":
        from Bio.motifs import mast
        return mast.read(handle)
    elif format == "transfac":
        from Bio.motifs import transfac
        return transfac.read(handle)
    elif format in ('pfm', 'sites', 'jaspar'):
        # One reader serves all JASPAR flavours; it is told which one to use.
        from Bio.motifs import jaspar
        return jaspar.read(handle, format)
    else:
        raise ValueError("Unknown format %s" % format)
def parse(handle, format, strict=True):
    """Parse an output file from a motif finding program.

    Currently supported formats (case is ignored):
     - AlignAce:         AlignAce output file format
     - ClusterBuster:    Cluster Buster position frequency matrix format
     - XMS:              XMS matrix format
     - MEME:             MEME output file motif
     - MINIMAL:          MINIMAL MEME output file motif
     - MAST:             MAST output file motif
     - TRANSFAC:         TRANSFAC database file format
     - pfm-four-columns: Generic position-frequency matrix format with four
                         columns. (cisbp, homer, hocomoco, neph, tiffin)
     - pfm-four-rows:    Generic position-frequency matrix format with four
                         rows. (scertf, yetfasco, hdpi, idmmpmm, flyfactor
                         survey)
     - pfm:              JASPAR-style position-frequency matrix
     - jaspar:           JASPAR-style multiple PFM format
     - sites:            JASPAR-style sites file

    As files in the pfm and sites formats contain only a single motif,
    it is easier to use Bio.motifs.read() instead of Bio.motifs.parse()
    for those.

    For example:

    >>> from Bio import motifs
    >>> with open("motifs/alignace.out") as handle:
    ...     for m in motifs.parse(handle, "AlignAce"):
    ...         print(m.consensus)
    ...
    TCTACGATTGAG
    CTGCACCTAGCTACGAGTGAG
    GTGCCCTAAGCATACTAGGCG
    GCCACTAGCAGAGCAGGGGGC
    CGACTCAGAGGTT
    CCACGCTAAGAGAAGTGCCGGAG
    GCACGTCCCTGAGCA
    GTCCATCGCAAAGCGTGGGGC
    GAGATCAGAGGGCCG
    TGGACGCGGGG
    GACCAGAGCCTCGCATGGGGG
    AGCGCGCGTG
    GCCGGTTGCTGTTCATTAGG
    ACCGACGGCAGCTAAAAGGG
    GACGCCGGGGAT
    CGACTCGCGCTTACAAGG

    If strict is True (default), the parser will raise a ValueError if the
    file contents do not strictly comply with the specified file format.
    Note that strict is only passed on to the TRANSFAC reader; the readers
    of all other formats are called without it.

    A ValueError is also raised if the format name is not recognised.
    """
    fmt = format.lower()

    # Readers that only need the handle.
    if fmt == "alignace":
        from Bio.motifs import alignace
        return alignace.read(handle)
    if fmt == "meme":
        from Bio.motifs import meme
        return meme.read(handle)
    if fmt == "minimal":
        from Bio.motifs import minimal
        return minimal.read(handle)
    if fmt == "clusterbuster":
        from Bio.motifs import clusterbuster
        return clusterbuster.read(handle)
    if fmt == "xms":
        from Bio.motifs import xms
        return xms.read(handle)
    if fmt == "mast":
        from Bio.motifs import mast
        return mast.read(handle)

    # Readers that serve several formats and are told which one to expect.
    if fmt in ("pfm-four-columns", "pfm-four-rows"):
        from Bio.motifs import pfm
        return pfm.read(handle, fmt)
    if fmt in ("pfm", "sites", "jaspar"):
        from Bio.motifs import jaspar
        return jaspar.read(handle, fmt)

    # The only reader that receives the strict flag.
    if fmt == "transfac":
        from Bio.motifs import transfac
        return transfac.read(handle, strict)

    raise ValueError("Unknown format %s" % fmt)
def parse(handle, format, strict=True):
    """Parse an output file from a motif finding program.

    The format name is matched case-insensitively. Supported formats:
     - AlignAce:      AlignAce output file format
     - MEME:          MEME output file motif
     - MINIMAL:       MINIMAL MEME output file motif
     - MAST:          MAST output file motif
     - TRANSFAC:      TRANSFAC database file format
     - pfm:           JASPAR-style position-frequency matrix
     - jaspar:        JASPAR-style multiple PFM format
     - sites:         JASPAR-style sites file

    As files in the pfm and sites formats contain only a single motif,
    it is easier to use Bio.motifs.read() instead of Bio.motifs.parse()
    for those.

    For example:

    >>> from Bio import motifs
    >>> with open("Motif/alignace.out") as handle:
    ...     for m in motifs.parse(handle, "AlignAce"):
    ...         print(m.consensus)
    ...
    TCTACGATTGAG
    CTGCAGCTAGCTACGAGTGAG
    GTGCTCTAAGCATAGTAGGCG
    GCCACTAGCAGAGCAGGGGGC
    CGACTCAGAGGTT
    CCACGCTAAGAGAGGTGCCGGAG
    GCGCGTCGCTGAGCA
    GTCCATCGCAAAGCGTGGGGC
    GGGATCAGAGGGCCG
    TGGAGGCGGGG
    GACCAGAGCTTCGCATGGGGG
    GGCGTGCGTG
    GCTGGTTGCTGTTCATTAGG
    GCCGGCGGCAGCTAAAAGGG
    GAGGCCGGGGAT
    CGACTCGTGCTTAGAAGG

    If strict is True (default), the parser will raise a ValueError if the
    file contents do not strictly comply with the specified file format.
    Only the TRANSFAC reader is handed the strict flag.

    A ValueError is also raised for an unrecognised format name.
    """
    fmt = format.lower()

    # Parser submodules are imported on demand, one per requested format.
    if fmt == "alignace":
        from Bio.motifs import alignace
        return alignace.read(handle)

    if fmt == "meme":
        from Bio.motifs import meme
        return meme.read(handle)

    if fmt == "minimal":
        from Bio.motifs import minimal
        return minimal.read(handle)

    if fmt == "mast":
        from Bio.motifs import mast
        return mast.read(handle)

    if fmt == "transfac":
        from Bio.motifs import transfac
        return transfac.read(handle, strict)

    if fmt in ('pfm', 'sites', 'jaspar'):
        # The JASPAR reader is told which of its flavours to expect.
        from Bio.motifs import jaspar
        return jaspar.read(handle, fmt)

    raise ValueError("Unknown format %s" % fmt)
def parse(handle, format, strict=True):
    """Parse an output file from a motif finding program.

    Currently supported formats (case is ignored):
     - AlignAce:         AlignAce output file format
     - ClusterBuster:    Cluster Buster position frequency matrix format
     - XMS:              XMS matrix format
     - MEME:             MEME output file motif
     - MINIMAL:          MINIMAL MEME output file motif
     - MAST:             MAST output file motif
     - TRANSFAC:         TRANSFAC database file format
     - pfm-four-columns: Generic position-frequency matrix format with four
                         columns. (cisbp, homer, hocomoco, neph, tiffin)
     - pfm-four-rows:    Generic position-frequency matrix format with four
                         rows. (scertf, yetfasco, hdpi, idmmpmm, flyfactor
                         survey)
     - pfm:              JASPAR-style position-frequency matrix
     - jaspar:           JASPAR-style multiple PFM format
     - sites:            JASPAR-style sites file

    As files in the pfm and sites formats contain only a single motif,
    it is easier to use Bio.motifs.read() instead of Bio.motifs.parse()
    for those.

    For example:

    >>> from Bio import motifs
    >>> with open("motifs/alignace.out") as handle:
    ...     for m in motifs.parse(handle, "AlignAce"):
    ...         print(m.consensus)
    ...
    TCTACGATTGAG
    CTGCACCTAGCTACGAGTGAG
    GTGCCCTAAGCATACTAGGCG
    GCCACTAGCAGAGCAGGGGGC
    CGACTCAGAGGTT
    CCACGCTAAGAGAAGTGCCGGAG
    GCACGTCCCTGAGCA
    GTCCATCGCAAAGCGTGGGGC
    GAGATCAGAGGGCCG
    TGGACGCGGGG
    GACCAGAGCCTCGCATGGGGG
    AGCGCGCGTG
    GCCGGTTGCTGTTCATTAGG
    ACCGACGGCAGCTAAAAGGG
    GACGCCGGGGAT
    CGACTCGCGCTTACAAGG

    If strict is True (default), the parser will raise a ValueError if the
    file contents do not strictly comply with the specified file format.
    The flag is forwarded to the TRANSFAC reader only.

    An unrecognised format name raises a ValueError.
    """
    name = format.lower()

    # Multi-flavour readers: the normalised format name is forwarded so the
    # reader knows which layout to expect.
    if name in ('pfm', 'sites', 'jaspar'):
        from Bio.motifs import jaspar
        return jaspar.read(handle, name)

    if name in ('pfm-four-columns', 'pfm-four-rows'):
        from Bio.motifs import pfm
        return pfm.read(handle, name)

    # TRANSFAC is the one reader that takes the strict flag.
    if name == "transfac":
        from Bio.motifs import transfac
        return transfac.read(handle, strict)

    # Remaining readers take nothing but the handle.
    if name == "alignace":
        from Bio.motifs import alignace
        return alignace.read(handle)

    if name == "meme":
        from Bio.motifs import meme
        return meme.read(handle)

    if name == "minimal":
        from Bio.motifs import minimal
        return minimal.read(handle)

    if name == "mast":
        from Bio.motifs import mast
        return mast.read(handle)

    if name == "clusterbuster":
        from Bio.motifs import clusterbuster
        return clusterbuster.read(handle)

    if name == "xms":
        from Bio.motifs import xms
        return xms.read(handle)

    raise ValueError("Unknown format %s" % name)
def motif_compare(args):
    """Compare PSSMs of filter motifs.

    Reads the following attributes of ``args``:
     - out_dir:     directory the result table is written to
     - train_data:  path of a ``.npy`` file with the training samples, used
                    to estimate the background nucleotide frequencies
     - in_file1:    TRANSFAC file with the first set of motifs
     - in_file2:    TRANSFAC file with the second set of motifs
     - extensively: if true, compare every motif of the first file with
                    every motif of the second one; otherwise only motifs
                    sharing the same ID are compared
     - rc:          if true, also compare against the reverse complement of
                    the second motif and report the orientation with the
                    smaller p-value
     - shift:       if true, ``min_overlap`` is passed to
                    get_motif_similarity as the required overlap; otherwise
                    the length of the first motif is used
     - min_overlap: overlap used when ``shift`` is true

    Writes a tab-separated table with one row per compared motif pair,
    holding correlation, p-value, offset and orientation for the Pearson
    and the Spearman measure, to
    ``<out_dir>/correlation_motifs[_extensively][_rc][_shift_min_overlap=N].txt``.
    """
    # exist_ok avoids the race between an existence check and the creation.
    os.makedirs(args.out_dir, exist_ok=True)

    # Load training data to determine background nucleotide content.
    # NOTE(review): assumes train_samples is (n_samples, seq_len, 4) with the
    # last axis ordered A, C, G, T -- confirm against the data producer.
    train_samples = np.load(args.train_data, mmap_mode='r')
    probs = np.mean(np.mean(train_samples, axis=1), axis=0)
    bg = {'A': probs[0], 'C': probs[1], 'G': probs[2], 'T': probs[3]}

    # Load all filter motifs from both files.
    with open(args.in_file1) as handle:
        records1 = transfac.read(handle)
    with open(args.in_file2) as handle:
        records2 = transfac.read(handle)

    # Convert motifs to PSSMs, keyed by motif ID.
    pssms1 = {}
    for m1 in records1:
        pwm1 = m1.counts.normalize(pseudocounts=bg)
        pssms1[m1.get("ID")] = pwm1.log_odds(background=bg)

    pssms2 = {}
    rc_pssms2 = {}
    for m2 in records2:
        motif_id = m2.get("ID")
        pwm2 = m2.counts.normalize(pseudocounts=bg)
        pssm2 = pwm2.log_odds(background=bg)
        pssms2[motif_id] = pssm2
        if args.rc:
            # Must use the same key as pssms2: the comparison loop below
            # looks the reverse complement up by motif ID.
            rc_pssms2[motif_id] = pssm2.reverse_complement()

    # Compare motifs.
    result_table = []
    for id1, pssm1 in pssms1.items():
        min_overlap = args.min_overlap if args.shift else pssm1.length
        for id2, pssm2 in pssms2.items():
            if not (args.extensively or id1 == id2):
                continue
            row = [id1, id2]
            for measure in (pearsonr, spearmanr):
                cor, p_value, offset = get_motif_similarity(measure, pssm1, pssm2, min_overlap)
                orientation = "+"
                if args.rc:
                    cor_rc, p_value_rc, offset_rc = get_motif_similarity(
                        measure, pssm1, rc_pssms2[id2], min_overlap)
                    # Keep whichever orientation has the smaller p-value.
                    if p_value > p_value_rc:
                        cor, p_value, offset, orientation = cor_rc, p_value_rc, offset_rc, "-"
                row.extend([cor, p_value, offset, orientation])
            result_table.append(row)

    # Build the output file name as one expression so ".txt" is really appended.
    suffix = ("_extensively" if args.extensively else "") \
        + ("_rc" if args.rc else "") \
        + ("_shift_min_overlap=" + str(args.min_overlap) if args.shift else "")
    out_file_name = os.path.join(args.out_dir, "correlation_motifs" + suffix + ".txt")

    # newline='' lets the csv module control line endings itself.
    with open(out_file_name, 'w', newline='') as csv_file:
        file_writer = csv.writer(csv_file, delimiter="\t")
        file_writer.writerow(["ID1", "ID2",
                              "cor_pearson", "p_value_pearson", "offset_pearson", "orientation_pearson",
                              "cor_spearman", "p_value_spearman", "offset_spearman", "orientation_spearman"])
        file_writer.writerows(result_table)