def readChunk(lines, chunk):
    '''parse a single MAST section out of a list of lines.

    ``chunks`` is not a parameter: it is looked up in the surrounding
    scope. ``chunks[chunk]`` is used as the index of the section's
    ``:: motif = ... ::`` header line and ``chunks[chunk + 1]`` as the
    index one past the section's last line.

    Arguments
    ---------
    lines : list
        All lines of the input file.
    chunk : int
        Index into ``chunks`` selecting the section to read.

    Returns
    -------
    tuple
        ``(motif, part, mast)`` - the two fields captured from the
        header line and the object returned by ``MAST.parse``.

    Raises
    ------
    P.PipelineError
        If the header line does not match the expected pattern.
    '''
    first = chunks[chunk]
    last = chunks[chunk + 1]
    header = lines[first]

    # Validate the header before creating the temporary file so that a
    # malformed section does not leave a stray file behind.
    found = re.match(r":: motif = (\S+) - (\S+) ::", header)
    if found is None:
        raise P.PipelineError("parsing error in line '%s'" % header)
    motif, part = found.groups()

    E.info("reading %s - %s" % (motif, part))

    # use real file, as MAST parser can not deal with a
    # list of lines
    tmpfile2 = P.getTempFile(".")
    try:
        try:
            tmpfile2.write("".join(lines[first + 1:last]))
        finally:
            tmpfile2.close()
        # NOTE(review): the handle given to MAST.parse is not closed
        # explicitly - confirm that parse reads it eagerly before
        # wrapping it in a context manager.
        mast = MAST.parse(IOTools.openFile(tmpfile2.name, "r"))
    finally:
        # remove the temporary file on success and on failure
        os.unlink(tmpfile2.name)

    return motif, part, mast
def readChunk(lines, chunk):
    '''parse a single MAST section out of a list of lines.

    ``chunks`` is not a parameter: it is looked up in the surrounding
    scope. ``chunks[chunk]`` is used as the index of the section's
    ``:: motif = ... ::`` header line and ``chunks[chunk + 1]`` as the
    index one past the section's last line.

    Arguments
    ---------
    lines : list
        All lines of the input file.
    chunk : int
        Index into ``chunks`` selecting the section to read.

    Returns
    -------
    tuple
        ``(motif, part, mast)`` - the two fields captured from the
        header line and the object returned by ``MAST.parse``.

    Raises
    ------
    ValueError
        If the header line does not match the expected pattern.
    '''
    first = chunks[chunk]
    last = chunks[chunk + 1]
    header = lines[first]

    # Validate the header before creating the temporary file so that a
    # malformed section does not leave a stray file behind.
    found = re.match(r":: motif = (\S+) - (\S+) ::", header)
    if found is None:
        raise ValueError("parsing error in line '%s'" % header)
    motif, part = found.groups()

    E.info("reading %s - %s" % (motif, part))

    # use real file, as MAST parser can not deal with a
    # list of lines
    tmpfile2 = P.get_temp_file(".")
    try:
        try:
            tmpfile2.write("".join(lines[first + 1:last]))
        finally:
            tmpfile2.close()
        # NOTE(review): the handle given to MAST.parse is not closed
        # explicitly - confirm that parse reads it eagerly before
        # wrapping it in a context manager.
        mast = MAST.parse(IOTools.open_file(tmpfile2.name, "r"))
    finally:
        # remove the temporary file on success and on failure
        os.unlink(tmpfile2.name)

    return motif, part, mast
def loadMAST(infile, outfile):
    '''parse mast file and load into database.

    Parse several motif runs and add them to the same
    table.

    Add columns for the control data as well.

    The input is split into sections at lines starting with ``::``.
    Sections are consumed in pairs: a ``foreground`` section followed
    by a ``background`` section for the same motif. Each foreground
    match is written as one row, extended by the motif name, the
    contig, the control values stored under positions ``l`` and ``r``
    for the same id, and the minimum E-value, minimum P-value and
    maximum match count across those two controls.

    Arguments
    ---------
    infile : string
        Filename of the concatenated MAST output.
    outfile : string
        Filename passed on to ``P.to_table`` and ``P.load``.

    Raises
    ------
    ValueError
        If a section header line cannot be parsed.
    AssertionError
        If sections do not alternate foreground/background or a pair
        of sections refers to different motifs.
    '''
    # NOTE(review): the value is not used below; the call is retained
    # in case the helper validates `outfile` - confirm before removing.
    tablename = P.to_table(outfile)

    inf = IOTools.open_file(infile)
    try:
        lines = inf.readlines()
    finally:
        inf.close()

    # indices of all section header lines, plus an end-of-file sentinel
    # so that chunks[i + 1] is always the end of section i.
    chunks = [i for i, line in enumerate(lines) if line.startswith("::")]
    chunks.append(len(lines))

    def readChunk(lines, chunk):
        '''return (motif, part, mast) for section number `chunk`.'''
        first, last = chunks[chunk], chunks[chunk + 1]
        header = lines[first]

        # validate before creating the temporary file so a malformed
        # header does not leave a stray file behind.
        found = re.match(r":: motif = (\S+) - (\S+) ::", header)
        if found is None:
            raise ValueError("parsing error in line '%s'" % header)
        motif, part = found.groups()

        E.info("reading %s - %s" % (motif, part))

        # use real file, as MAST parser can not deal with a
        # list of lines
        tmpfile2 = P.get_temp_file(".")
        try:
            try:
                tmpfile2.write("".join(lines[first + 1:last]))
            finally:
                tmpfile2.close()
            mast = MAST.parse(IOTools.open_file(tmpfile2.name, "r"))
        finally:
            os.unlink(tmpfile2.name)

        return motif, part, mast

    def splitId(s, mode):
        '''split a match id on "_".

        In mode "bg" the id has three parts, track _ id _ pos, and
        (track, id, pos) is returned. In mode "fg" the id has two
        parts and (track, id) is returned. The track might contain
        '_', so the trailing fields are split off from the right.
        '''
        d = s.split("_")
        if mode == "bg":
            return "_".join(d[:-2]), d[-2], d[-1]
        elif mode == "fg":
            return "_".join(d[:-1]), d[-1]
        raise ValueError("unknown mode '%s'" % mode)

    header = (MAST.Match().header +
              "\tmotif\tcontig"
              "\tl_evalue\tl_pvalue\tl_nmatches\tl_length\tl_start\tl_end"
              "\tr_evalue\tr_pvalue\tr_nmatches\tr_length\tr_start\tr_end"
              "\tmin_evalue\tmin_pvalue\tmax_nmatches" + "\n")

    tmpfile = P.get_temp_file(".")
    try:
        try:
            tmpfile.write(header)

            for chunk in range(0, len(chunks) - 1, 2):
                motif_fg, part, mast_fg = readChunk(lines, chunk)
                assert part == "foreground", \
                    "expected foreground section, got '%s'" % part
                motif_bg, part, mast_bg = readChunk(lines, chunk + 1)
                assert part == "background", \
                    "expected background section, got '%s'" % part
                assert motif_fg == motif_bg, \
                    "motif mismatch: '%s' != '%s'" % (motif_fg, motif_bg)

                # index control data by id, then by position
                controls = collections.defaultdict(dict)
                for match in mast_bg.matches:
                    track, control_id, pos = splitId(match.id, "bg")
                    controls[control_id][pos] = (
                        match.evalue,
                        match.pvalue,
                        match.nmotifs,
                        match.length,
                        match.start,
                        match.end)

                for match in mast_fg.matches:
                    # remove track and pos
                    track, match.id = splitId(match.id, "fg")

                    # move to genomic coordinates
                    # NOTE(review): the two dots are unescaped and so
                    # match any two characters; kept as in the
                    # original pattern.
                    contig, start, end = re.match(
                        r"(\S+):(\d+)..(\d+)", match.description).groups()
                    if match.nmotifs > 0:
                        start, end = int(start), int(end)
                        match.start += start
                        match.end += start
                        match.positions = [
                            x + start for x in match.positions]

                    match_id = match.id
                    if match_id not in controls:
                        P.warn(
                            "no controls for %s - increase MAST evalue" %
                            match_id)

                    # fill in missing controls with a placeholder:
                    # the E-value threshold, P-value 1 and no matches
                    entry = controls[match_id]
                    for side in ("l", "r"):
                        if side not in entry:
                            entry[side] = (
                                float(PARAMS["mast_evalue"]), 1, 0, 0, 0, 0)

                    left, right = entry["l"], entry["r"]
                    min_evalue = min(left[0], right[0])
                    min_pvalue = min(left[1], right[1])
                    max_nmatches = max(left[2], right[2])

                    tmpfile.write(
                        str(match) + "\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                            motif_fg,
                            contig,
                            "\t".join(map(str, left)),
                            "\t".join(map(str, right)),
                            str(min_evalue),
                            str(min_pvalue),
                            str(max_nmatches),
                        ) + "\n")
        finally:
            tmpfile.close()

        P.load(tmpfile.name,
               outfile,
               options="--add-index=id "
               "--add-index=motif "
               "--add-index=id,motif "
               "--allow-empty-file "
               "--map=base_qualities:text")
    finally:
        # remove the temporary table file on success and on failure
        os.unlink(tmpfile.name)