def loadGLAM2SCAN(infile, outfile):
    '''parse GLAM2SCAN output and load a per-sequence summary into
    the database.

    `infile` may contain the results of several motif runs. Each run
    starts with a header line of the form ``:: motif = NAME ::``. All
    runs are added to the same table, one row per motif and sequence
    with the columns motif, id, nmatches, score (best match score),
    scores (comma-separated), ncontrols and max_controls.

    Match ids are split on ``_``. Ids with exactly two fields count as
    matches of the sequence named by the second field; ids with more
    fields contribute their score to the controls of that sequence.

    Arguments
    ---------
    infile : string
        Filename of the concatenated GLAM2SCAN output.
    outfile : string
        Filename of the load log. The table name is `outfile` without
        its trailing ``.load`` suffix.

    Raises
    ------
    P.PipelineError
        If a section header does not match ``:: motif = NAME ::``.
    '''

    # NOTE(review): P.run() is called without arguments below. It
    # presumably reads ``statement`` and the values for %(tablename)s,
    # %(tmpfilename)s and %(outfile)s from this function's local
    # variables, so these names must remain locals here - confirm.
    tablename = outfile[:-len(".load")]

    # read the complete input and make sure the handle is released
    inf = IOTools.openFile(infile)
    try:
        lines = inf.readlines()
    finally:
        inf.close()

    # indices of the section headers plus an end-of-file sentinel
    boundaries = [index for index, line in enumerate(lines)
                  if line.startswith("::")]
    boundaries.append(len(lines))

    # open in text mode: the default binary mode rejects str on python 3
    tmpfile = tempfile.NamedTemporaryFile(mode="w", delete=False)
    tmpfilename = tmpfile.name

    try:
        tmpfile.write(
            "motif\tid\tnmatches\tscore\tscores\tncontrols\tmax_controls\n")

        for first, last in zip(boundaries[:-1], boundaries[1:]):

            header = re.match(r":: motif = (\S+) ::", lines[first])
            if header is None:
                raise P.PipelineError(
                    "parsing error in line '%s'" % lines[first])
            motif = header.groups()[0]

            if first + 1 == last:
                L.warn("no results for motif %s - ignored" % motif)
                continue

            # use real file, as parser can not deal with a
            # list of lines
            section = tempfile.NamedTemporaryFile(mode="w", delete=False)
            try:
                section.write("".join(lines[first + 1:last]))
                section.close()
                glam = Glam2Scan.parse(IOTools.openFile(section.name, "r"))
            finally:
                # remove the scratch file even if writing/parsing failed
                section.close()
                os.unlink(section.name)

            # collect matches and control scores per sequence
            full_matches = collections.defaultdict(list)
            controls = collections.defaultdict(list)
            for match in glam.matches:
                parts = match.id.split("_")
                _track, seq_id = parts[:2]
                if len(parts) == 2:
                    full_matches[seq_id].append(match)
                else:
                    controls[seq_id].append(match.score)

            # NOTE: match coordinates are reported as parsed, they are
            # not converted to genomic coordinates.
            for seq_id, matches in full_matches.items():
                scores = [x.score for x in matches]

                if seq_id not in controls:
                    P.warn("no controls for %s - increase evalue?"
                           % seq_id)

                control_scores = controls[seq_id]
                # empty field if there are no controls for this sequence
                if len(control_scores) == 0:
                    max_control = ""
                else:
                    max_control = max(control_scores)

                tmpfile.write("\t".join(map(str, (
                    motif,
                    seq_id,
                    len(matches),
                    max(scores),
                    ",".join(map(str, scores)),
                    len(control_scores),
                    max_control))) + "\n")

        # flush to disk before the loader reads the file
        tmpfile.close()

        # NOTE(review): scriptsdir and csv2db_options are not defined in
        # this function; presumably supplied by the pipeline
        # configuration - confirm. No base_qualities column is written
        # above, so the --map option looks like a leftover - confirm.
        statement = '''
        python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
        -b sqlite \
        --index=id \
        --index=motif \
        --index=id,motif \
        --table=%(tablename)s \
        --map=base_qualities:text \
        < %(tmpfilename)s > %(outfile)s
        '''

        P.run()
    finally:
        # do not leave the temporary table behind, even on failure
        tmpfile.close()
        os.unlink(tmpfilename)
def loadGLAM2SCAN(infile, outfile):
    '''parse GLAM2SCAN output and load a per-sequence summary into
    the database.

    `infile` may contain the results of several motif runs. Each run
    starts with a header line of the form ``:: motif = NAME ::``. All
    runs are added to the same table, one row per motif and sequence
    with the columns motif, id, nmatches, score (best match score),
    scores (comma-separated), ncontrols and max_controls.

    Match ids are split on ``_``. Ids with exactly two fields count as
    matches of the sequence named by the second field; ids with more
    fields contribute their score to the controls of that sequence.

    Arguments
    ---------
    infile : string
        Filename of the concatenated GLAM2SCAN output.
    outfile : string
        Output filename passed on to P.load().

    Raises
    ------
    ValueError
        If a section header does not match ``:: motif = NAME ::``.
    '''

    # read the complete input and make sure the handle is released
    inf = IOTools.openFile(infile)
    try:
        lines = inf.readlines()
    finally:
        inf.close()

    # indices of the section headers plus an end-of-file sentinel
    boundaries = [index for index, line in enumerate(lines)
                  if line.startswith("::")]
    boundaries.append(len(lines))

    # open in text mode: the default binary mode rejects str on python 3
    tmpfile = tempfile.NamedTemporaryFile(mode="w", delete=False)

    try:
        tmpfile.write(
            "motif\tid\tnmatches\tscore\tscores\tncontrols\tmax_controls\n")

        for first, last in zip(boundaries[:-1], boundaries[1:]):

            header = re.match(r":: motif = (\S+) ::", lines[first])
            if header is None:
                raise ValueError(
                    "parsing error in line '%s'" % lines[first])
            motif = header.groups()[0]

            if first + 1 == last:
                L.warn("no results for motif %s - ignored" % motif)
                continue

            # use real file, as parser can not deal with a
            # list of lines
            section = tempfile.NamedTemporaryFile(mode="w", delete=False)
            try:
                section.write("".join(lines[first + 1:last]))
                section.close()
                glam = Glam2Scan.parse(IOTools.openFile(section.name, "r"))
            finally:
                # remove the scratch file even if writing/parsing failed
                section.close()
                os.unlink(section.name)

            # collect matches and control scores per sequence
            full_matches = collections.defaultdict(list)
            controls = collections.defaultdict(list)
            for match in glam.matches:
                parts = match.id.split("_")
                _track, seq_id = parts[:2]
                if len(parts) == 2:
                    full_matches[seq_id].append(match)
                else:
                    controls[seq_id].append(match.score)

            # NOTE: match coordinates are reported as parsed, they are
            # not converted to genomic coordinates.
            for seq_id, matches in full_matches.items():
                scores = [x.score for x in matches]

                if seq_id not in controls:
                    P.warn("no controls for %s - increase evalue?"
                           % seq_id)

                control_scores = controls[seq_id]
                # empty field if there are no controls for this sequence
                if len(control_scores) == 0:
                    max_control = ""
                else:
                    max_control = max(control_scores)

                tmpfile.write("\t".join(map(str, (
                    motif,
                    seq_id,
                    len(matches),
                    max(scores),
                    ",".join(map(str, scores)),
                    len(control_scores),
                    max_control))) + "\n")

        # flush to disk before the loader reads the file
        tmpfile.close()

        # NOTE(review): no base_qualities column is written above, so
        # the --map option looks like a leftover - confirm.
        P.load(tmpfile.name,
               outfile,
               options="--add-index=id "
               "--add-index=motif "
               "--add-index=id,motif "
               "--allow-empty-file "
               "--map=base_qualities:text")
    finally:
        # do not leave the temporary table behind, even on failure
        tmpfile.close()
        os.unlink(tmpfile.name)