Exemplo n.º 1
0
def loadGLAM2SCAN(infile, outfile):
    '''parse mast file and load into database.

    Parse several motif runs and add them to the same
    table.
    '''
    tmpfile = tempfile.NamedTemporaryFile(delete=False)
    tmpfile.write(
        "motif\tid\tnmatches\tscore\tscores\tncontrols\tmax_controls\n")

    lines = iotools.open_file(infile).readlines()
    chunks = [x for x in range(len(lines)) if lines[x].startswith("::")]
    chunks.append(len(lines))

    for chunk in range(len(chunks) - 1):

        # use real file, as parser can not deal with a
        # list of lines

        try:
            motif = re.match(":: motif = (\S+) ::",
                             lines[chunks[chunk]]).groups()[0]
        except AttributeError:
            raise ValueError("parsing error in line '%s'" %
                             lines[chunks[chunk]])

        if chunks[chunk] + 1 == chunks[chunk + 1]:
            L.warn("no results for motif %s - ignored" % motif)
            continue

        tmpfile2 = tempfile.NamedTemporaryFile(delete=False)
        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()
        glam = Glam2Scan.parse(iotools.open_file(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        # collect control data
        full_matches = collections.defaultdict(list)
        controls = collections.defaultdict(list)
        for match in glam.matches:
            m = match.id.split("_")
            track, id = m[:2]
            if len(m) == 2:
                full_matches[id].append(match)
            else:
                controls[id].append(match.score)

        for id, matches in full_matches.items():

            nmatches = len(matches)
            scores = [x.score for x in matches]
            score = max(scores)
            # move to genomic coordinates
            # contig, start, end = re.match( "(\S+):(\d+)..(\d+)", match.id).groups()
            # start, end = int(start), int(end)
            # match.start += start
            # match.end += start
            contig = ""

            if id not in controls:
                P.warn("no controls for %s - increase evalue?" % id)

            c = controls[id]
            if len(c) == 0:
                mmax = ""
            else:
                mmax = max(c)

            tmpfile.write("\t".join(
                map(str, (motif, id, nmatches, score,
                          ",".join(map(str, scores)), len(c), mmax))) + "\n")

    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           options="--add-index=id "
           "--add-index=motif "
           "--add-index=id,motif "
           "--allow-empty-file "
           "--map=base_qualities:text")

    os.unlink(tmpfile.name)
Exemplo n.º 2
0
def loadMAST(infile, outfile):
    '''parse mast file and load into database.

    Parse several motif runs and add them to the same
    table.

    Add columns for the control data as well.
    '''

    tablename = P.to_table(outfile)

    tmpfile = P.get_temp_file(".")

    tmpfile.write(MAST.Match().header + "\tmotif\tcontig"
                  "\tl_evalue\tl_pvalue\tl_nmatches\tl_length\tl_start\tl_end"
                  "\tr_evalue\tr_pvalue\tr_nmatches\tr_length\tr_start\tr_end"
                  "\tmin_evalue\tmin_pvalue\tmax_nmatches" + "\n")

    lines = iotools.open_file(infile).readlines()
    chunks = [x for x in range(len(lines)) if lines[x].startswith("::")]
    chunks.append(len(lines))

    def readChunk(lines, chunk):
        # use real file, as MAST parser can not deal with a
        # list of lines
        tmpfile2 = P.get_temp_file(".")
        try:
            motif, part = re.match(":: motif = (\S+) - (\S+) ::",
                                   lines[chunks[chunk]]).groups()
        except AttributeError:
            raise ValueError("parsing error in line '%s'" %
                             lines[chunks[chunk]])

        E.info("reading %s - %s" % (motif, part))

        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()

        mast = MAST.parse(iotools.open_file(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        return motif, part, mast

    def splitId(s, mode):
        '''split background match id

        has three parts: track _ id _ pos

        track might contain '_'.
        '''
        d = match.id.split("_")
        if mode == "bg":
            return "_".join(d[:-2]), d[-2], d[-1]
        elif mode == "fg":
            return "_".join(d[:-1]), d[-1]

    for chunk in range(0, len(chunks) - 1, 2):

        motif_fg, part, mast_fg = readChunk(lines, chunk)
        assert part == "foreground"
        motif_bg, part, mast_bg = readChunk(lines, chunk + 1)
        assert part == "background"
        assert motif_fg == motif_bg

        # index control data
        controls = collections.defaultdict(dict)
        for match in mast_bg.matches:
            track, id, pos = splitId(match.id, "bg")
            controls[id][pos] = (match.evalue, match.pvalue, match.nmotifs,
                                 match.length, match.start, match.end)

        for match in mast_fg.matches:
            # remove track and pos
            track, match.id = splitId(match.id, "fg")
            # move to genomic coordinates
            contig, start, end = re.match("(\S+):(\d+)..(\d+)",
                                          match.description).groups()
            if match.nmotifs > 0:
                start, end = int(start), int(end)
                match.start += start
                match.end += start
                match.positions = [x + start for x in match.positions]

            id = match.id
            if id not in controls:
                P.warn("no controls for %s - increase MAST evalue" % id)

            if "l" not in controls[id]:
                controls[id]["l"] = (float(P.get_params()["mast_evalue"]), 1,
                                     0, 0, 0, 0)
            if "r" not in controls[id]:
                controls[id]["r"] = (float(P.get_params()["mast_evalue"]), 1,
                                     0, 0, 0, 0)

            min_evalue = min(controls[id]["l"][0], controls[id]["r"][0])
            min_pvalue = min(controls[id]["l"][1], controls[id]["r"][1])
            max_nmatches = max(controls[id]["l"][2], controls[id]["r"][2])

            tmpfile.write(
                str(match) + "\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                    motif_fg,
                    contig,
                    "\t".join(map(str, controls[id]["l"])),
                    "\t".join(map(str, controls[id]["r"])),
                    str(min_evalue),
                    str(min_pvalue),
                    str(max_nmatches),
                ) + "\n")

    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           options="--add-index=id "
           "--add-index=motif "
           "--add-index=id,motif "
           "--allow-empty-file "
           "--map=base_qualities:text")

    os.unlink(tmpfile.name)
Exemplo n.º 3
0
def printTracks(infile, outfile):
    P.warn("\n\n\n\nprinting tracks:")
    for track in EXPERIMENTS:
        print("\t")
        print(track)