예제 #1
0
def main():

	# parse the command line

	minMRatio  = None
	requireEof = True

	for arg in argv[1:]:
		if ("=" in arg):
			argVal = arg.split("=",1)[1]

		if (arg.startswith("--minmratio=")):
			minMRatio = parse_noise_rate(argVal)
			if (not (0.0 <= minMRatio <= 1.0)):
				exit("%s: mratio has to be between 0 and 1 (e.g. 0.85 or 85%%)\n%s"
				   % (os_path.basename(argv[0]),arg))
		elif (arg.startswith("--maxnoise=")):
			minMRatio = 1 - parse_noise_rate(argVal)
			if (not (0.0 <= minMRatio <= 1.0)):
				exit("%s: noise has to be between 0 and 1 (e.g. 0.15 or 15%%)\n%s"
				   % (os_path.basename(argv[0]),arg))
		elif (arg in ["--noendmark","--noeof","--nomark"]):   # (unadvertised)
			requireEof = False
		elif (arg.startswith("--")):
			usage("unrecognized option: %s" % arg)
		else:
			usage("unrecognized option: %s" % arg)

	# process the alignments

	print "\t".join(["#line","seq","strand","start","end","querybp","mRatio","nErrors","errors"])

	for a in alignments(stdin,requireEof):
		if (minMRatio != None) and (a.mRatio < minMRatio):
			continue

		errorPositions = []
		for (ix,ch) in enumerate(a.errorText):
			if (ch == "x"):
				errorPositions += [float(ix)/(len(a.errorText)-1)]

		print "\t".join([str(a.lineNumber),
		                 a.seqName,a.strand,str(a.start),str(a.end),
		                 str(a.motifBaseCount),
		                 "%.1f%%" % (100*a.mRatio),
		                 str(len(errorPositions))]
		              + map(lambda x:"%.3f"%x,errorPositions))
예제 #2
0
def main():

    # parse the command line

    minMRatio = None
    requireEof = True

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg.startswith("--minmratio=")):
            minMRatio = parse_noise_rate(argVal)
            if (not (0.0 <= minMRatio <= 1.0)):
                exit(
                    "%s: mratio has to be between 0 and 1 (e.g. 0.85 or 85%%)\n%s"
                    % (os_path.basename(argv[0]), arg))
        elif (arg.startswith("--maxnoise=")):
            minMRatio = 1 - parse_noise_rate(argVal)
            if (not (0.0 <= minMRatio <= 1.0)):
                exit(
                    "%s: noise has to be between 0 and 1 (e.g. 0.15 or 15%%)\n%s"
                    % (os_path.basename(argv[0]), arg))
        elif (arg in ["--noendmark", "--noeof", "--nomark"]):  # (unadvertised)
            requireEof = False
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        else:
            usage("unrecognized option: %s" % arg)

    # process the alignments

    for a in alignments(stdin, requireEof):
        if (minMRatio != None) and (a.mRatio < minMRatio):
            continue

        print "\t".join([
            a.seqName,
            str(a.start),
            str(a.end), ".",
            "%d" % (1000 * a.mRatio), a.strand
        ])
예제 #3
0
def main():
    global debug

    # parse the command line

    truthFilename = None
    alignmentTarget = "genome"
    detectionThresh = 0.95
    detailFilename = None
    reportOvercovered = False
    motifs = None
    debug = []

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg.startswith("--truth=")) or (arg.startswith("--catalog=")):
            if (truthFilename != None):
                usage("unrecognized option: %s" % arg)
            truthFilename = argVal
        elif (arg == "--genome"):
            alignmentTarget = "genome"
        elif (arg == "--reads"):
            alignmentTarget = "reads"
        elif (arg.startswith("--motif=")):
            if (motifs == None): motifs = Set()
            motifs.add(argVal)
        elif (arg.startswith("--detection=")):
            detectionThresh = parse_noise_rate(argVal)
            if (detectionThresh <= 0):
                usage("detection threshold must be positive (%s)" % arg)
            if (detectionThresh > 1):
                usage("detection threshold cannot be more than 100% (%s)" %
                      arg)
        elif (arg.startswith("--detail=")):
            detailFilename = argVal
        elif (arg == "--overcovered"):
            reportOvercovered = True
        elif (arg == "--debug"):
            debug += ["debug"]
        elif (arg.startswith("--debug=")):
            debug += argVal.split(",")
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        elif (truthFilename == None):
            truthFilename = arg
        else:
            usage("unrecognized option: %s" % arg)

    if (truthFilename == None):
        usage("you have to give me the truth file")

    # collect the truth

    if (alignmentTarget == "genome"):
        intervalColumns = (0, 1, 2, 3)
    else:  # if (alignmentTarget == "reads"):
        intervalColumns = (3, 4, 5, 6)

    chromOrder = []
    chromSeen = Set()

    truth = {}

    f = file(truthFilename, "rt")
    for (chrom, start, end, motif) in read_intervals(f, intervalColumns,
                                                     truthFilename):
        if (motifs != None):
            if (alignmentTarget == "genome"):
                motif = motif.split(".")[0]
            else:  # if (alignmentTarget == "reads"):
                if (motif[-1] in ["+", "-"]): motif = motif[:-1]
            if (motif not in motifs): continue
        if ("truth" in debug):
            print >> stderr, "truth: %s %d %d %s" % (chrom, start, end, motif)
        if (chrom not in chromSeen):
            chromOrder += [chrom]
            chromSeen.add(chrom)
        if (chrom not in truth): truth[chrom] = []
        truth[chrom] += [(start, end)]
    f.close()

    for chrom in truth:
        truth[chrom].sort()

    for chrom in truth:
        if ("overlap" in debug):
            print >> stderr, "on %s:" % chrom
            for (s, e) in truth[chrom]:
                print >> stderr, "  %d..%d" % (s, e)

        (prevStart, prevEnd) = truth[chrom][0]
        for (s, e) in truth[chrom][1:]:
            assert (s >= prevEnd), \
                   "on %s, truth interval %d..%d overlaps %d..%d" \
                 % (chrom,prevStart,prevEnd,s,e)
            (prevStart, prevEnd) = (s, e)

    # collect the observations
    # nb: we allow overlaps in this (but separate them), so we can check for
    #     over-covered intervals

    observed = {}
    overlaps = {}

    for (chrom, start, end, motif) in read_intervals(stdin, (2, 3, 4, 1)):
        if (motifs != None):
            if (motif not in motifs): continue
        if ("observations" in debug):
            print >> stderr, "observation: %s %d %d %s" % (chrom, start, end,
                                                           motif)
        if (chrom not in chromSeen):
            chromOrder += [chrom]
            chromSeen.add(chrom)
        if (chrom not in observed): observed[chrom] = []
        observed[chrom] += [(start, end)]

    for chrom in observed:
        (observed[chrom], overlaps[chrom]) = merge_intervals(observed[chrom])
        if ("overlap" in debug):
            for (start, end) in overlaps[chrom]:
                print >>stderr, "observation overlap: %s:%d..%d" \
                              % (chrom,start,end)
        if (not reportOvercovered):
            assert (overlaps[chrom] == []), \
                   "observations for %s contain overlaps on %s:\n%s" \
                 % (motif,chrom,
                    "\n".join(["%d..%d"%(start,end) for (start,end) in overlaps[chrom]]))

    # compute true positives and false positives for each observed interval
    #
    # See reference [1].

    pTotal = tpTotal = fpTotal = 0
    for chrom in chromOrder:
        if (chrom not in observed): continue

        truthOnChrom = None
        if (chrom in truth):
            truthOnChrom = truth[chrom]

        for (start, end) in observed[chrom]:
            length = end - start
            tp = 0
            if (truthOnChrom != None):
                tp = overlap_count(start, end, truthOnChrom)
            fp = length - tp
            pTotal += length
            tpTotal += tp
            fpTotal += fp

    # compute false negatives, and per-observed-interval true positives

    perInterval = []

    fnTotal = truthTotal = 0
    detected = 0
    for chrom in chromOrder:
        if (chrom not in truth): continue

        observedOnChrom = None
        if (chrom in observed):
            observedOnChrom = observed[chrom]

        for (start, end) in truth[chrom]:
            length = end - start
            tp = 0
            if (observedOnChrom != None):
                tp = overlap_count(start, end, observedOnChrom)
            fn = length - tp
            fnTotal += fn
            truthTotal += length
            perInterval += [(chrom, start, end, tp)]
            if (tp >= length *
                    detectionThresh):  # (tp/length >= detectionThresh)
                detected += 1

    # compute over-covered (as true positives and false positives)

    if (reportOvercovered):
        tpoTotal = fpoTotal = 0
        for chrom in chromOrder:
            if (chrom not in overlaps): continue

            truthOnChrom = None
            if (chrom in truth):
                truthOnChrom = truth[chrom]

            for (start, end) in overlaps[chrom]:
                length = end - start
                tp = 0
                if (truthOnChrom != None):
                    tp = overlap_count(start, end, truthOnChrom)
                fp = length - tp
                tpoTotal += tp
                fpoTotal += fp

    # report

    if (tpTotal + fnTotal == 0):
        print "%s\t%s/%s\tNA" \
            % ("TPR",tpTotal,tpTotal+fnTotal)
        print "%s\t%s/%s\tNA" \
            % ("FNR",fnTotal,tpTotal+fnTotal)
    else:
        print "%s\t%s/%s\t%5.3f%%" \
            % ("TPR",tpTotal,tpTotal+fnTotal,100.0*tpTotal/(tpTotal+fnTotal))
        print "%s\t%s/%s\t%5.3f%%" \
            % ("FNR",fnTotal,tpTotal+fnTotal,100.0*fnTotal/(tpTotal+fnTotal))

    if (tpTotal + fpTotal == 0):
        print "%s\t%s/%s\tNA" \
            % ("PPV",tpTotal,tpTotal+fpTotal)
        print "%s\t%s/%s\tNA" \
            % ("FDR",fpTotal,tpTotal+fpTotal)
    else:
        print "%s\t%s/%s\t%5.3f%%" \
            % ("PPV",tpTotal,tpTotal+fpTotal,100.0*tpTotal/(tpTotal+fpTotal))
        print "%s\t%s/%s\t%5.3f%%" \
            % ("FDR",fpTotal,tpTotal+fpTotal,100.0*fpTotal/(tpTotal+fpTotal))

    if (len(perInterval) == 0):
        print "%s\t%s/%s\tNA" \
            % ("DETECTED",detected,len(perInterval))
    else:
        print "%s\t%s/%s\t%5.3f%%" \
            % ("DETECTED",detected,len(perInterval),100.0*detected/len(perInterval))

    if (reportOvercovered):
        print "%s\t%s" \
            % ("TP-OVERCOVERED",tpoTotal)
        print "%s\t%s" \
            % ("FP-OVERCOVERED",fpoTotal)

    if (detailFilename != None):
        if (alignmentTarget == "genome"): chromName = "chrom"
        else: chromName = "read"

        f = file(detailFilename, "wt")
        print >>f, "#%s\t%s\t%s\t%s/%s\t%s" \
              % (chromName,"start","end","TP","TP+FN","TPR")
        for (chrom, start, end, tp) in perInterval:
            length = end - start
            print >>f, "%s\t%s\t%s\t%s/%s\t%5.1f%%" \
                  % (chrom,start,end,tp,length,100.0*tp/length)
        f.close()
예제 #4
0
def main():

    # parse the command line

    minMRatio = None
    requireEof = True

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg.startswith("--minmratio=")):
            minMRatio = parse_noise_rate(argVal)
            if (not (0.0 <= minMRatio <= 1.0)):
                exit(
                    "%s: mratio has to be between 0 and 1 (e.g. 0.85 or 85%%)\n%s"
                    % (os_path.basename(argv[0]), arg))
        elif (arg.startswith("--maxnoise=")):
            minMRatio = 1 - parse_noise_rate(argVal)
            if (not (0.0 <= minMRatio <= 1.0)):
                exit(
                    "%s: noise has to be between 0 and 1 (e.g. 0.15 or 15%%)\n%s"
                    % (os_path.basename(argv[0]), arg))
        elif (arg in ["--noendmark", "--noeof", "--nomark"]):  # (unadvertised)
            requireEof = False
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        else:
            usage("unrecognized option: %s" % arg)

    # process the alignments

    print "\t".join([
        "#line", "motif", "seq", "start", "end", "strand", "seqLen", "querybp",
        "consensus"
    ])

    userHasntBeenWarned = True
    for a in alignments(stdin, requireEof):
        seqLenStr = "NA"
        if (a.seqLen != None):
            seqLenStr = str(a.seqLen)

        if (hasattr(a, "mRatio")):
            if (minMRatio != None) and (a.mRatio < minMRatio):
                continue

        consensuses = []
        for line in a.lines:
            if (not line.startswith("# consensus")): continue
            line = line.split(None, 2)
            consensuses += [line[2]]

        if (consensuses == []):
            if (userHasntBeenWarned):
                print >>stderr, \
                                 ("WARNING: input alignments did not contain a consensus line"
                                + "\n         (ncrf_consensus_filter would create that line)")
                userHasntBeenWarned = False
            consensus = "(missing)"
        else:
            consensus = ",".join(consensuses)

        print "\t".join([
            str(a.lineNumber), a.motif, a.seqName,
            str(a.start),
            str(a.end), a.strand, seqLenStr,
            str(a.motifBaseCount), consensus
        ])
예제 #5
0
def main():

	# parse the command line

	minMRatio  = None
	requireEof = True

	for arg in argv[1:]:
		if ("=" in arg):
			argVal = arg.split("=",1)[1]

		if (arg.startswith("--minmratio=")):
			minMRatio = parse_noise_rate(argVal)
			if (not (0.0 <= minMRatio <= 1.0)):
				exit("%s: mratio has to be between 0 and 1 (e.g. 0.85 or 85%%)\n%s"
				   % (os_path.basename(argv[0]),arg))
		elif (arg.startswith("--maxnoise=")):
			minMRatio = 1 - parse_noise_rate(argVal)
			if (not (0.0 <= minMRatio <= 1.0)):
				exit("%s: noise has to be between 0 and 1 (e.g. 0.15 or 15%%)\n%s"
				   % (os_path.basename(argv[0]),arg))
		elif (arg in ["--noendmark","--noeof","--nomark"]):   # (unadvertised)
			requireEof = False
		elif (arg.startswith("--")):
			usage("unrecognized option: %s" % arg)
		else:
			usage("unrecognized option: %s" % arg)

	# process the alignments

	print "\t".join(["#line","motif",
	                 "seq","start","end","strand",
	                 "seqLen","querybp",
	                 "mRatio","m","mm","i","d"])

	userHasntBeenWarned = True
	for a in alignments(stdin,requireEof):
		seqLenStr      = "NA"
		mRatioStr      = "NA"
		nMatchStr      = "NA"
		nMismatchStr   = "NA"
		nInsertionsStr = "NA"
		nDeletionsStr  = "NA"

		if (a.seqLen != None):
			seqLenStr = str(a.seqLen)

		if (hasattr(a,"mRatio")):
			if (minMRatio != None) and (a.mRatio < minMRatio):
				continue
			mRatioStr = "%.1f%%" % (100*a.mRatio)

		if (hasattr(a,"nMatch")):
			nMatchStr = str(a.nMatch)

		if (hasattr(a,"nMismatch")):
			nMismatchStr = str(a.nMismatch)

		if (hasattr(a,"nInsertions")):
			nInsertionsStr = str(a.nInsertions)

		if (hasattr(a,"nDeletions")):
			nDeletionsStr = str(a.nDeletions)

		if (mRatioStr == "NA"):
			if (userHasntBeenWarned):
				print >>stderr, \
                     ("WARNING: input alignments did not contain an event stats line"
                    + "\n         (NCRF --stats=events would create that line)")
				userHasntBeenWarned = False

		print "\t".join([str(a.lineNumber),
		                 a.motif,
		                 a.seqName,str(a.start),str(a.end),a.strand,
		                 seqLenStr,str(a.motifBaseCount),
		                 mRatioStr,
		                 nMatchStr,nMismatchStr,nInsertionsStr,nDeletionsStr])
예제 #6
0
def main():
	global debug

	# parse the command line

	maxMRatio    = 0.85
	minColumns   = 10
	headLimit    = None
	reportClumps = False
	requireEof   = True
	debug        = []

	for arg in argv[1:]:
		if ("=" in arg):
			argVal = arg.split("=",1)[1]

		if (arg.startswith("--maxMRatio=")):
			maxMRatio = parse_noise_rate(argVal)
			if (not (0.0 <= minMRatio <= 1.0)):
				exit("%s: mratio has to be between 0 and 1 (e.g. 0.85 or 85%%)\n%s"
				   % (os_path.basename(argv[0]),arg))
		elif (arg.startswith("--minnoise=")):
			maxMRatio = 1 - parse_noise_rate(argVal)
			if (not (0.0 <= minMRatio <= 1.0)):
				exit("%s: noise has to be between 0 and 1 (e.g. 0.15 or 15%%)\n%s"
				   % (os_path.basename(argv[0]),arg))
		elif (arg.startswith("--mincolumns=")) or (arg.startswith("--mindenom=")):
			minColumns =int(argVal)
			if (minColumns < 2):
				usage("minimum length has to be at least two columns\n%s" % arg)
		elif (arg.startswith("--head=")):
			headLimit = int_with_unit(argVal)
		elif (arg == "--report:clumps") or (arg == "--report=clumps"):
			reportClumps = True
		elif (arg in ["--noendmark","--noeof","--nomark"]):   # (unadvertised)
			requireEof = False
		elif (arg == "--debug"):
			debug += ["debug"]
		elif (arg.startswith("--debug=")):
			debug += argVal.split(",")
		elif (arg.startswith("--")):
			usage("unrecognized option: %s" % arg)
		else:
			usage("unrecognized option: %s" % arg)

	# process the alignments

	alignmentNum = 0
	for a in alignments(stdin,requireEof):
		alignmentNum +=1 

		if (headLimit != None) and (alignmentNum > headLimit):
			print >>stderr, "limit of %d alignments reached" % headLimit
			break

		if (a.errorText == None):
			exit("%s: alignment at line %d doesn't include error text"
			   % (os_path.basename(argv[0]),a.lineNumber))

		if ("detail" in debug):
			print >>stderr, "\nlooking for clumps in %s %c %u-%u" \
			              % (a.seqName,a.strand,a.start,a.end)

		clumps = find_clumps(a.errorText,1-maxMRatio,minColumns,
		                     positiveCh='x',negativeCh='=')

		clumpText = ["-"] * len(a.errorText)
		for (start,end) in clumps:
			for ix in xrange(start,end): clumpText[ix] = "*"
		clumpText = "".join(clumpText)

		prefixLen = 1 + a.lines[0].find(" =")
		if (prefixLen < 0):
			prefixLen = 1 + a.lines[0].find(" x")

		if (alignmentNum > 1): print
		a.lines.insert(3,"# %-*s%s" % (prefixLen-2,"noise clumps",clumpText))
		print a

		if (reportClumps):
			for (start,end) in clumps:
				errorCount = matchCount = 0
				for ch in a.errorText[start:end]:
					if   (ch == 'x'): errorCount += 1
					elif (ch == '='): matchCount += 1
				print >>stderr, "line %d (%d,%d) m=%s x=%s mRatio: %.2f%%" \
							  % (a.lineNumber,
							     start,end,matchCount,errorCount,
								 (100.0*matchCount)/(matchCount+errorCount))

	if (requireEof):
		print "# ncrf end-of-file"