def main(): # parse the command line minMRatio = None requireEof = True for arg in argv[1:]: if ("=" in arg): argVal = arg.split("=",1)[1] if (arg.startswith("--minmratio=")): minMRatio = parse_noise_rate(argVal) if (not (0.0 <= minMRatio <= 1.0)): exit("%s: mratio has to be between 0 and 1 (e.g. 0.85 or 85%%)\n%s" % (os_path.basename(argv[0]),arg)) elif (arg.startswith("--maxnoise=")): minMRatio = 1 - parse_noise_rate(argVal) if (not (0.0 <= minMRatio <= 1.0)): exit("%s: noise has to be between 0 and 1 (e.g. 0.15 or 15%%)\n%s" % (os_path.basename(argv[0]),arg)) elif (arg in ["--noendmark","--noeof","--nomark"]): # (unadvertised) requireEof = False elif (arg.startswith("--")): usage("unrecognized option: %s" % arg) else: usage("unrecognized option: %s" % arg) # process the alignments print "\t".join(["#line","seq","strand","start","end","querybp","mRatio","nErrors","errors"]) for a in alignments(stdin,requireEof): if (minMRatio != None) and (a.mRatio < minMRatio): continue errorPositions = [] for (ix,ch) in enumerate(a.errorText): if (ch == "x"): errorPositions += [float(ix)/(len(a.errorText)-1)] print "\t".join([str(a.lineNumber), a.seqName,a.strand,str(a.start),str(a.end), str(a.motifBaseCount), "%.1f%%" % (100*a.mRatio), str(len(errorPositions))] + map(lambda x:"%.3f"%x,errorPositions))
def main(): # parse the command line minMRatio = None requireEof = True for arg in argv[1:]: if ("=" in arg): argVal = arg.split("=", 1)[1] if (arg.startswith("--minmratio=")): minMRatio = parse_noise_rate(argVal) if (not (0.0 <= minMRatio <= 1.0)): exit( "%s: mratio has to be between 0 and 1 (e.g. 0.85 or 85%%)\n%s" % (os_path.basename(argv[0]), arg)) elif (arg.startswith("--maxnoise=")): minMRatio = 1 - parse_noise_rate(argVal) if (not (0.0 <= minMRatio <= 1.0)): exit( "%s: noise has to be between 0 and 1 (e.g. 0.15 or 15%%)\n%s" % (os_path.basename(argv[0]), arg)) elif (arg in ["--noendmark", "--noeof", "--nomark"]): # (unadvertised) requireEof = False elif (arg.startswith("--")): usage("unrecognized option: %s" % arg) else: usage("unrecognized option: %s" % arg) # process the alignments for a in alignments(stdin, requireEof): if (minMRatio != None) and (a.mRatio < minMRatio): continue print "\t".join([ a.seqName, str(a.start), str(a.end), ".", "%d" % (1000 * a.mRatio), a.strand ])
def main(): global debug # parse the command line truthFilename = None alignmentTarget = "genome" detectionThresh = 0.95 detailFilename = None reportOvercovered = False motifs = None debug = [] for arg in argv[1:]: if ("=" in arg): argVal = arg.split("=", 1)[1] if (arg.startswith("--truth=")) or (arg.startswith("--catalog=")): if (truthFilename != None): usage("unrecognized option: %s" % arg) truthFilename = argVal elif (arg == "--genome"): alignmentTarget = "genome" elif (arg == "--reads"): alignmentTarget = "reads" elif (arg.startswith("--motif=")): if (motifs == None): motifs = Set() motifs.add(argVal) elif (arg.startswith("--detection=")): detectionThresh = parse_noise_rate(argVal) if (detectionThresh <= 0): usage("detection threshold must be positive (%s)" % arg) if (detectionThresh > 1): usage("detection threshold cannot be more than 100% (%s)" % arg) elif (arg.startswith("--detail=")): detailFilename = argVal elif (arg == "--overcovered"): reportOvercovered = True elif (arg == "--debug"): debug += ["debug"] elif (arg.startswith("--debug=")): debug += argVal.split(",") elif (arg.startswith("--")): usage("unrecognized option: %s" % arg) elif (truthFilename == None): truthFilename = arg else: usage("unrecognized option: %s" % arg) if (truthFilename == None): usage("you have to give me the truth file") # collect the truth if (alignmentTarget == "genome"): intervalColumns = (0, 1, 2, 3) else: # if (alignmentTarget == "reads"): intervalColumns = (3, 4, 5, 6) chromOrder = [] chromSeen = Set() truth = {} f = file(truthFilename, "rt") for (chrom, start, end, motif) in read_intervals(f, intervalColumns, truthFilename): if (motifs != None): if (alignmentTarget == "genome"): motif = motif.split(".")[0] else: # if (alignmentTarget == "reads"): if (motif[-1] in ["+", "-"]): motif = motif[:-1] if (motif not in motifs): continue if ("truth" in debug): print >> stderr, "truth: %s %d %d %s" % (chrom, start, end, motif) if (chrom not in chromSeen): chromOrder += [chrom] chromSeen.add(chrom) if (chrom not in truth): truth[chrom] = [] truth[chrom] += [(start, end)] f.close() for chrom in truth: truth[chrom].sort() for chrom in truth: if ("overlap" in debug): print >> stderr, "on %s:" % chrom for (s, e) in truth[chrom]: print >> stderr, " %d..%d" % (s, e) (prevStart, prevEnd) = truth[chrom][0] for (s, e) in truth[chrom][1:]: assert (s >= prevEnd), \ "on %s, truth interval %d..%d overlaps %d..%d" \ % (chrom,prevStart,prevEnd,s,e) (prevStart, prevEnd) = (s, e) # collect the observations # nb: we allow overlaps in this (but separate them), so we can check for # over-covered intervals observed = {} overlaps = {} for (chrom, start, end, motif) in read_intervals(stdin, (2, 3, 4, 1)): if (motifs != None): if (motif not in motifs): continue if ("observations" in debug): print >> stderr, "observation: %s %d %d %s" % (chrom, start, end, motif) if (chrom not in chromSeen): chromOrder += [chrom] chromSeen.add(chrom) if (chrom not in observed): observed[chrom] = [] observed[chrom] += [(start, end)] for chrom in observed: (observed[chrom], overlaps[chrom]) = merge_intervals(observed[chrom]) if ("overlap" in debug): for (start, end) in overlaps[chrom]: print >>stderr, "observation overlap: %s:%d..%d" \ % (chrom,start,end) if (not reportOvercovered): assert (overlaps[chrom] == []), \ "observations for %s contain overlaps on %s:\n%s" \ % (motif,chrom, "\n".join(["%d..%d"%(start,end) for (start,end) in overlaps[chrom]])) # compute true positives and false positives for each observed interval # # See reference [1]. pTotal = tpTotal = fpTotal = 0 for chrom in chromOrder: if (chrom not in observed): continue truthOnChrom = None if (chrom in truth): truthOnChrom = truth[chrom] for (start, end) in observed[chrom]: length = end - start tp = 0 if (truthOnChrom != None): tp = overlap_count(start, end, truthOnChrom) fp = length - tp pTotal += length tpTotal += tp fpTotal += fp # compute false negatives, and per-observed-interval true positives perInterval = [] fnTotal = truthTotal = 0 detected = 0 for chrom in chromOrder: if (chrom not in truth): continue observedOnChrom = None if (chrom in observed): observedOnChrom = observed[chrom] for (start, end) in truth[chrom]: length = end - start tp = 0 if (observedOnChrom != None): tp = overlap_count(start, end, observedOnChrom) fn = length - tp fnTotal += fn truthTotal += length perInterval += [(chrom, start, end, tp)] if (tp >= length * detectionThresh): # (tp/length >= detectionThresh) detected += 1 # compute over-covered (as true positives and false positives) if (reportOvercovered): tpoTotal = fpoTotal = 0 for chrom in chromOrder: if (chrom not in overlaps): continue truthOnChrom = None if (chrom in truth): truthOnChrom = truth[chrom] for (start, end) in overlaps[chrom]: length = end - start tp = 0 if (truthOnChrom != None): tp = overlap_count(start, end, truthOnChrom) fp = length - tp tpoTotal += tp fpoTotal += fp # report if (tpTotal + fnTotal == 0): print "%s\t%s/%s\tNA" \ % ("TPR",tpTotal,tpTotal+fnTotal) print "%s\t%s/%s\tNA" \ % ("FNR",fnTotal,tpTotal+fnTotal) else: print "%s\t%s/%s\t%5.3f%%" \ % ("TPR",tpTotal,tpTotal+fnTotal,100.0*tpTotal/(tpTotal+fnTotal)) print "%s\t%s/%s\t%5.3f%%" \ % ("FNR",fnTotal,tpTotal+fnTotal,100.0*fnTotal/(tpTotal+fnTotal)) if (tpTotal + fpTotal == 0): print "%s\t%s/%s\tNA" \ % ("PPV",tpTotal,tpTotal+fpTotal) print "%s\t%s/%s\tNA" \ % ("FDR",fpTotal,tpTotal+fpTotal) else: print "%s\t%s/%s\t%5.3f%%" \ % ("PPV",tpTotal,tpTotal+fpTotal,100.0*tpTotal/(tpTotal+fpTotal)) print "%s\t%s/%s\t%5.3f%%" \ % ("FDR",fpTotal,tpTotal+fpTotal,100.0*fpTotal/(tpTotal+fpTotal)) if (len(perInterval) == 0): print "%s\t%s/%s\tNA" \ % ("DETECTED",detected,len(perInterval)) else: print "%s\t%s/%s\t%5.3f%%" \ % ("DETECTED",detected,len(perInterval),100.0*detected/len(perInterval)) if (reportOvercovered): print "%s\t%s" \ % ("TP-OVERCOVERED",tpoTotal) print "%s\t%s" \ % ("FP-OVERCOVERED",fpoTotal) if (detailFilename != None): if (alignmentTarget == "genome"): chromName = "chrom" else: chromName = "read" f = file(detailFilename, "wt") print >>f, "#%s\t%s\t%s\t%s/%s\t%s" \ % (chromName,"start","end","TP","TP+FN","TPR") for (chrom, start, end, tp) in perInterval: length = end - start print >>f, "%s\t%s\t%s\t%s/%s\t%5.1f%%" \ % (chrom,start,end,tp,length,100.0*tp/length) f.close()
def main(): # parse the command line minMRatio = None requireEof = True for arg in argv[1:]: if ("=" in arg): argVal = arg.split("=", 1)[1] if (arg.startswith("--minmratio=")): minMRatio = parse_noise_rate(argVal) if (not (0.0 <= minMRatio <= 1.0)): exit( "%s: mratio has to be between 0 and 1 (e.g. 0.85 or 85%%)\n%s" % (os_path.basename(argv[0]), arg)) elif (arg.startswith("--maxnoise=")): minMRatio = 1 - parse_noise_rate(argVal) if (not (0.0 <= minMRatio <= 1.0)): exit( "%s: noise has to be between 0 and 1 (e.g. 0.15 or 15%%)\n%s" % (os_path.basename(argv[0]), arg)) elif (arg in ["--noendmark", "--noeof", "--nomark"]): # (unadvertised) requireEof = False elif (arg.startswith("--")): usage("unrecognized option: %s" % arg) else: usage("unrecognized option: %s" % arg) # process the alignments print "\t".join([ "#line", "motif", "seq", "start", "end", "strand", "seqLen", "querybp", "consensus" ]) userHasntBeenWarned = True for a in alignments(stdin, requireEof): seqLenStr = "NA" if (a.seqLen != None): seqLenStr = str(a.seqLen) if (hasattr(a, "mRatio")): if (minMRatio != None) and (a.mRatio < minMRatio): continue consensuses = [] for line in a.lines: if (not line.startswith("# consensus")): continue line = line.split(None, 2) consensuses += [line[2]] if (consensuses == []): if (userHasntBeenWarned): print >>stderr, \ ("WARNING: input alignments did not contain a consensus line" + "\n (ncrf_consensus_filter would create that line)") userHasntBeenWarned = False consensus = "(missing)" else: consensus = ",".join(consensuses) print "\t".join([ str(a.lineNumber), a.motif, a.seqName, str(a.start), str(a.end), a.strand, seqLenStr, str(a.motifBaseCount), consensus ])
def main(): # parse the command line minMRatio = None requireEof = True for arg in argv[1:]: if ("=" in arg): argVal = arg.split("=",1)[1] if (arg.startswith("--minmratio=")): minMRatio = parse_noise_rate(argVal) if (not (0.0 <= minMRatio <= 1.0)): exit("%s: mratio has to be between 0 and 1 (e.g. 0.85 or 85%%)\n%s" % (os_path.basename(argv[0]),arg)) elif (arg.startswith("--maxnoise=")): minMRatio = 1 - parse_noise_rate(argVal) if (not (0.0 <= minMRatio <= 1.0)): exit("%s: noise has to be between 0 and 1 (e.g. 0.15 or 15%%)\n%s" % (os_path.basename(argv[0]),arg)) elif (arg in ["--noendmark","--noeof","--nomark"]): # (unadvertised) requireEof = False elif (arg.startswith("--")): usage("unrecognized option: %s" % arg) else: usage("unrecognized option: %s" % arg) # process the alignments print "\t".join(["#line","motif", "seq","start","end","strand", "seqLen","querybp", "mRatio","m","mm","i","d"]) userHasntBeenWarned = True for a in alignments(stdin,requireEof): seqLenStr = "NA" mRatioStr = "NA" nMatchStr = "NA" nMismatchStr = "NA" nInsertionsStr = "NA" nDeletionsStr = "NA" if (a.seqLen != None): seqLenStr = str(a.seqLen) if (hasattr(a,"mRatio")): if (minMRatio != None) and (a.mRatio < minMRatio): continue mRatioStr = "%.1f%%" % (100*a.mRatio) if (hasattr(a,"nMatch")): nMatchStr = str(a.nMatch) if (hasattr(a,"nMismatch")): nMismatchStr = str(a.nMismatch) if (hasattr(a,"nInsertions")): nInsertionsStr = str(a.nInsertions) if (hasattr(a,"nDeletions")): nDeletionsStr = str(a.nDeletions) if (mRatioStr == "NA"): if (userHasntBeenWarned): print >>stderr, \ ("WARNING: input alignments did not contain an event stats line" + "\n (NCRF --stats=events would create that line)") userHasntBeenWarned = False print "\t".join([str(a.lineNumber), a.motif, a.seqName,str(a.start),str(a.end),a.strand, seqLenStr,str(a.motifBaseCount), mRatioStr, nMatchStr,nMismatchStr,nInsertionsStr,nDeletionsStr])
def main(): global debug # parse the command line maxMRatio = 0.85 minColumns = 10 headLimit = None reportClumps = False requireEof = True debug = [] for arg in argv[1:]: if ("=" in arg): argVal = arg.split("=",1)[1] if (arg.startswith("--maxMRatio=")): maxMRatio = parse_noise_rate(argVal) if (not (0.0 <= minMRatio <= 1.0)): exit("%s: mratio has to be between 0 and 1 (e.g. 0.85 or 85%%)\n%s" % (os_path.basename(argv[0]),arg)) elif (arg.startswith("--minnoise=")): maxMRatio = 1 - parse_noise_rate(argVal) if (not (0.0 <= minMRatio <= 1.0)): exit("%s: noise has to be between 0 and 1 (e.g. 0.15 or 15%%)\n%s" % (os_path.basename(argv[0]),arg)) elif (arg.startswith("--mincolumns=")) or (arg.startswith("--mindenom=")): minColumns =int(argVal) if (minColumns < 2): usage("minimum length has to be at least two columns\n%s" % arg) elif (arg.startswith("--head=")): headLimit = int_with_unit(argVal) elif (arg == "--report:clumps") or (arg == "--report=clumps"): reportClumps = True elif (arg in ["--noendmark","--noeof","--nomark"]): # (unadvertised) requireEof = False elif (arg == "--debug"): debug += ["debug"] elif (arg.startswith("--debug=")): debug += argVal.split(",") elif (arg.startswith("--")): usage("unrecognized option: %s" % arg) else: usage("unrecognized option: %s" % arg) # process the alignments alignmentNum = 0 for a in alignments(stdin,requireEof): alignmentNum +=1 if (headLimit != None) and (alignmentNum > headLimit): print >>stderr, "limit of %d alignments reached" % headLimit break if (a.errorText == None): exit("%s: alignment at line %d doesn't include error text" % (os_path.basename(argv[0]),a.lineNumber)) if ("detail" in debug): print >>stderr, "\nlooking for clumps in %s %c %u-%u" \ % (a.seqName,a.strand,a.start,a.end) clumps = find_clumps(a.errorText,1-maxMRatio,minColumns, positiveCh='x',negativeCh='=') clumpText = ["-"] * len(a.errorText) for (start,end) in clumps: for ix in xrange(start,end): clumpText[ix] = "*" clumpText = "".join(clumpText) prefixLen = 1 + a.lines[0].find(" =") if (prefixLen < 0): prefixLen = 1 + a.lines[0].find(" x") if (alignmentNum > 1): print a.lines.insert(3,"# %-*s%s" % (prefixLen-2,"noise clumps",clumpText)) print a if (reportClumps): for (start,end) in clumps: errorCount = matchCount = 0 for ch in a.errorText[start:end]: if (ch == 'x'): errorCount += 1 elif (ch == '='): matchCount += 1 print >>stderr, "line %d (%d,%d) m=%s x=%s mRatio: %.2f%%" \ % (a.lineNumber, start,end,matchCount,errorCount, (100.0*matchCount)/(matchCount+errorCount)) if (requireEof): print "# ncrf end-of-file"