Exemplo n.º 1
0
def main():

    # parse the command line

    headLimit = None
    requireEof = True

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg.startswith("--head=")):
            headLimit = int_with_unit(argVal)
        elif (arg in ["--noendmark", "--noeof", "--nomark"]):  # (unadvertised)
            requireEof = False
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        else:
            usage("unrecognized option: %s" % arg)

    # process the alignments

    alignmentNum = 0
    for a in alignments(stdin, requireEof):
        alignmentNum += 1

        if (headLimit != None) and (alignmentNum > headLimit):
            print >> stderr, "limit of %d alignments reached" % headLimit
            break

        positionalStats = a.positional_stats()

        numPositions = len(positionalStats)
        vec = [None] * (2 * numPositions + 1)
        vec[0] = a.lineNumber

        for (pos, stats) in enumerate(positionalStats):
            if ("m" not in stats):
                raise ValueError, \
                      "\"m\" missing from positional information for alignment at line %d" \
                    % a.lineNumber
            if ("x" not in stats):
                raise ValueError, \
                      "\"x\" missing from positional information for alignment at line %d" \
                    % a.lineNumber

            vec[1 + pos] = stats["m"]
            vec[1 + numPositions + pos] = stats["x"]

        print "\t".join(map(str, vec))
Exemplo n.º 2
0
def main():

    # parse the command line

    reportProgress = None

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg.startswith("--progress=")):
            reportProgress = int_with_unit(argVal)
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        else:
            usage("unrecognized option: %s" % arg)

    # process the fasta sequences

    lengthToCount = {}

    inputCount = inputBp = 0
    for (seqLen) in read_fasta_lengths(stdin):
        inputCount += 1
        inputBp += seqLen

        if (reportProgress != None):
            if (inputCount % reportProgress == 0):
                print >>stderr, "%s sequences read (%s nts, avg=%s)" \
                              % (commatize(inputCount),commatize(inputBp),
                                 commatize(int(round(float(inputBp)/inputCount))))

        if (seqLen not in lengthToCount): lengthToCount[seqLen] = 1
        else: lengthToCount[seqLen] += 1

    # report the distribution

    lengths = [length for length in lengthToCount]
    lengths.sort()

    print "\n".join(
        ["%d\t%d" % (length, lengthToCount[length]) for length in lengths])
Exemplo n.º 3
0
def main():
    global debug

    # parse the command line

    countRatio = 1
    headLimit = None
    requireEof = True
    debug = []

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg.startswith("--minwordratio=")) or (
                arg.startswith("--ratio=")) or (arg.startswith("R=")):
            countRatio = float_or_fraction(argVal)
        elif (arg.startswith("--head=")):
            headLimit = int_with_unit(argVal)
        elif (arg in ["--noendmark", "--noeof", "--nomark"]):  # (unadvertised)
            requireEof = False
        elif (arg == "--debug"):
            debug += ["debug"]
        elif (arg.startswith("--debug=")):
            debug += argVal.split(",")
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        else:
            usage("unrecognized option: %s" % arg)

    # process the alignments

    alignmentNum = 0
    for a in alignments(stdin, requireEof):
        alignmentNum += 1

        if (headLimit != None) and (alignmentNum > headLimit):
            print >> stderr, "limit of %d alignments reached" % headLimit
            break

        if (alignmentNum > 1): print
        print "\n".join(a.lines)

        motifText = a.motifText
        seqText = a.seqText
        if ("noflip" in debug):
            pass
        elif (a.strand == "-") and (a.start < a.end):
            # alignment was reported in reverse complement of motif, so flip it
            motifText = reverse_complement(motifText)
            seqText = reverse_complement(seqText)

        (motifChunks, seqChunks) = chunkify(a.motif, motifText, seqText)

        wordCounts = Counter()
        for word in seqChunks:
            word = word.replace("-", "")
            if (word != a.motif): word = word.lower()
            wordCounts[word] += 1

        if (a.motif in wordCounts): motifCount = wordCounts[a.motif]
        else: motifCount = 0

        wordCounts = [(wordCounts[word], abs(len(word) - len(a.motif)), word)
                      for word in wordCounts
                      if (wordCounts[word] >= motifCount * countRatio)]
        wordCounts.sort()
        wordCounts.reverse()
        print "# aligned words %s" % \
              " ".join(["%s:%d"%(word,count) for (count,_,word) in wordCounts])

        if ("chunks" in debug):
            if ("noflip" in debug):
                seqChunks = [
                    reverse_complement(word) for word in seqChunks[::-1]
                ]
                motifChunks = [
                    reverse_complement(word) for word in motifChunks[::-1]
                ]
            print "# words: %s" % " ".join(seqChunks)
            print "# motif: %s" % " ".join(motifChunks)

    if (requireEof):
        print "# ncrf end-of-file"
Exemplo n.º 4
0
def main():

	# parse the command line

	arraysFilename   = None
	motifs          = []
	sequenceName    = None
	sequenceLen     = 0
	numRepeats      = None
	genNeighbors    = 0.0
	genMixture      = 0.0
	lengthsFilename = None
	minFill         = None
	errorProfile    = None
	catalogFilename = None
	wrapLength      = 100

	for arg in argv[1:]:
		if ("=" in arg):
			argVal = arg.split("=",1)[1]

		if (arg.startswith("--arrays=")):
			arraysFilename = argVal
		elif (arg.startswith("--name=")):
			sequenceName = argVal
		elif (arg.startswith("--length=")) or (arg.startswith("--len=")) or (arg.startswith("L=")):
			if (argVal.endswith("%")):
				sequenceLen = float(argVal[:-1]) / 100.0
				assert (sequenceLen >= 1.0)
				sequenceLen = ("%",sequenceLen)
			elif (argVal.startswith("+")):
				sequenceLen = int_with_unit(argVal[1:])
				assert (sequenceLen >= 0)
				sequenceLen = ("+",sequenceLen)
			else:
				sequenceLen = int_with_unit(argVal)
				assert (sequenceLen >= 0)
		elif (arg.startswith("--repeats=")) or (arg.startswith("N=")):
			numRepeats = int_with_unit(argVal)
			assert (numRepeats > 0)
		elif (arg.startswith("--motif:neighbor=")):
			genNeighbors = parse_probability(argVal)
		elif (arg.startswith("--motif:mixture=")):
			genMixture = parse_probability(argVal)
		elif (arg.startswith("--lengths=")):
			lengthsFilename = argVal
		elif (arg.startswith("--minfill=")) or (arg.startswith("F=")):
			minFill = int(argVal)
			if (minFill < 0):
				print >>stderr, "WARNING: \"%s\" interpreted as no minimum fill" % argVal
				minFill = None
			if (minFill == 0):
				minFill = None
		elif (arg.startswith("--errors=")):
			errorProfile = None
			if (argVal in ["pacbio","pacbio.v3","pacbio.GIAB","pacbio.giab"]):
				errorProfile = errorProfilePacbioV3
			elif (argVal == "pacbio.v2"):  # for historical reasons, v2 is an alias for v3
				errorProfile = errorProfilePacbioV3
			elif (argVal in ["pacbio.v1","pacbio.Guiblet","pacbio.guiblet"]):
				errorProfile = errorProfilePacbioV1
			elif (argVal in ["pacbio.readsim"]):
				errorProfile = errorProfilePacbioReadsim
			elif (argVal in ["nanopore","nanopore.v3","nanopore.GIAB","nanopore.giab"]):
				errorProfile = errorProfileNanoporeV3
			elif (argVal == "nanopore.v2"):  # for historical reasons, v2 is an alias for v3
				errorProfile = errorProfileNanoporeV3
			elif (argVal in ["nanopore.v1","nanopore.Jain","nanopore.jain"]):
				errorProfile = errorProfileNanoporeV1
			elif (argVal in ["nanopore.readsim"]):
				errorProfile = errorProfileNanoporeReadSim
			elif (":" in argVal):
				try:
					errorProfile = parse_error_spec(argVal)
				except ValueError:
					pass
			else:
				p = parse_probability(argVal)
				errorProfile = {"mm":p, "i":p, "d":p }
			if (errorProfile == None):
				usage("\"%s\" is not a valid error spec" % argVal)
			subProb       = errorProfile["mm"]
			insOpenProb   = errorProfile["i"]
			delOpenProb   = errorProfile["d"]
			insExtendProb = delExtendProb = 0.0
		elif (arg.startswith("--catalog=")):
			catalogFilename = argVal
		elif (arg.startswith("--wrap=")):
			wrapLength = int(argVal)
			if (wrapLength <= 0): wrapLength = None
		elif (arg.startswith("--seed=")):
			# nota bene: if the seed is a number, use it as a number, since
			#            string seeds can produce different sequences on
			#            different versions/builds of python
			seed = argVal
			try:
				seed = int(seed)
			except ValueError:
				try:               seed = float(seed)
				except ValueError: pass
			random_seed(seed)
		elif (arg.startswith("--")):
			usage("unrecognized option: %s" % arg)
		elif (is_nucleotide_string(arg)):
			motifs += [arg.upper()]
		else:
			usage("unrecognized option: %s" % arg)

	if (arraysFilename != None):
		if (motifs != []):
			usage("command line <motif>s cannot be used with --arrays")
		if (numRepeats != None):
			usage("--repeats cannot be used with --arrays")
		if (lengthsFilename != None):
			usage("--lengths cannot be used with --arrays")
		if (genNeighbors != 0.0):
			usage("--motif:neighbor cannot be used with --arrays")
		if (genMixture != 0.0):
			usage("--motif:mixture cannot be used with --arrays")
	elif (motifs == []):
		usage("you have to give me at least one motif")

	if (numRepeats == None) and (arraysFilename != None):
		numRepeats = 1
	
	# read the arrays file, if we have one

	repeatLengths = {}
	haveSpecificArrays = False

	if (arraysFilename != None):
		haveSpecificArrays = True
		f = file(arraysFilename,"rt")
		numRepeats = 0
		for (length,motif,_) in read_arrays(f,arraysFilename):
			numRepeats += 1
			if (motif not in repeatLengths):
				motifs += [(motif)]
				repeatLengths[motif] =  [length]
			else:
				repeatLengths[motif] += [length]
		f.close()

		if (motifs == []):
			usage("array file \"%s\" contains no arrays" % arraysFilename)

	# read the lengths file

	if (repeatLengths == {}):
		if (lengthsFilename == None):
			lengths = read_integers(stdin)
			for motif in motifs:
				repeatLengths[motif] = lengths
		elif ("{motif}" not in lengthsFilename):
			f = file(lengthsFilename,"rt")
			lengths = read_integers(f,lengthsFilename)
			f.close()
			for motif in motifs:
				repeatLengths[motif] = lengths
		else:
			for motif in motifs:
				motifLengthsFilename = lengthsFilename.replace("{motif}",motif)
				f = file(motifLengthsFilename,"rt")
				lengths = read_integers(f,motifLengthsFilename)
				f.close()
				repeatLengths[motif] = lengths

	# generate the number and type of motifs we'll embed
	#
	# note: to satisfy the requirement that the same seed generates the same
	#       pre-error sequence, we should have no variance in the use of the
	#       PRNG until after we've generated that sequence; see "point A" below

	embeddings = []

	if (haveSpecificArrays):
		for motif in motifs:
			for length in repeatLengths[motif]:
				strand = choice(["+","-"])
				offset = choice(xrange(len(motif)))
				embeddings += [(1.0,motif,motif,strand,offset,length)]
		shuffle(embeddings)
	else:
		for _ in xrange(numRepeats):
			motif = choice(motifs)
			length = choice(repeatLengths[motif])
			u = unit_random()
			if (genNeighbors > 0) and (u < genNeighbors):
				motif = motif_neighbor(motif)
				(mix,motif2) = (1.0,motif)
			elif (genMixture > 0) and (u < genNeighbors+genMixture):
				(mix,motif2) = (0.5,motif_neighbor(motif))
			else:
				(mix,motif2) = (1.0,motif)
			strand = choice(["+","-"])
			offset = choice(xrange(len(motif)))
			embeddings += [(mix,motif,motif2,strand,offset,length)]

	totalRepeatBp = sum([length for (_,_,_,_,_,length) in embeddings])

	# assign each repeat a position within the "fill" sequence;  note that we
	# might have more than one repeat assigned to the same position, in which
	# case they will be back-to-back with no fill between them

	if (type(sequenceLen) == tuple):
		(op,sequenceLen) = sequenceLen
		if (op == "%"):
			sequenceLen = int(round(totalRepeatBp*sequenceLen))
		else: # if (op == "+"):
			sequenceLen = totalRepeatBp + sequenceLen

	if (totalRepeatBp > sequenceLen):
		fillBp = 0
		if (sequenceLen > 0):
			print >>stderr, "WARNING: length of embedded repeats (%d) exceeds specified" % totalRepeatBp
			print >>stderr, "         sequence length (%d); there will be no fill DNA"   % sequenceLen
	elif (minFill != None):
		fillBp = sequenceLen - totalRepeatBp
		totalMinFill = (numRepeats+1) * minFill
		if (totalMinFill > fillBp):
			print >>stderr, "WARNING: minimum fill of %d cannot be achieved"           % minFill
			print >>stderr, "         total minimum fill (%d) exceeds total fill (%d)" % (totalMinFill,fillBp)
			minFill = fillBp / (numRepeats+1)
		fillBp -= minFill * (numRepeats+1)
	else:
		fillBp = sequenceLen - totalRepeatBp

	fillPositions = [randint(0,fillBp) for _ in xrange(numRepeats)]
	fillPositions.sort()

	if (minFill != None):
		fillBp += minFill * (numRepeats+1)
		for rptNum in xrange(numRepeats):
			fillPositions[rptNum] += (rptNum+1) * minFill

	# generate the sequence

	catalog = None
	if (catalogFilename != None):
		catalog = []

	fillSeq = str(EchyDna(fillBp))
	seq = []
	seqPos  = 0
	prevEnd = 0
	fillPos = 0
	for (ix,pos) in enumerate(fillPositions):
		if (fillPos < pos):
			seq += [fillSeq[fillPos:pos]]
			seqPos  += pos - fillPos
			fillPos =  pos

		(mix,motif,motif2,strand,offset,length) = embeddings[ix]
		if (catalog != None):
			c = CatalogEntry()
			c.start        = seqPos
			c.end          = seqPos+length
			c.mix          = mix
			c.motif        = motif
			c.motif2       = motif2
			c.strand       = strand
			c.repeatLength = length
			c.offset       = offset
			catalog += [c]

		enoughCopies = (length+offset+len(motif)-1) / len(motif)
		if (strand == "-"): motif = reverse_complement(motif)

		if (mix >= 1.0):
			repeat = motif * enoughCopies
		else:
			repeat = []
			for _ in xrange(enoughCopies):
				if (unit_random() < mix): repeat += [motif]
				else:                     repeat += [motif2]
			repeat = "".join(repeat)

		seq += repeat[offset:offset+length]
		seqPos += length
		prevEnd = seqPos

	if (fillPos < fillBp):
		seq += [fillSeq[fillPos:fillBp]]

	seq = "".join(seq)

	#=== point A: it's now safe to make additional use of the PRNG ===

	# apply error profile

	events = profile = None
	if (argVal in ["pacbio","pacbio.v3","pacbio.GIAB","pacbio.giab"]):
		errorProfile = errorProfilePacbioV3
	elif (argVal == "pacbio.v2"):  # for historical reasons, v2 is an alias for v3
		errorProfile = errorProfilePacbioV3
	elif (argVal in ["pacbio.v1","pacbio.Guiblet","pacbio.guiblet"]):
		errorProfile = errorProfilePacbioV1
	elif (argVal in ["pacbio.readsim"]):
		errorProfile = errorProfilePacbioReadsim
	elif (argVal in ["nanopore","nanopore.v3","nanopore.GIAB","nanopore.giab"]):
		errorProfile = errorProfileNanoporeV3
	elif (argVal == "nanopore.v2"):  # for historical reasons, v2 is an alias for v3
		errorProfile = errorProfileNanoporeV3
	elif (argVal in ["nanopore.v1","nanopore.Jain","nanopore.jain"]):
		errorProfile = errorProfileNanoporeV1
	elif (argVal in ["nanopore.readsim"]):
		errorProfile = errorProfileNanoporeReadSim
	elif (type(errorProfile) == float):
		eRate = errorProfile / 3.0;
		profile = {"mm":eRate, "i":eRate, "d":eRate }
	elif (type(errorProfile) == dict):
		profile = dict(errorProfile)

	if (profile != None):
		print >>stderr, "(applying error profile mm=%.2f%% i=%.2f%% d=%.2f%%)" \
		             % (100*profile["mm"],100*profile["i"],100*profile["d"])
		(seq,catalog,events) = apply_errors(profile,seq,catalog)

	# write the sequence

	if (sequenceName != None):
		print ">%s" % sequenceName

	if (wrapLength == None):
		print seq
	else:
		for i in range(0,len(seq),wrapLength):
			print seq[i:i+wrapLength]

	# write the catalog

	if (catalogFilename != None):
		catalogF = file(catalogFilename,"wt")
		if (sequenceName in [None,""]): seqNameForCatalog = "seq"
		else:                           seqNameForCatalog = sequenceName

		if (events == None):
			print >>catalogF, "#%s\t%s\t%s\t%s\t%s\t%s\t%s" \
			                % ("chrom","start","end","motif","rptLen","len","fill")
		else:
			print >>catalogF, "#%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" \
			                % ("chrom","start","end","motif","rptLen","len","fill",
			                   "mRatio","m","mm","i","d")

		prevEnd = 0
		for (catIx,c) in enumerate(catalog):
			motifStr = c.motif
			if (c.mix < 1.0): motifStr += "," + c.motif2
			motifStr += ".%s%s" % (c.offset,c.strand)
			if (events == None):
				print >>catalogF, "%s\t%s\t%s\t%s\t%s\t%s\t%s" \
				                % (seqNameForCatalog,c.start,c.end,motifStr,
				                   c.repeatLength,c.end-c.start,c.start-prevEnd)
			else:
				if (catIx in events):
					(m,mm,i,d) = events[catIx]
					mRatio = "%.1f%%" % (100.0*m/(m+mm+i+d))
				else:
					mRatio = m = mm = i = d = "NA"
				print >>catalogF, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" \
				                % (seqNameForCatalog,c.start,c.end,motifStr,
				                   c.repeatLength,c.end-c.start,c.start-prevEnd,
				                   mRatio,m,mm,i,d)
			prevEnd = c.end

		catalogF.close()
Exemplo n.º 5
0
def main():
	# parse the command line

	numValues = None
	mu        = 0.0
	sigma     = 1.0
	roundEm   = None
	precision = None

	for arg in argv[1:]:
		if ("=" in arg):
			argVal = arg.split("=",1)[1]

		if (arg.startswith("--mu=")):
			mu = float_or_fraction(argVal)
		elif (arg.startswith("--sigma=")):
			sigma = float_or_fraction(argVal)
		elif (arg == "--round"):
			roundEm = "round"
		elif (arg == "--floor"):
			roundEm = "floor"
		elif (arg == "--ceiling"):
			roundEm = "ceiling"
		elif (arg.startswith("--precision=")):
			precision = int(argVal)
		elif (arg.startswith("--seed=")):
			# nota bene: if the seed is a number, use it as a number, since
			#            string seeds can produce different sequences on
			#            different versions/builds of python
			seed = argVal
			try:
				seed = int(seed)
			except ValueError:
				try:               seed = float(seed)
				except ValueError: pass
			random_seed(seed)
		elif (arg.startswith("--")):
			usage("unrecognized option: %s" % arg)
		elif (numValues == None):
			numValues = int_with_unit(arg)
		else:
			usage("unrecognized option: %s" % arg)

	if (numValues == None):
		numValues = 1

	if (roundEm != None) and (precision != None):
		usage("can't use --precision with --%s" % roundEm)

	if   (precision == None): vFmt = "%s"
	elif (precision <= 0):    vFmt = "%d."
	else:                     vFmt = "%%.%df" % precision

	# generate the values

	for _ in xrange(numValues):
		v = gauss(mu,sigma)
		if   (roundEm == "round"):   v = int(round(v))
		elif (roundEm == "floor"):   v = int(floor(v))
		elif (roundEm == "ceiling"): v = int(ceil (v))

		print vFmt % v
Exemplo n.º 6
0
def main():
	global nameFieldW,lengthFieldW,countFieldW,rangeFieldW
	global debug

	# parse the command line

	genomeFilename      = None
	readsFilename       = None
	cigarFilename       = None
	intervalsFilename   = None
	intervalsAreCatalog = False
	motifs              = None
	chromsOfInterest    = None
	minLength           = None
	noiselessGenome     = True
	reportProgress      = None
	nameFieldW          = 1
	lengthFieldW        = 1
	countFieldW         = 1
	rangeFieldW         = 1
	debug               = []

	for arg in argv[1:]:
		if ("=" in arg):
			argVal = arg.split("=",1)[1]

		if (arg.startswith("--genome=")):
			genomeFilename = argVal
		elif (arg.startswith("--reads=")) or (arg.startswith("--read=")):
			readsFilename = argVal
		elif (arg.startswith("--cigars=")) or (arg.startswith("--cigar=")):
			cigarFilename = argVal
		elif (arg.startswith("--intervals=")) or (arg.startswith("--interval=")):
			if (intervalsFilename != None):
				usage("--intervals and --catalog are mutually exclusive")
			intervalsFilename   = argVal
			intervalsAreCatalog = False
		elif (arg.startswith("--catalog=")):
			if (intervalsFilename != None):
				usage("--intervals and --catalog are mutually exclusive")
			intervalsFilename   = argVal
			intervalsAreCatalog = True
		elif (arg.startswith("--motif=")):
			if (motifs == None): motifs = set()
			motifs.add(argVal)
		elif (arg.startswith("--chromosome=")) or (arg.startswith("--chromosomes=")) \
		  or (arg.startswith("--chrom="))      or (arg.startswith("--chroms=")):
			if (chromsOfInterest == None): chromsOfInterest = set()
			for chrom in argVal.split(","):
				chromsOfInterest.add(chrom)
		elif (arg.startswith("--minlength=")) or (arg.startswith("--minlen=")):
			try:
				minLength = int(argVal)
				if (minLength < 0): raise ValueError
				if (minLength == 0): minLength = None
			except ValueError:
				usage("bad length in \"%s\"" % arg)
		elif (arg == "--noisygenome"):
			noiselessGenome = False
		elif (arg.startswith("--progress=")):
			reportProgress = int_with_unit(argVal)
		elif (arg.startswith("--fields=")) or (arg.startswith("F=")):
			(nameFieldW,lengthFieldW,countFieldW,rangeFieldW) = argVal.split(",",4)
			nameFieldW   = max(int(nameFieldW),1)
			lengthFieldW = max(int(lengthFieldW),1)
			countFieldW  = max(int(countFieldW),1)
			rangeFieldW  = max(int(rangeFieldW),1)
		elif (arg.startswith("--namefield=")) or (arg.startswith("F1=")):
			nameFieldW = max(int(argVal),1)
		elif (arg.startswith("--lengthfield=")) or (arg.startswith("F2=")):
			lengthFieldW = max(int(argVal),1)
		elif (arg.startswith("--countfield=")) or (arg.startswith("F3=")):
			countFieldW = max(int(argVal),1)
		elif (arg.startswith("--intervalfield=")) or (arg.startswith("F4=")):
			rangeFieldW = max(int(argVal),1)
		elif (arg == "--debug"):
			debug += ["debug"]
		elif (arg.startswith("--debug=")):
			debug += argVal.split(",")
		elif (arg.startswith("--")):
			usage("unrecognized option: %s" % arg)
		else:
			usage("unrecognized option: %s" % arg)

	if (genomeFilename == None):
		usage("you need to give me a genome file")
	if (readsFilename == None):
		usage("you need to give me a reads file")
	if (cigarFilename == None):
		usage("you need to give me a cigar strings file")

	if (motifs != None) and (not intervalsAreCatalog):
		usage("--motifs requires --catalog")

	# read the intervals
	#
	# nota bene: this can modify chromsOfInterest, restricting it to the
	# chromosomes in the intervals list

	chromToIntervals = None
	motifsSeen = set()

	if (intervalsFilename != None):
		chromToIntervals = {}

		if (intervalsFilename.endswith(".gz")) or (intervalsFilename.endswith(".gzip")):
			intervalsF = gzip_open(intervalsFilename,"rt")
		else:
			intervalsF = file(intervalsFilename,"rt")

		for (lineNumber,chrom,gStart,gEnd,tags) in read_intervals(intervalsF):
			if (chromsOfInterest != None) and (chrom not in chromsOfInterest): continue
			if (chrom not in chromToIntervals): chromToIntervals[chrom] = []

			if (intervalsAreCatalog):
				if (tags == None):
					exit("%s: not enough fields at line %d (%d, expected at least %d)"
					   % (os_path.basename(argv[0]),lineNumber,len(fields),4))
				(motif,strand) = (tags[0][:-1],tags[0][-1:])
				if ("." in motif): motif = motif[:motif.find(".")]
				if (strand not in ["+","-"]) or (not is_nucleotide_string(motif)):
					exit("%s: bad motif at line %d: \"%s\""
					   % (os_path.basename(argv[0]),lineNumber,tags[0]))

				if (motifs != None):
					if (motif not in motifs): continue
					motifsSeen.add(motif)
			else:
				motif = strand = None

			chromToIntervals[chrom] += [(gStart,gEnd,motif,strand)]

		intervalsF.close()

		for chrom in chromToIntervals:
			chromToIntervals[chrom].sort()

		if (chromsOfInterest == None):
			chromsOfInterest = set(chromToIntervals)
		else:
			for chrom in chromsOfInterest:
				if (chrom not in chromToIntervals):
					chromsOfInterest.remove(chrom)

	if (motifs != None):
		for motif in motifs:
			if (motif not in motifsSeen):
				print >>stderr, "WARNING \"%s\" was not seen in %s" \
				              % (motif,intervalsFilename)

	# read the genome

	chromToSequence = {}

	if (genomeFilename.endswith(".gz")) or (genomeFilename.endswith(".gzip")):
		genomeF = gzip_open(genomeFilename,"rt")
	else:
		genomeF = file(genomeFilename,"rt")

	for (chrom,seq) in read_fasta_sequences(genomeF,chromsOfInterest):
		if (chrom in chromToSequence):
			exit("%s: \"%s\" appears more than once in \"%s\""
			   % (os_path.basename(argv[0]),chrom,genomeFilename))
		chromToSequence[chrom] = seq

	genomeF.close()

	if (chromsOfInterest != None):
		for chrom in chromsOfInterest:
			if (chrom not in chromToSequence):
				exit("%s: \"%s\" doesn't appear in \"%s\""
				   % (os_path.basename(argv[0]),chrom,genomeFilename))

	# read the cigar strings

	if (cigarFilename.endswith(".gz")) or (cigarFilename.endswith(".gzip")):
		cigarF = gzip_open(cigarFilename,"rt")
	else:
		cigarF = file(cigarFilename,"rt")

	readNameToCigar = {}

	for (lineNumber,line,readName,chrom,strand,gStart,gEnd,cigar) in read_cigars(cigarF):
		if (chromsOfInterest != None) and (chrom not in chromsOfInterest): continue
		(rLength,gLength) = cigar_lengths(cigar)
		readNameToCigar[readName] = (chrom,gStart,gEnd,gLength,strand,rLength,cigar)
		if (gLength != gEnd-gStart):
			exit("%s: bad cigar line (at line %d); cigar doesn't match interval length (%d vs %d)\n%s"
			   % (os_path.basename(argv[0]),lineNumber,gLength,gEnd-gStart,line))

	cigarF.close()

	# process the reads

	if (readsFilename.endswith(".gz")) or (readsFilename.endswith(".gzip")):
		readsF = gzip_open(readsFilename,"rt")
	else:
		readsF = file(readsFilename,"rt")

	readNum = alignmentsReported = 0
	for (readName,rNucs) in read_fasta_sequences(readsF):
		readNum += 1
		if (reportProgress != None) \
		   and ((readNum == 1) or (readNum % reportProgress == 0)):
			print >>stderr, "progress: processing read #%s %s (%s alignments reported so far)" \
			              % (commatize(readNum),readName,commatize(alignmentsReported))

		if (readName not in readNameToCigar):
			exit("%s: \"%s\" doesn't appear in \"%s\""
			   % (os_path.basename(argv[0]),readNameToCigar,cigarFilename))

		(chrom,gStart,gEnd,gLength,strand,rLength,cigar) = readNameToCigar[readName]
		gNucs = chromToSequence[chrom][gStart:gEnd]

		if (strand == "-"):
			gNucs = reverse_complement(gNucs)

		a = Alignment()
		a.readName = readName
		a.rStart   = 0
		a.rEnd     = rLength
		a.rLength  = rLength
		a.rNucs    = rNucs
		a.chrom    = chrom
		a.strand   = strand
		a.gStart   = gStart
		a.gEnd     = gEnd
		a.gNucs    = gNucs
		a.score    = 0
		a.motif    = "%s:%d-%d%s" % (chrom,a.gStart,a.gEnd,strand)

		(a.rText,a.gText) = reconstruct_alignment(rNucs,gNucs,cigar)

		if (chromToIntervals == None):
			if (minLength != None) and (a.gEnd-a.gStart < minLength):
				continue
			print_alignment(a)
			alignmentsReported += 1
		else:
			intervals = chromToIntervals[chrom]
			for (s,e,motif,mStrand) in intersecting_intervals(intervals,gStart,gEnd):
				aSliced = slice_alignment(a,s,e)
				if (minLength != None) and (aSliced.gEnd-aSliced.gStart < minLength):
					continue
				print_alignment(aSliced)
				alignmentsReported += 1

				if ("intervalsanity" in debug):
					rText    = remove_gaps(aSliced.rText)
					realText = rNucs[aSliced.rStart:aSliced.rEnd]
					if (realText != rText):
						exit("%s: sanity check failed for read:\n\"%s\"\n\"%s\""
						   % (os_path.basename(argv[0]),rText,realText))

					gText    = remove_gaps(aSliced.gText).upper()
					realText = chromToSequence[chrom][aSliced.gStart:aSliced.gEnd]
					if (strand == "-"): realText = reverse_complement(realText)
					if (realText != gText):
						exit("%s: sanity check failed for genome:\n\"%s\"\n\"%s\""
						   % (os_path.basename(argv[0]),gText,realText))
					print >>stderr, "%s: sanity check passed for read %s" \
					              % (os_path.basename(argv[0]),readName)

				if (motif != None):
					positionalStats = positonal_stats(aSliced,motif,mStrand,
					                                  noiselessGenome=noiselessGenome)
					print_positonal_stats(positionalStats)

	readsF.close()
	print "# ncrf end-of-file"

	if (reportProgress != None):
		print >>stderr, "progress: %s reads processed (%s alignments reported)" \
		              % (commatize(readNum),commatize(alignmentsReported))
Exemplo n.º 7
0
def main():
    """Group overlapping alignment summaries and write them out by motif subset.

    Summaries are read from one or more summary files (gzipped if the name
    ends in ".gz" or ".gzip"), at most --head= of them in total.  Within each
    sequence they are partitioned into overlapping groups, and each group is
    keyed by the sorted tuple of motifs it contains.  Groups are written in
    order of increasing number of motifs, separated by blank lines, to stdout
    or to the --out= file.

    If the --out= template contains "{motif}", groups containing a single
    motif are instead written to a per-motif file, and the remaining groups
    go to the file named by substituting "overlaps" for "{motif}".
    """
    global summaryHeaderLine
    global debug

    summaryHeaderLine = None

    # parse the command line

    inputFilenames = []
    outTemplate = None
    headLimit = None
    debug = []

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg.startswith("--out=")):
            outTemplate = argVal
        elif (arg.startswith("--head=")):
            headLimit = int_with_unit(argVal)
        elif (arg == "--debug"):
            debug += ["debug"]
        elif (arg.startswith("--debug=")):
            debug += argVal.split(",")
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        else:
            # (a file named more than once is only read once)
            if (arg not in inputFilenames): inputFilenames += [arg]

    if (inputFilenames == []):
        usage("you have to give me at least one summary file")

    writeSingletonsSeparately = (outTemplate is not None) \
                            and ("{motif}" in outTemplate)

    # collect the alignments

    seqToSummaries = {}
    seqOrder = []

    summaryNum = 0
    limitReached = False
    for filename in inputFilenames:
        if (filename.endswith(".gz")) or (filename.endswith(".gzip")):
            f = gzip_open(filename, "rt")
        else:
            f = file(filename, "rt")

        try:
            for summary in read_summary(f, filename):
                summaryNum += 1

                if (headLimit is not None) and (summaryNum > headLimit):
                    print >> stderr, "limit of %d summaries reached" % headLimit
                    limitReached = True
                    break

                if (summary.seq not in seqToSummaries):
                    seqOrder += [summary.seq]
                    seqToSummaries[summary.seq] = []
                seqToSummaries[summary.seq] += [summary]
        finally:
            f.close()

        # once the limit has been hit there's nothing to gain from the
        # remaining files;  stopping here also reports the limit only once
        if (limitReached):
            break

    # partition the alignments into overlapping groups

    seqToGroups = {}

    for seq in seqOrder:
        seqToGroups[seq] = overlapping_groups(seqToSummaries[seq])

    if ("groups" in debug):
        for seq in seqOrder:
            for group in seqToGroups[seq]:
                print >> stderr, "==="
                for summary in group:
                    print >> stderr, summary.line

    # collect groups by motif subset;  a subset is the sorted tuple of the
    # distinct motifs appearing in a group

    subsetToGroups = {}
    singletons = set()

    for seq in seqOrder:
        for group in seqToGroups[seq]:
            subset = tuple(sorted(set([summary.motif for summary in group])))

            if (subset not in subsetToGroups):
                subsetToGroups[subset] = [group]
            else:
                subsetToGroups[subset] += [group]

            if (len(subset) == 1):
                singletons.add(subset[0])

    # if we're to report un-overlapped alignments separately, do so now (and
    # remove them from the groups)

    singletons = sorted(singletons)

    if (writeSingletonsSeparately):
        for motif in singletons:
            subset = (motif, )

            motifFilename = outTemplate.replace("{motif}", motif)
            motifF = file(motifFilename, "wt")
            print >> stderr, "writing to \"%s\"" % motifFilename

            try:
                if (summaryHeaderLine is not None):
                    print >> motifF, summaryHeaderLine
                for group in subsetToGroups[subset]:
                    for summary in group:
                        print >> motifF, summary.line
            finally:
                # each per-motif file is closed as soon as it's been written
                motifF.close()

            del subsetToGroups[subset]

    # report overlapping alignment groups (and un-overlapped groups if we
    # didn't report them already)

    nothingToWrite = (list(subsetToGroups) == [])

    if (outTemplate is None):
        outF = stdout
        if (nothingToWrite):
            print >> stderr, "no alignments to write to console"
    else:
        if ("{motif}" not in outTemplate):
            outFilename = outTemplate
        else:
            outFilename = outTemplate.replace("{motif}", "overlaps")
        outF = file(outFilename, "wt")
        if (nothingToWrite):
            print >> stderr, "no alignments to write to \"%s\"" % outFilename
        else:
            print >> stderr, "writing to \"%s\"" % outFilename

    try:
        motifCountToSubsets = {}
        for subset in subsetToGroups:
            motifCount = len(subset)
            if (motifCount not in motifCountToSubsets):
                motifCountToSubsets[motifCount] = [subset]
            else:
                motifCountToSubsets[motifCount] += [subset]

        isFirstGroup = True
        for motifCount in sorted(motifCountToSubsets):
            subsets = motifCountToSubsets[motifCount]
            subsets.sort()
            for subset in subsets:
                for group in subsetToGroups[subset]:
                    if (isFirstGroup):
                        if (summaryHeaderLine is not None):
                            print >> outF, summaryHeaderLine
                        isFirstGroup = False
                    else:
                        print >> outF  # (line to separate groups)

                    for summary in group:
                        print >> outF, summary.line
    finally:
        if (outF != stdout):
            outF.close()
Exemplo n.º 8
0
def main():

    # parse the command line

    names = []
    nameToVal = {}
    fixedNames = Set()
    radius = 1
    ballKind = "sparse hypercube"
    sampleSize = None
    rejectCriteria = []
    excludeCenter = False

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg.startswith("--fixed=")):
            for name in argVal.split(","):
                fixedNames.add(name)
        elif (arg.startswith("--radius=")):
            if ("by" not in argVal):
                radius = abs(int(argVal))
            else:
                (radius, step) = argVal.split("by", 1)
                radius = abs(int(radius))
                step = abs(int(step))
                assert (radius % step == 0)
                if (step != 1):
                    radius = (radius / step, step)
        elif (arg in ["--ball:sparse", "--ball=sparse"]):
            ballKind = "sparse hypercube"
        elif (arg in ["--ball:hyper", "--ball=hyper"]):
            ballKind = "hypercube"
        elif (arg in ["--ball:spikey", "--ball=spikey", "--spikey"]):
            ballKind = "spikey burr"
        elif (arg.startswith("--sample=")):
            sampleSize = int_with_unit(argVal)
        elif (arg.startswith("--reject=")):
            rejectCriteria += [argVal]
        elif (arg == "--nocenter"):
            excludeCenter = True
        elif (arg.startswith("--seed=")):
            # nota bene: if the seed is a number, use it as a number, since
            #            string seeds can produce different sequences on
            #            different versions/builds of python
            seed = argVal
            try:
                seed = int(seed)
            except ValueError:
                try:
                    seed = float(seed)
                except ValueError:
                    pass
            random_seed(seed)
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        elif ("=" in arg):
            name = arg.split("=", 1)[0]
            val = int(argVal)
            if (name in nameToVal) and (nameToVal[name] != val):
                usage(
                    "you have given me more than one value for %s, %d and %d" %
                    (name, nameToVal[name], val))
            if (name not in nameToVal):
                names += [name]
                nameToVal[name] = val
        else:
            usage("unrecognized option: %s" % arg)

    if (names == []):
        usage("you have to give me at least one parameter to vary")

    for name in fixedNames:
        if (name not in nameToVal):
            print >> stderr, "WARNING: no value was provided for \"%s\"" % name

    # separate fixed and varying names

    variables = []
    variableToVal = {}

    for name in names:
        if (name not in fixedNames):
            variables += [name]
            variableToVal[name] = nameToVal[name]

    # generate the parameter sets

    if (ballKind == "spikey burr"):
        ball = SpikeyBurr(variables, variableToVal, radius)
    elif (ballKind == "sparse hypercube"):
        if (sampleSize == None): sampleSize = 1
        ball = SparseHyperCube(variables, variableToVal, radius, sampleSize,
                               excludeCenter)
    else:  # if (ballKind == "hypercube"):
        ball = HyperCube(variables, variableToVal, radius)

    if (sampleSize != None):
        ballSize = ball.size(excludeCenter)
        if (sampleSize >= ballSize):
            sampleSize = None

    if (sampleSize == None):
        for params in ball.ball():
            if (excludeCenter):
                if (params_are_same(params, variableToVal)): continue

            reject = False
            for formula in rejectCriteria:
                if (evaluate(formula, params) == True):
                    reject = True
                    break
            if (reject): continue

            for name in names:
                if (name not in params): params[name] = nameToVal[name]
            print " ".join(["%s=%d" % (name, params[name]) for name in names])
    else:
        leftToSample = sampleSize
        leftInBall = ballSize
        for params in ball.ball():
            if (excludeCenter):
                if (params_are_same(params, variableToVal)): continue

            reject = False
            for formula in rejectCriteria:
                if (evaluate(formula, params) == True):
                    reject = True
                    break
            if (reject): continue

            if (randint(0, leftInBall - 1) < leftToSample):
                for name in names:
                    if (name not in params): params[name] = nameToVal[name]
                print " ".join(
                    ["%s=%d" % (name, params[name]) for name in names])
                leftToSample -= 1
            leftInBall -= 1
Exemplo n.º 9
0
def main():

	# parse the command line

	writeHeader = False
	writeWhat   = "per alignment"
	headLimit   = None
	requireEof  = True

	for arg in argv[1:]:
		if ("=" in arg):
			argVal = arg.split("=",1)[1]

		if (arg in ["--withheader","--with=header","--with:header"]):
			writeHeader = True
		elif (arg in ["--sumonly","--sum=only","--sum:only"]):
			writeWhat = "sum only"
		elif (arg.startswith("--head=")):
			headLimit = int_with_unit(argVal)
		elif (arg in ["--noendmark","--noeof","--nomark"]):   # (unadvertised)
			requireEof = False
		elif (arg.startswith("--")):
			usage("unrecognized option: %s" % arg)
		else:
			usage("unrecognized option: %s" % arg)

	# process the alignments

	sum = {"m":0, "mm":0, "io":0, "ix":0, "do":0, "dx":0}

	alignmentNum = 0
	for a in alignments(stdin,requireEof):
		alignmentNum +=1 

		if (headLimit != None) and (alignmentNum > headLimit):
			print >>stderr, "limit of %d alignments reached" % headLimit
			break

		(nMatch,nMismatch,nInsO,nInsX,nDelO,nDelX) = extract_events(a)

		if (writeHeader):
			print "\t".join(["line","motif","mRatio","m","mm","io","ix","do","dx"])
			writeHeader = False

		if (writeWhat == "per alignment"):
			vec = [a.lineNumber,a.motif,a.mRatio,nMatch,nMismatch,nInsO,nInsX,nDelO,nDelX]
			print "\t".join(map(str,vec))

		sum["m"]  += nMatch
		sum["mm"] += nMismatch
		sum["io"] += nInsO
		sum["ix"] += nInsX
		sum["do"] += nDelO
		sum["dx"] += nDelX

	sum["events"] = (sum["m"] + sum["mm"] + sum["io"] + sum["ix"] + sum["do"] + sum["dx"])

	if (alignmentNum == 0):
		print >>stderr, "WARNING: input contained no alignments"
	elif (writeWhat == "sum only"):
		mRatio = float(sum["m"]) / sum["events"]
		mRatio = "%.3f" % mRatio
		vec = ["all",a.motif,mRatio,sum["m"],sum["mm"],sum["io"],sum["ix"],sum["do"],sum["dx"]]
		print "\t".join(map(str,vec))
Exemplo n.º 10
0
def main():
	global debug

	# parse the command line

	distributionFilename = None
	remainderFilename    = None
	wrapLength           = 100
	reportProgress       = None
	debug                = []

	for arg in argv[1:]:
		if ("=" in arg):
			argVal = arg.split("=",1)[1]

		if (arg.startswith("--remainder=")):
			remainderFilename = argVal
		elif (arg.startswith("--wrap=")):
			wrapLength = int(argVal)
			if (wrapLength <= 0): wrapLength = None
		elif (arg.startswith("--seed=")):
			random_seed(argVal)
		elif (arg.startswith("--progress=")):
			reportProgress = int_with_unit(argVal)
		elif (arg == "--debug"):
			debug += ["debug"]
		elif (arg.startswith("--debug=")):
			debug += argVal.split(",")
		elif (arg.startswith("--")):
			usage("unrecognized option: %s" % arg)
		elif (distributionFilename == None):
			distributionFilename = arg
		else:
			usage("unrecognized option: %s" % arg)

	if (distributionFilename == None):
		usage("you must provide a length-distribution filename")

	# read the distribution

	intervals = IntervalDict()

	distribF = file(distributionFilename,"rt")
	for spec in read_distribution_spec(distribF,distributionFilename):
		(lineNumber,minLength,maxLength,outCount,inCount) = spec

		interval = intervals.add(minLength,maxLength)
		if (interval == None): # interval overlaps an existing interval
			interval = Interval(minLength,maxLength)
			previous = intervals.overlapper(minLength,maxLength)
			assert (False), \
			       "%s (line %d) overlaps %s (line %d)" \
			     % (interval,lineNumber,previous,previous.lineNumber)

		interval.lineNumber = lineNumber
		interval.outCount   = outCount
		interval.inCount    = inCount

	distribF.close ()

	if ("distribution" in debug):
		for interval in intervals:
			print >>stderr, "%s %d %d" \
			              % (interval,interval.outCount,interval.inCount)

	# process the reads
	#
	# this filters reads based on the length (on the interval containing the
	# length); if we expect to see E more sequences of this length (including
	# this one), and we are to output N of those, we output this sequence with
	# probability N/E; and we adjust N and E for this length accordingly

	inputCount = outputCount = inputBp = outputBp = 0
	for (name,seq) in read_fasta_sequences(stdin):
		seqLen = len(seq)
		inputCount += 1
		inputBp    += seqLen

		if (reportProgress != None):
			if (inputCount % reportProgress == 0):
				print >>stderr, "%s sequences read, %s written (%.1f%%); %s nts read, %s written" \
				              % (commatize(inputCount),commatize(outputCount),
				                 100.0*outputCount/inputCount,
				                 commatize(inputBp),commatize(outputBp))

		try: interval = intervals[seqLen]
		except KeyError: continue

		if (interval.inCount <= 0):
			print >>stderr, "ERROR: for length %d (%s), actual input exceeded expected input count" \
			              % (seqLen,interval)
			if (remainderFilename != None):
				print >>stderr, "      (writing remainders to %s)" % remainderFilename
				remainderF = file(remainderFilename,"wt")
				write_remainders(remainderF,intervals)
				remainderF.close ()
			assert (False)

		if (interval.outCount == 0):
			keepSeq = False
		else:
			keepSeq = (randint(1,interval.inCount) <= interval.outCount)

		interval.inCount  -= 1
		if (not keepSeq): continue

		interval.outCount -= 1
		outputCount += 1
		outputBp    += seqLen
		print ">%s" % name
		if (wrapLength == None):
			print seq
		else:
			for i in range(0,seqLen,wrapLength):
				print seq[i:i+wrapLength]

	# write the remainders

	if (remainderFilename != None):
		remainderF = file(remainderFilename,"wt")
		write_remainders(remainderF,intervals)
		remainderF.close ()
Exemplo n.º 11
0
def main():
    global warnOnError

    # parse the command line

    minMapQ = None
    writeHeader = False
    writeWhat = "per alignment"
    warnOnError = False
    headLimit = None
    reportProgress = None

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg.startswith("--mapq=")) or (arg.startswith("--MAPQ=")) or (
                arg.startswith("MAPQ=")):
            minMapQ = int(argVal)
        elif (arg in ["--withheader", "--with=header", "--with:header"]):
            writeHeader = True
        elif (arg in ["--sumonly", "--sum=only", "--sum:only"]):
            writeWhat = "sum only"
        elif (arg == "--warnandcontinue"):
            warnOnError = True
        elif (arg.startswith("--head=")):
            headLimit = int_with_unit(argVal)
        elif (arg.startswith("--progress=")):
            reportProgress = int_with_unit(argVal)
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        else:
            usage("unrecognized option: %s" % arg)

    # process the sam records

    sum = {"m": 0, "mm": 0, "io": 0, "ix": 0, "do": 0, "dx": 0}

    recordNum = alignmentNum = 0
    for a in read_sam_plain(stdin, minMapQ=minMapQ):
        recordNum += 1
        if (reportProgress != None) and (recordNum % reportProgress == 0):
            sum["events"] = (sum["m"] + sum["mm"] + sum["io"] + sum["ix"] +
                             sum["do"] + sum["dx"])
            mRatio = float(sum["m"]) / sum["events"]
            vec = [
                mRatio, sum["m"], sum["mm"], sum["io"], sum["ix"], sum["do"],
                sum["dx"]
            ]
            print >>stderr, "progress: processing sam record %s (mRatio=%.3f m=%d mm=%d io=%d ix=%d do=%d dx=%d)" \
                          % (commatize(recordNum),
                             mRatio,sum["m"],sum["mm"],sum["io"],sum["ix"],sum["do"],sum["dx"])

        if (headLimit != None) and (recordNum > headLimit):
            print >> stderr, "limit of %s sam records reached" % commatize(
                headLimit)
            break

        if (a.rName == "*"): continue  # read did not align
        if (minMapQ != None) and (a.mapQ < minMapQ): continue

        alignmentNum += 1
        events = sam_to_events(a)
        if (type(events) == str):
            print >> stderr, events
            continue
        (nMatch, nMismatch, nInsO, nInsX, nDelO, nDelX) = events

        if (writeHeader):
            print "\t".join(
                ["line", "read", "mRatio", "m", "mm", "io", "ix", "do", "dx"])
            writeHeader = False

        if (writeWhat == "per alignment"):
            mRatio = float(nMatch) / (nMatch + nMismatch + nInsO + nInsX +
                                      nDelO + nDelX)
            mRatio = "%.3f" % mRatio
            vec = [
                a.lineNumber, a.qName, mRatio, nMatch, nMismatch, nInsO, nInsX,
                nDelO, nDelX
            ]
            print "\t".join(map(str, vec))

        sum["m"] += nMatch
        sum["mm"] += nMismatch
        sum["io"] += nInsO
        sum["ix"] += nInsX
        sum["do"] += nDelO
        sum["dx"] += nDelX

    sum["events"] = (sum["m"] + sum["mm"] + sum["io"] + sum["ix"] + sum["do"] +
                     sum["dx"])

    if (alignmentNum == 0):
        print >> stderr, "WARNING: input contained no alignments"
    elif (writeWhat == "sum only"):
        alignmentNumStr = "(%d)" % alignmentNum
        mRatio = float(sum["m"]) / sum["events"]
        mRatio = "%.3f" % mRatio
        vec = [
            "all", alignmentNumStr, mRatio, sum["m"], sum["mm"], sum["io"],
            sum["ix"], sum["do"], sum["dx"]
        ]
        print "\t".join(map(str, vec))
Exemplo n.º 12
0
def main():
	global debug

	# parse the command line

	maxMRatio    = 0.85
	minColumns   = 10
	headLimit    = None
	reportClumps = False
	requireEof   = True
	debug        = []

	for arg in argv[1:]:
		if ("=" in arg):
			argVal = arg.split("=",1)[1]

		if (arg.startswith("--maxMRatio=")):
			maxMRatio = parse_noise_rate(argVal)
			if (not (0.0 <= minMRatio <= 1.0)):
				exit("%s: mratio has to be between 0 and 1 (e.g. 0.85 or 85%%)\n%s"
				   % (os_path.basename(argv[0]),arg))
		elif (arg.startswith("--minnoise=")):
			maxMRatio = 1 - parse_noise_rate(argVal)
			if (not (0.0 <= minMRatio <= 1.0)):
				exit("%s: noise has to be between 0 and 1 (e.g. 0.15 or 15%%)\n%s"
				   % (os_path.basename(argv[0]),arg))
		elif (arg.startswith("--mincolumns=")) or (arg.startswith("--mindenom=")):
			minColumns =int(argVal)
			if (minColumns < 2):
				usage("minimum length has to be at least two columns\n%s" % arg)
		elif (arg.startswith("--head=")):
			headLimit = int_with_unit(argVal)
		elif (arg == "--report:clumps") or (arg == "--report=clumps"):
			reportClumps = True
		elif (arg in ["--noendmark","--noeof","--nomark"]):   # (unadvertised)
			requireEof = False
		elif (arg == "--debug"):
			debug += ["debug"]
		elif (arg.startswith("--debug=")):
			debug += argVal.split(",")
		elif (arg.startswith("--")):
			usage("unrecognized option: %s" % arg)
		else:
			usage("unrecognized option: %s" % arg)

	# process the alignments

	alignmentNum = 0
	for a in alignments(stdin,requireEof):
		alignmentNum +=1 

		if (headLimit != None) and (alignmentNum > headLimit):
			print >>stderr, "limit of %d alignments reached" % headLimit
			break

		if (a.errorText == None):
			exit("%s: alignment at line %d doesn't include error text"
			   % (os_path.basename(argv[0]),a.lineNumber))

		if ("detail" in debug):
			print >>stderr, "\nlooking for clumps in %s %c %u-%u" \
			              % (a.seqName,a.strand,a.start,a.end)

		clumps = find_clumps(a.errorText,1-maxMRatio,minColumns,
		                     positiveCh='x',negativeCh='=')

		clumpText = ["-"] * len(a.errorText)
		for (start,end) in clumps:
			for ix in xrange(start,end): clumpText[ix] = "*"
		clumpText = "".join(clumpText)

		prefixLen = 1 + a.lines[0].find(" =")
		if (prefixLen < 0):
			prefixLen = 1 + a.lines[0].find(" x")

		if (alignmentNum > 1): print
		a.lines.insert(3,"# %-*s%s" % (prefixLen-2,"noise clumps",clumpText))
		print a

		if (reportClumps):
			for (start,end) in clumps:
				errorCount = matchCount = 0
				for ch in a.errorText[start:end]:
					if   (ch == 'x'): errorCount += 1
					elif (ch == '='): matchCount += 1
				print >>stderr, "line %d (%d,%d) m=%s x=%s mRatio: %.2f%%" \
							  % (a.lineNumber,
							     start,end,matchCount,errorCount,
								 (100.0*matchCount)/(matchCount+errorCount))

	if (requireEof):
		print "# ncrf end-of-file"
Exemplo n.º 13
0
def main():
    """Parse the command line, then run the consensus filter over stdin.

    The settings are stored in module globals.  Positional arguments name the
    motifs of interest, either as a bare motif or as name:motif;  when none
    are given, every motif is of interest.  With --slice= the sliced filter
    is run, otherwise the simple one.
    """
    global headLimit, reportProgress, requireEof
    global winnerThreshold, filterToKeep, reportConsensus, reportMsa
    global canonicalizeConsensuses
    global debug

    canonicalizeConsensuses = True

    # operating modes, option -> (filterToKeep, reportMsa);  all of them turn
    # consensus reporting on

    modeOptions = {
        "--nonconsensus":     ("non consensus", False),  # (unadvertised)
        "--nonconsensus,msa": ("non consensus", True),   # (unadvertised)
        "--consensusonly":    ("no filter",     False),
        "--filter,consensus": ("consensus",     False),  # (unadvertised)
        "--msa":              ("no filter",     True),   # (unadvertised)
    }

    # default settings

    filterToKeep = "consensus"
    reportConsensus = False
    reportMsa = False
    winnerThreshold = 0.50  # (see derive_consensuses)
    headLimit = None
    reportProgress = None
    requireEof = True
    debug = []

    motifForName = {}
    wantedMotifs = []
    windowWidth = None
    windowStep = None

    # parse the command line

    for arg in argv[1:]:
        if ("=" in arg):
            optVal = arg.split("=", 1)[1]

        if (arg in modeOptions):
            (filterToKeep, reportMsa) = modeOptions[arg]
            reportConsensus = True
            continue

        if (arg.startswith("--winner=")) or (arg.startswith("W=")):
            # (unadvertised)
            winnerThreshold = parse_probability(optVal)
            continue

        if (arg.startswith("--slice=")):  # (unadvertised)
            if ("by" not in optVal):
                windowWidth = windowStep = int_with_unit(optVal)
            else:
                (windowWidth, windowStep) = map(int_with_unit,
                                                optVal.split("by", 1))
            continue

        if (arg.startswith("--head=")):
            headLimit = int_with_unit(optVal)
            continue

        if (arg.startswith("--progress=")):
            reportProgress = int_with_unit(optVal)
            continue

        if (arg in ["--noendmark", "--noeof", "--nomark"]):  # (unadvertised)
            requireEof = False
            continue

        if (arg == "--debug"):
            debug.append("debug")
            continue

        if (arg.startswith("--debug=")):
            debug.extend(optVal.split(","))
            continue

        if (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
            continue

        if (":" not in arg):
            # a bare motif
            wantedMotifs.append(arg)
            continue

        # a named motif;  a name may be repeated, but only for the same motif
        (name, motif) = arg.split(":", 1)
        if (name not in motifForName):
            motifForName[name] = motif
            wantedMotifs.append(motif)
        elif (motifForName[name] != motif):
            usage("\"%s\" is given for more than one motif" % name)

    if (len(wantedMotifs) > 0):
        wantedMotifs = set(wantedMotifs)
    else:
        wantedMotifs = None  # this really means all motifs are of interest

    # process the alignments

    if (windowWidth is not None):
        sliced_consensus_filter(stdin, wantedMotifs, motifForName,
                                windowWidth, windowStep)
    else:
        simple_consensus_filter(stdin, wantedMotifs, motifForName)
Exemplo n.º 14
0
def main():
    global reportProgress, batchSize
    global debug

    # parse the command line

    testMethod = "min-max"
    numTrials = 10 * 1000  # (only used for testMethod == "min-max")
    numNeededToPass = 1  # (only used for testMethod == "min-max")
    effectSize = 0.3  # (only used for testMethod == "chi-square")
    power = 0.8  # (only used for testMethod == "chi-square")
    discardWhich = "bad"
    testWhich = "matches-insertions"
    warnOnUntested = False
    subsampleK = None
    subsampleN = None
    headLimit = None
    batchSize = None  # (will be replace by method-specific result)
    reportAs = "ncrf"
    requireEof = True
    prngSeed = defaultPrngSeed
    reportProgress = None
    debug = []

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg == "--method=min-max"):
            testMethod = "min-max"
        elif (arg.startswith("--trials=")):
            if ("/" in argVal):
                (numNeededToPass, numTrials) = map(int_with_unit,
                                                   argVal.split("/", 1))
                if (numTrials < 1):
                    usage("bad value in: %s (trials must be at least 1)" % arg)
                if (not 1 <= numNeededToPass <= numTrials):
                    usage(
                        "bad value in: %s (num-in-bounds must be in range 1..trials)"
                        % arg)
            else:
                (numNeededToPass, numTrials) = (1, int_with_unit(argVal))
                if (numTrials < 1):
                    usage("bad value in: %s (trials must be at least 1)" % arg)
        elif (arg in ["--method=chi-squared",
                      "--method=chi-square"]):  # (unadvertised, see [4])
            testMethod = "chi-squared"
        elif (arg.startswith("--effectsize=")):  # (unadvertised, see [4])
            effectSize = parse_probability(argVal)
        elif (arg.startswith("--power=")):  # (unadvertised, see [4])
            power = parse_probability(argVal)
        elif (arg in ["--discard:bad", "--discard=bad"]):
            discardWhich = "bad"
        elif (arg in ["--discard:good", "--discard=good"]):
            discardWhich = "good"
        elif (arg in ["--discard:none", "--discard=none"]):
            discardWhich = "none"
        elif (arg in [
                "--test:matches-insertions", "--test=matches-insertions",
                "--test:m-i", "--test=m-i"
        ]):
            testWhich = "matches-insertions"
        elif (arg in ["--test:matches", "--test=matches"]):
            testWhich = "matches"
        elif (arg in ["--test:errors", "--test=errors"]):
            testWhich = "errors"
        elif (arg == "--warn:untested") or (arg == "--warn=matrix"):
            warnOnUntested = True
        elif (arg.startswith("--head=")):
            headLimit = int_with_unit(argVal)
        elif (arg.startswith("--subsample=")):
            (subsampleK, subsampleN) = map(int, argVal.split("/", 2))
            if (not 0 < subsampleK <= subsampleN):
                usage("bad subsample description in %s" % arg)
        elif (arg.startswith("--progress=")):
            reportProgress = int_with_unit(argVal)
        elif (arg.startswith("--batch=")
              ):  # (no longer advertised, since it only applies to R)
            batchSize = int(argVal)
        elif (arg == "--report:matrix") or (
                arg == "--report=matrix"):  # (unadvertised)
            reportAs = "matrix"
        elif (arg == "--report:silent") or (
                arg == "--report=silent"):  # (unadvertised)
            reportAs = "silent"
        elif (arg in ["--noendmark", "--noeof", "--nomark"]):  # (unadvertised)
            requireEof = False
        elif (arg.startswith("--seed=")):
            seed = argVal
            if (seed in ["none", "None", "NONE"]):
                prngSeed = None
            elif (seed in ["default", "Default", "DEFAULT"]):
                prngSeed = defaultPrngSeed
            else:
                # nota bene: if the seed is a number, use it as a number, since
                #            string seeds can produce different sequences on
                #            different versions/builds of python
                try:
                    seed = int(seed)
                except ValueError:
                    try:
                        seed = float(seed)
                    except ValueError:
                        pass
                prngSeed = seed
        elif (arg == "--debug"):
            debug += ["debug"]
        elif (arg.startswith("--debug=")):
            debug += argVal.split(",")
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        else:
            usage("unrecognized option: %s" % arg)

    if (reportAs in ["matrix", "silent"]):
        discardWhich = "none"

    if (testMethod == "chi-squared"):
        testDescription = "positional chi-squared"
        if (batchSize == None): batchSize = 30
    elif (testMethod == "min-max"):
        testDescription = "positional min-max"
        if (batchSize == None): batchSize = 1
    else:
        exit("%s: internal error: unrecognized test method: \"%s\"" %
             (os_path.basename(argv[0]), testMethod))

    # initialize the PRNG, if needed

    if (testMethod == "min-max"):
        if (prngSeed != None):
            random_seed(prngSeed)
    else:
        if (prngSeed not in [None, defaultPrngSeed]):
            print >>stderr, "WARNING: ignoring request to use PRNG with \"%s\"" \
                          % testMethod

    # make sure the shell commands we're gonna use have been installed

    if (testMethod == "chi-squared"):
        if (not shell_command_exists("Rscript")):
            exit((
                "%s: Unable to run the shell command \"Rscript\";" +
                "\n  .. Either R hasn't been installed, or the command-line shell"
                + " can't find it.") % os_path.basename(argv[0]))

    # collect the alignments; we need to collect the positional info for all
    # alignments, to feed to R in batches (doing them one-by-one was incredibly
    # slow); hopefully this won't become a memory problem

    (unitLength,alignmentList,mxMatrix) \
      = collect_alignments(stdin,testWhich,
                           headLimit=headLimit,
                           subsampleK=subsampleK,subsampleN=subsampleN,
                           requireEof=requireEof)

    numAlignments = len(alignmentList)
    if (reportProgress != None):
        print >>stderr, "progress: read %s alignments" \
                      % (commatize(numAlignments))

    # assess the alignments, batch-by-batch

    if (reportProgress != None):
        progressReported = -1

    accepted = []
    outcomeCount = {True: 0, False: 0, None: 0}
    for batchStartIx in xrange(0, numAlignments, batchSize):
        alignmentsTested = batchStartIx
        if (reportProgress != None):
            rBlock = (progressReported + 1) / reportProgress
            aBlock = (alignmentsTested + 1) / reportProgress
            if (alignmentsTested == 0) or (aBlock != rBlock):
                print >>stderr, "progress: testing alignment %s (%d uniform, %d non-uniform, %d untested)" \
                              % (commatize(1+alignmentsTested),
                                 outcomeCount[True],
                                 outcomeCount[False],
                                 outcomeCount[None])
                progressReported = alignmentsTested

        batchEndIx = min(batchStartIx + batchSize, numAlignments)
        if ("batch" in debug):
            print >>stderr, "using R for alignments %d thru %d" \
                          % (batchStartIx+1,batchEndIx)

        mxBatch = mxMatrix[batchStartIx:batchEndIx]
        aBatch = alignmentList[batchStartIx:batchEndIx]

        if (testMethod == "chi-squared"):
            batchResult = mx_significance_tests(mxBatch, testWhich, effectSize,
                                                power)
            if (type(batchResult) == str):
                exit(("%s: internal error: having trouble with R" +
                      " (with alignment batch %d..%d)" +
                      "\nHere's what R reported:\n%s") % (os_path.basename(
                          argv[0]), batchStartIx, batchEndIx, batchResult))
        else:  # if (testMethod == "min-max"):
            batchResult = min_max_tests(aBatch, mxBatch, batchStartIx,
                                        testWhich, numTrials, numNeededToPass)
            if (type(batchResult) == str):
                exit(("%s: internal error: having trouble with min-max test" +
                      " (with alignment batch %d..%d)" +
                      "\nHere's what was reported:\n%s") % (os_path.basename(
                          argv[0]), batchStartIx, batchEndIx, batchResult))

        if (len(batchResult) != batchEndIx - batchStartIx):
            exit((
                "%s: internal error: number of test outcomes reported by R (%d)"
                + "\n  .. doesn't match the number of tests given to R (%d)") %
                 (os_path.basename(
                     argv[0]), len(batchResult), batchEndIx - batchStartIx))
        accepted += batchResult

        if (warnOnUntested):
            for alignmentNum in xrange(batchStartIx, batchEndIx):
                testOutcome = accepted[alignmentNum]
                if (testOutcome == None):
                    print >>stderr, "WARNING: alignment number %d (at line %d) could not be tested" \
                                  % (alignmentNum,1+alignmentList[alignmentNum].lineNumber)

        for alignmentNum in xrange(batchStartIx, batchEndIx):
            testOutcome = accepted[alignmentNum]
            outcomeCount[testOutcome] += 1

    # process the alignments and their assessments
    # $$$ untested alignments should be processed by some other test -- for
    #     example (if we're testing by error counts), a perfect alignment
    #     currently gets discarded because it can't be tested

    if (reportAs in ["matrix", "silent"]):
        outcomeMapping = {
            True: "not_rejected",
            False: "rejected",
            None: "untested"
        }
    else:  # if (reportAs == "ncrf"):
        if (testWhich == "matches-insertions"):
            outcomeMapping = {
                True: "match-insert uniformity not rejected",
                False: "match-insert uniformity rejected",
                None: "untested"
            }
        elif (testWhich == "errors"):
            outcomeMapping = {
                True: "error uniformity not rejected",
                False: "error uniformity rejected",
                None: "untested"
            }
        else:  # if (testWhich == "matches"):
            outcomeMapping = {
                True: "match uniformity not rejected",
                False: "match uniformity rejected",
                None: "untested"
            }

    outcomeNameW = max(
        [len(outcomeMapping[testOutcome]) for testOutcome in outcomeMapping])
    for testOutcome in [True, False, None]:
        outcomeName = outcomeMapping[testOutcome]
        count = outcomeCount[testOutcome]
        reportStr = "%-*s %d" % (outcomeNameW + 1, "%s:" % outcomeName, count)
        if (numAlignments > 0):
            reportStr += " (%.2f%%)" % (100.0 * count / numAlignments)
        print >> stderr, reportStr

    if (reportAs == "matrix"):
        # see note [3] above for the format of the matrix file
        for (alignmentNum, a) in enumerate(alignmentList):
            testOutcome = accepted[alignmentNum]
            vec = [a.lineNumber, outcomeMapping[testOutcome]
                   ] + mxMatrix[alignmentNum]
            print "\t".join(map(str, vec))
    elif (reportAs == "silent"):
        pass
    else:  # if (reportAs == "ncrf"):
        numKept = 0
        isFirst = True
        for (alignmentNum, a) in enumerate(alignmentList):
            testOutcome = accepted[alignmentNum]
            if (discardWhich == "good"):
                if (testOutcome == True): continue
            elif (discardWhich == "bad"):
                if (testOutcome != True): continue

            if (discardWhich == "none"):
                testInfo = "# %s: %s" % (testDescription,
                                         outcomeMapping[testOutcome])
                (startIx, endIx) = a.positional_stats_indexes()
                a.lines.insert(endIx, testInfo)

            if (isFirst): isFirst = False
            else: print
            print a
            numKept += 1

        reportStr = "kept %d of %d alignments" % (numKept, numAlignments)
        if (numAlignments > 0):
            reportStr += ", %.2f%%" % (100.0 * numKept / numAlignments)
        print >> stderr, reportStr

        if (requireEof):
            print "# ncrf end-of-file"