def main(args):
    args = parse_args(args)
    G = DeBruijnGraph(args.kmer_size)
    #First pass adds nodes
    for name, seq in fastaRead(args.reference):
        name, offset = name.split("_")[:2]
        G.constructNodes(name, offset, seq)
    #Second pass constructs adjacencies
    for name, seq in fastaRead(args.reference):
        G.constructAdjacencies(seq)
    for name, seq in fastaRead(args.normalizing):
        G.addNormalizing(name, seq)
    if args.bad_kmers is not None:
        G.flagNodes(args.bad_kmers)
    if args.weights is not None:
        with open(args.weights) as f:
            G.weightKmers(pickle.load(f))
    G.finishBuild()
    G.pruneGraph()
    pickle.dump(G, args.out)
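# A minimal sketch of the argument parsing that main() above relies on. This is
# an assumption for illustration (the repository's real parse_args() is not
# shown here): the option names simply mirror the attributes main() uses, and
# --out is opened with argparse.FileType so pickle.dump(G, args.out) receives a
# writable file handle rather than a path.
import argparse

def parse_args(args):
    parser = argparse.ArgumentParser(description="Build and pickle a De Bruijn graph")
    parser.add_argument("--reference", required=True, help="Reference fasta")
    parser.add_argument("--normalizing", required=True, help="Normalizing fasta")
    parser.add_argument("--kmer_size", type=int, default=49,
                        help="k-mer size (default here is illustrative)")
    parser.add_argument("--bad_kmers", default=None, help="Optional file of k-mers to flag")
    parser.add_argument("--weights", default=None, help="Optional pickled k-mer weights")
    parser.add_argument("--out", type=argparse.FileType("wb"), required=True,
                        help="Output file for the pickled graph")
    return parser.parse_args(args)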
def realignCigarTargetFn(target, exonerateCigarStringFile, referenceSequenceName,
                         referenceSequence, querySequenceFile, outputCigarFile, options):
    #Temporary files
    tempRefFile = os.path.join(target.getLocalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getLocalTempDir(), "read.fa")
    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)
    #For each cigar string
    for exonerateCigarString, (querySequenceName, querySequence) in \
            zip(open(exonerateCigarStringFile, "r"), fastaRead(querySequenceFile)):
        fastaWrite(tempReadFile, querySequenceName, querySequence)
        #Call to cPecanRealign
        loadHmm = nameValue("loadHmm", options.hmmFile)
        command = ("echo %s | cPecanRealign %s %s --diagonalExpansion=10 "
                   "--splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s >> %s" %
                   (exonerateCigarString[:-1], tempRefFile, tempReadFile, loadHmm,
                    options.gapGamma, options.matchGamma, outputCigarFile))
        try:
            system(command)
        except Exception as e:
            target.logToMaster('Caught an exception! qname = "%s"\n' % querySequenceName)
            target.logToMaster('len(exonerateCigarString[:-1]) = %d\n' %
                               len(exonerateCigarString[:-1]))
            target.logToMaster('[bad] Command that caused the exception:\n')
            target.logToMaster(command + '\n')
            target.logToMaster(str(e) + '\n')
            continue
def getFastaDictionary(fastaFile):
    """Returns a dictionary of the first words of fasta headers to their
    corresponding fasta sequence
    """
    namesAndSequences = map(lambda x : (x[0].split()[0], x[1]),
                            fastaRead(open(fastaFile, 'r')))
    names = map(lambda x : x[0], namesAndSequences)
    assert len(names) == len(set(names)) #Check all the names are unique
    return dict(namesAndSequences) #Hash of names to sequences
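# Hypothetical usage of getFastaDictionary; "reference.fa" is a placeholder
# path. Lookups use only the first whitespace-delimited word of each header, so
# a record headed ">chr1 assembled contig" is retrieved as "chr1":
#
#   refSequences = getFastaDictionary("reference.fa")
#   chr1Sequence = refSequences["chr1"]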
def posteriorProbabilityCalculationTargetFn(target, exonerateCigarStringFile,
                                            referenceSequenceName, referenceSequence,
                                            querySequenceFile, outputPosteriorProbsFile,
                                            options):
    """Calculates the posterior probabilities of matches in a set of pairwise
    alignments between a reference sequence and a set of reads.
    """
    #Temporary files
    tempRefFile = os.path.join(target.getLocalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getLocalTempDir(), "read.fa")
    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)
    #Hash to store posterior probabilities in
    expectationsOfBasesAtEachPosition = {}
    #For each cigar string
    for exonerateCigarString, (querySequenceName, querySequence) in \
            zip(open(exonerateCigarStringFile, "r"), fastaRead(querySequenceFile)):
        fastaWrite(tempReadFile, querySequenceName, querySequence)
        #Call to cPecanRealign
        tempPosteriorProbsFile = os.path.join(target.getLocalTempDir(), "posteriorProbs.txt")
        if options.noMargin:
            #When we don't marginalize we just run cPecanRealign to get the list
            #of aligned pairs; this runtime should be very fast
            system("echo %s | cPecanRealign %s %s --diagonalExpansion=0 "
                   "--splitMatrixBiggerThanThis=1 --rescoreOriginalAlignment "
                   "--outputPosteriorProbs=%s" %
                   (exonerateCigarString[:-1], tempRefFile, tempReadFile,
                    tempPosteriorProbsFile))
        else:
            system("echo %s | cPecanRealign %s %s --diagonalExpansion=10 "
                   "--splitMatrixBiggerThanThis=100 --outputAllPosteriorProbs=%s "
                   "--loadHmm=%s" %
                   (exonerateCigarString[:-1], tempRefFile, tempReadFile,
                    tempPosteriorProbsFile, options.alignmentModel))
        #Now collate the reference position expectations
        for refPosition, queryPosition, posteriorProb in \
                map(lambda x : map(float, x.split()), open(tempPosteriorProbsFile, 'r')):
            assert 0.0 <= posteriorProb <= 1.01 #Allow slight numerical overshoot above 1.0
            key = (referenceSequenceName, int(refPosition))
            if key not in expectationsOfBasesAtEachPosition:
                expectationsOfBasesAtEachPosition[key] = dict(zip(BASES, [0.0] * len(BASES)))
            queryBase = querySequence[int(queryPosition)].upper()
            if queryBase in BASES: #Could be an N or other wildcard character, which we ignore
                expectationsOfBasesAtEachPosition[key][queryBase] += \
                    1.0 if options.noMargin else posteriorProb
    #Pickle the posterior probs
    fileHandle = open(outputPosteriorProbsFile, 'w')
    cPickle.dump(expectationsOfBasesAtEachPosition, fileHandle, cPickle.HIGHEST_PROTOCOL)
    fileHandle.close()
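# A sketch (not from the source) of how the pickled expectations might be
# consumed downstream: normalize each position's per-base weights and take the
# argmax as a base call. It assumes only the
# {(referenceName, refPosition): {base: weight}} layout produced above.
import cPickle

def callBasesFromExpectations(posteriorProbsFile):
    fileHandle = open(posteriorProbsFile)
    expectations = cPickle.load(fileHandle)
    fileHandle.close()
    calls = {}
    for key, baseWeights in expectations.iteritems():
        total = sum(baseWeights.values())
        if total > 0.0:
            #The base with the largest accumulated weight/expectation wins
            calls[key] = max(baseWeights, key=baseWeights.get)
    return calls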
def merge(target, files, outputDir):
    """Merges all muscle output into one fasta and runs metrics() on each"""
    for typeof in files:
        outmetrics = open(os.path.join(outputDir, typeof + "_metrics.tsv"), "w")
        outmetrics.write("Read\tReference\tMatches\tMismatches\tReadDeletionLength\t"
                         "ReadInsertionLength\tIdentity\tReferenceCoverage\n")
        for f in files[typeof]:
            handle = fastaRead(f)
            name, seq = handle.next()
            ref_name, ref_seq = handle.next()
            name = name.lstrip(">")
            ref_name = ref_name.lstrip(">")
            outmetrics.write("\t".join([name, ref_name] + metrics(seq, ref_seq)))
            outmetrics.write("\n")
        outmetrics.close()
def run(self):
    refSequences = dict(fastaRead(open(self.referenceFastaFile, 'r'))) #Hash of names to sequences
    readSequences = dict([(name, seq) for name, seq, quals in
                          fastqRead(self.readFastqFile)]) #Hash of names to sequences
    sam = pysam.Samfile(self.samFile, "r")
    overallIndelCounter = IndelCounter("overall", "overall")
    for aR in sam: #Iterate on the sam lines
        refSeq = refSequences[sam.getrname(aR.rname)]
        readSeq = readSequences[aR.qname]
        overallIndelCounter.addReadAlignment(aR, refSeq, readSeq)
    sam.close()
    #Write out the indel info
    open(os.path.join(self.outputDir, "indels.xml"), 'w').write(
        prettyXml(overallIndelCounter.getXML()))
def makeFastaSequenceNamesUnique(inputFastaFile, outputFastaFile):
    """Makes a fasta file with unique names
    """
    names = set()
    fileHandle = open(outputFastaFile, 'w')
    for name, seq in fastaRead(open(inputFastaFile, 'r')):
        while name in names:
            logger.critical("Got a duplicate fasta sequence name: %s" % name)
            name += "i"
        names.add(name)
        fastaWrite(fileHandle, name, seq)
    fileHandle.close()
    return outputFastaFile
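# Worked example of the renaming loop above, for clarity: if the input contains
# three records all named "read", they are written out as "read", "readi" and
# "readii", with a critical log message emitted for each collision.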
def run(self):
    refSequences = dict(fastaRead(open(self.referenceFastaFile, 'r'))) #Hash of names to sequences
    readSequences = dict([(name, seq) for name, seq, quals in
                          fastqRead(self.readFastqFile)]) #Hash of names to sequences
    overallCoverageCounter = CoverageCounter("overall", "overall") #Thing to store the overall coverage in
    readCoverages = []
    sam = pysam.Samfile(self.samFile, "r")
    for aR in sam: #Iterate on the sam lines
        refSeq = refSequences[sam.getrname(aR.rname)]
        readSeq = readSequences[aR.qname]
        overallCoverageCounter.addReadAlignment(aR, refSeq, readSeq)
        readCoverages.append(CoverageCounter(aR.qname, sam.getrname(aR.rname)))
        readCoverages[-1].addReadAlignment(aR, refSeq, readSeq)
    sam.close()
    #Write out the coverage info
    parentNode = overallCoverageCounter.getXML()
    for readCoverage in readCoverages:
        parentNode.append(readCoverage.getXML())
    open(os.path.join(self.outputDir, "coverages.xml"), 'w').write(prettyXml(parentNode))
def countKmers(self):
    refKmers, readKmers = Counter(), Counter()
    for name, seq in fastaRead(self.referenceFastaFile):
        #Iterate end positions up to len(seq) inclusive so the final k-mer is counted
        for i in xrange(self.kmerSize, len(seq) + 1):
            s = seq[i - self.kmerSize:i]
            if "N" not in s:
                refKmers[s] += 1
                refKmers[reverseComplement(s)] += 1
    for name, seq, qual in fastqRead(self.readFastqFile):
        for i in xrange(self.kmerSize, len(seq) + 1):
            s = seq[i - self.kmerSize:i]
            if "N" not in s:
                readKmers[s] += 1
                readKmers[reverseComplement(s)] += 1
    return (refKmers, readKmers)
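# Standalone illustration of the k-mer windowing used above: iterating end
# positions from k to len(seq) inclusive makes seq[i - k:i] cover every k-mer,
# including the final one. kmersOf is a hypothetical helper, not source code.
def kmersOf(seq, k):
    for i in xrange(k, len(seq) + 1):
        yield seq[i - k:i]

assert list(kmersOf("ACGTA", 3)) == ["ACG", "CGT", "GTA"]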
def realignCigarTargetFn(target, exonerateCigarStringFile, referenceSequenceName,
                         referenceSequence, querySequenceFile, outputCigarFile, options):
    #Temporary files
    tempRefFile = os.path.join(target.getLocalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getLocalTempDir(), "read.fa")
    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)
    #For each cigar string
    for exonerateCigarString, (querySequenceName, querySequence) in \
            zip(open(exonerateCigarStringFile, "r"), fastaRead(querySequenceFile)):
        fastaWrite(tempReadFile, querySequenceName, querySequence)
        #Call to cPecanRealign
        loadHmm = nameValue("loadHmm", options.hmmFile)
        system("echo \"%s\" | cPecanRealign %s %s --diagonalExpansion=10 "
               "--splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s >> %s" %
               (exonerateCigarString[:-1], tempRefFile, tempReadFile, loadHmm,
                options.gapGamma, options.matchGamma, outputCigarFile))
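# For reference, each line of exonerateCigarStringFile is expected to be an
# exonerate-style cigar string (illustrative values only; the field order is
# query id/start/end/strand, target id/start/end/strand, score, then operations):
#
#   cigar: read_1 0 1000 + chr1 5000 6020 + 2500 M 340 D 2 M 658
#
# The trailing newline is stripped ([:-1]) before the line is echoed into
# cPecanRealign.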
def main():
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    outputDir = "muscle_compare_2d/output/"
    if not os.path.exists(outputDir):
        logger.info("Output dir {} does not exist. Creating.".format(outputDir))
        os.mkdir(outputDir)
    if len(os.listdir(outputDir)) > 0:
        logger.info("Output dir not empty.")

    if len(args) != 3:
        raise RuntimeError("Error: expected three arguments, got %s arguments: %s"
                           % (len(args), " ".join(args)))

    templateRecords = {x.qname for x in pysam.Samfile(args[0]) if not x.is_unmapped}
    complementRecords = {x.qname for x in pysam.Samfile(args[1]) if not x.is_unmapped}
    twodSamFile = pysam.Samfile(args[2])
    twodRecords = {x.qname: x for x in twodSamFile if not x.is_unmapped}

    recordsToAnalyze = dict()
    for name, record in twodRecords.iteritems():
        if name not in templateRecords and name not in complementRecords:
            ref_name = twodSamFile.getrname(record.tid)
            ref_start, ref_stop = int(record.aend - record.alen), int(record.aend)
            recordsToAnalyze[name] = [ref_name, ref_start, ref_stop]

    if os.path.exists("../readFastqFiles/template/") and \
            os.path.exists("../readFastqFiles/complement"):
        templateFastqFiles = [os.path.join("../readFastqFiles/template/", x)
                              for x in os.listdir("../readFastqFiles/template/")
                              if x.endswith(".fastq") or x.endswith(".fq")]
        complementFastqFiles = [os.path.join("../readFastqFiles/complement/", x)
                                for x in os.listdir("../readFastqFiles/complement/")
                                if x.endswith(".fastq") or x.endswith(".fq")]
    else:
        raise RuntimeError("Error: readFastqFiles does not contain template and/or "
                           "complement folders")

    referenceFastaFiles = [os.path.join("../referenceFastaFiles", x)
                           for x in os.listdir("../referenceFastaFiles")
                           if x.endswith(".fa") or x.endswith(".fasta")]
    if len(referenceFastaFiles) > 0:
        references = {y[0].split(" ")[0]: y[1]
                      for x in referenceFastaFiles for y in fastaRead(x)}
    else:
        raise RuntimeError("Error: no reference fasta files")

    if len(recordsToAnalyze) == 0:
        raise RuntimeError("Error: all mapped twoD reads in this set also mapped as "
                           "template/complement, leaving nothing to analyze.")

    logger.info("Starting to find analyses to run...")
    args = (recordsToAnalyze, templateFastqFiles, complementFastqFiles, references,
            outputDir)
    i = Stack(Target.makeTargetFn(find_analyses, args=args)).startJobTree(options)
    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))
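# Directory layout that main() above assumes, reconstructed from the hard-coded
# paths in the function (an inference from the code, not documented in the source):
#
#   ../readFastqFiles/template/    *.fastq / *.fq
#   ../readFastqFiles/complement/  *.fastq / *.fq
#   ../referenceFastaFiles/        *.fa / *.fasta
#   muscle_compare_2d/output/      created if missing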