def analyzeCounts(self, refKmers, readKmers, name):
    refSize, readSize = sum(refKmers.values()), sum(readKmers.values())
    outf = open(os.path.join(self.outputDir, name + "kmer_counts.txt"), "w")
    outf.write("kmer\trefCount\trefFraction\treadCount\treadFraction\tlogFoldChange\n")
    if refSize > 0 and readSize > 0:
        for kmer in itertools.product("ATGC", repeat=5):
            refFraction, readFraction = 1.0 * refKmers[kmer] / refSize, 1.0 * readKmers[kmer] / readSize
            if refFraction == 0:
                foldChange = "-Inf"
            elif readFraction == 0:
                foldChange = "Inf"
            else:
                foldChange = -log(readFraction / refFraction)
            outf.write("\t".join(map(str, ["".join(kmer), refKmers[kmer], refFraction,
                                           readKmers[kmer], readFraction, foldChange])) + "\n")
    outf.close()
    system("Rscript nanopore/analyses/kmer_analysis.R {} {} {} {} {}".format(
        os.path.join(self.outputDir, name + "kmer_counts.txt"),
        os.path.join(self.outputDir, name + "pval_kmer_counts.txt"),
        os.path.join(self.outputDir, name + "top_bot_sigkmer_counts.txt"),
        os.path.join(self.outputDir, name + "volcano_plot.pdf"),
        "Indel_Kmer"))
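# Hedged sketch (not part of the original module) of a helper that builds the
# tuple-keyed k-mer dictionaries analyzeCounts() expects: a Counter keyed by
# 5-tuples of bases, matching itertools.product("ATGC", repeat=5) above.
from collections import Counter

def countKmersAsTuples(seq, kmerSize=5):
    """Count overlapping k-mers in seq, keyed by base tuples."""
    counts = Counter()
    seq = seq.upper()
    for i in xrange(len(seq) - kmerSize + 1):
        kmer = tuple(seq[i:i + kmerSize])
        if all(b in "ATGC" for b in kmer):  # skip Ns and other wildcards
            counts[kmer] += 1
    return counts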
def testScriptTree_Example(self):
    """Uses the jobTreeTest code to test the scriptTree Target wrapper.
    """
    for test in xrange(self.testNo):
        command = "scriptTreeTest_Wrapper.py --jobTree %s --logLevel=INFO --retryCount=10" % self.jobTreeDir
        system(command)
        runJobTreeStatusAndFailIfNotComplete(self.jobTreeDir)
def run(self):
    for readType in self.readTypes:
        sortedBaseMappers = [x for x in sorted(self.baseMappers) if x != "Combined"]
        outf = open(os.path.join(self.outputDir, readType + "_perReadMappability.tsv"), "w")
        outf.write("Read\tReadFastqFile\t")
        outf.write("\t".join(sortedBaseMappers))
        outf.write("\n")
        for read in self.reads:
            if read.readType == readType:
                tmp = od([[x, 0] for x in sortedBaseMappers])
                if read.is_mapped is True:
                    for mapper, reference in read.get_map_ref_pair():
                        baseMapper = re.findall("[A-Z][a-z]*", mapper)[0]
                        #hacky way to avoid including 'combined' analysis
                        if baseMapper != "Combined" and tmp[baseMapper] == 0:
                            tmp[baseMapper] = 1
                outf.write("\t".join([read.name, os.path.basename(read.readFastqFile)] + map(str, tmp.values())))
                outf.write("\n")
        outf.close()
        system("Rscript nanopore/metaAnalyses/vennDiagram.R {} {}".format(
            os.path.join(self.outputDir, readType + "_perReadMappability.tsv"),
            os.path.join(self.outputDir, readType + "_perReadMappabilityVennDiagram.pdf")))
def run(self):
    AbstractAnalysis.run(self) #Call base method to do some logging
    refSequences = getFastaDictionary(self.referenceFastaFile) #Hash of names to sequences
    readSequences = getFastqDictionary(self.readFastqFile) #Hash of names to sequences
    sam = pysam.Samfile(self.samFile, "r")
    indelCounters = map(lambda aR: IndelCounter(sam.getrname(aR.rname), refSequences[sam.getrname(aR.rname)],
                                                aR.qname, readSequences[aR.qname], aR),
                        samIterator(sam)) #Iterate on the sam lines
    sam.close()
    #Write out the indel info
    if len(indelCounters) > 0:
        indelXML = getAggregateIndelStats(indelCounters)
        open(os.path.join(self.outputDir, "indels.xml"), "w").write(prettyXml(indelXML))
        tmp = open(os.path.join(self.outputDir, "indels.tsv"), "w")
        #build list of data as vectors
        data_list = []
        var = ["readInsertionLengths", "readDeletionLengths", "ReadSequenceLengths",
               "NumberReadInsertions", "NumberReadDeletions",
               "MedianReadInsertionLengths", "MedianReadDeletionLengths"]
        for x in var:
            data_list.append([x] + indelXML.attrib[x].split())
        #transpose this list (Python 2 map(None, ...) zips the rows, padding short
        #rows with None) so R doesn't take hours to load it
        data_list = map(None, *data_list)
        for line in data_list:
            tmp.write("\t".join(map(str, line)))
            tmp.write("\n")
        tmp.close()
        system("Rscript nanopore/analyses/indelPlots.R {} {}".format(
            os.path.join(self.outputDir, "indels.tsv"),
            os.path.join(self.outputDir, "indel_plots.pdf")))
    self.finish() #Indicates the batch is done
def testScriptTree_Example2(self):
    """Tests that the global and local temp dirs of a job behave as expected.
    """
    for test in xrange(self.testNo):
        command = "scriptTreeTest_Wrapper2.py --jobTree %s --logLevel=INFO --retryCount=0" % self.jobTreeDir
        system(command)
        runJobTreeStatusAndFailIfNotComplete(self.jobTreeDir)
def runJellyfish(localTempDir, countFile, fastqFile, uuid, kmerSize=49):
    """
    Runs jellyfish. -C flag is set to count both strands together.
    """
    jfFile = os.path.join(localTempDir, uuid + ".jf")
    system("jellyfish count -C -m {} -s 300M -o {} {}".format(kmerSize, jfFile, fastqFile))
    system("jellyfish dump {} > {}".format(jfFile, countFile))
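# Hedged example (not in the original source) of reading the dump back in.
# jellyfish dump emits FASTA-like records: a ">count" header line followed by
# the k-mer sequence on the next line.
from collections import Counter

def parseJellyfishCounts(countFile):
    """Parse `jellyfish dump` output into a Counter of kmer -> count."""
    counts = Counter()
    with open(countFile) as f:
        for header in f:
            kmer = f.next().strip()  # sequence line following each header
            counts[kmer] = int(header.lstrip(">").strip())
    return counts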
def run(self, globalAlignment=False):
    AbstractAnalysis.run(self) #Call base method to do some logging
    refSequences = getFastaDictionary(self.referenceFastaFile) #Hash of names to sequences
    readSequences = getFastqDictionary(self.readFastqFile) #Hash of names to sequences
    sam = pysam.Samfile(self.samFile, "r")
    readsToReadCoverages = {}
    for aR in samIterator(sam): #Iterate on the sam lines
        refSeq = refSequences[sam.getrname(aR.rname)]
        readSeq = readSequences[aR.qname]
        readAlignmentCoverageCounter = ReadAlignmentCoverageCounter(aR.qname, readSeq, sam.getrname(aR.rname),
                                                                    refSeq, aR, globalAlignment)
        if aR.qname not in readsToReadCoverages:
            readsToReadCoverages[aR.qname] = []
        readsToReadCoverages[aR.qname].append(readAlignmentCoverageCounter)
    sam.close()
    #Write out the coverage info for differing subsets of the read alignments
    if len(readsToReadCoverages.values()) > 0:
        for readCoverages, outputName in [
                (reduce(lambda x, y: x + y, readsToReadCoverages.values()), "coverage_all"),
                (map(lambda x: max(x, key=lambda y: y.readCoverage()), readsToReadCoverages.values()), "coverage_bestPerRead")]:
            parentNode = getAggregateCoverageStats(readCoverages, outputName, refSequences,
                                                   readSequences, readsToReadCoverages, outputName)
            open(os.path.join(self.outputDir, outputName + ".xml"), 'w').write(prettyXml(parentNode))
            #this is an ugly file format with each line being a different data type - column length is variable
            outf = open(os.path.join(self.outputDir, outputName + ".txt"), "w")
            outf.write("MappedReadLengths " + parentNode.get("mappedReadLengths") + "\n")
            outf.write("UnmappedReadLengths " + parentNode.get("unmappedReadLengths") + "\n")
            outf.write("ReadCoverage " + parentNode.get("distributionreadCoverage") + "\n")
            outf.write("MismatchesPerReadBase " + parentNode.get("distributionmismatchesPerReadBase") + "\n")
            outf.write("ReadIdentity " + parentNode.get("distributionidentity") + "\n")
            outf.write("InsertionsPerBase " + parentNode.get("distributioninsertionsPerReadBase") + "\n")
            outf.write("DeletionsPerBase " + parentNode.get("distributiondeletionsPerReadBase") + "\n")
            outf.close()
            system("Rscript nanopore/analyses/coverage_plot.R {} {}".format(
                os.path.join(self.outputDir, outputName + ".txt"),
                os.path.join(self.outputDir, outputName + ".pdf")))
    self.finish()
def Substitutions(readFastqFile, referenceFastaFile, samFile, outputDir, kmer=6):
    """Calculates stats on substitutions
    """
    refSequences = getFastaDictionary(referenceFastaFile) #Hash of names to sequences
    readSequences = getFastqDictionary(readFastqFile) #Hash of names to sequences
    sM = SubstitutionMatrix() #The thing to store the counts in
    sam = pysam.Samfile(samFile, "r")
    for aR in samIterator(sam): #Iterate on the sam lines
        for aP in AlignedPair.iterator(aR, refSequences[sam.getrname(aR.rname)],
                                       readSequences[aR.qname]): #Walk through the matches and mismatches
            sM.addAlignedPair(aP.getRefBase(), aP.getReadBase())
    sam.close()
    #Write out the substitution info
    open(os.path.join(outputDir, "substitutions.xml"), 'w').write(prettyXml(sM.getXML()))
    bases = "ACGT"
    outf = open(os.path.join(outputDir, "subst.tsv"), "w")
    outf.write("A\tC\tG\tT\n")
    for x in bases:
        freqs = sM.getFreqs(x, bases)
        outf.write("{}\t{}\n".format(x, "\t".join(map(str, freqs))))
    outf.close()
    analysis = str(samFile.split("/")[-1].split(".sam")[0])
    system("Rscript scripts/substitution_plot.R {} {} {}".format(
        os.path.join(outputDir, "subst.tsv"),
        os.path.join(outputDir, "substitution_plot.pdf"),
        analysis))
def run(self, kmerSize=5):
    self.kmerSize = kmerSize
    for readType in self.readTypes:
        mappedKmers, unmappedKmers = Counter(), Counter()
        for read in self.reads:
            if read.readType == readType and read.is_mapped:
                mappedKmers += self.countKmers(read.seq)
            elif read.readType == readType:
                unmappedKmers += self.countKmers(read.seq)
        mappedSize, unmappedSize = sum(mappedKmers.values()), sum(unmappedKmers.values())
        outf = open(os.path.join(self.getLocalTempDir(), readType + "_kmer_counts.txt"), "w")
        outf.write("kmer\tmappableCount\tmappableFraction\tunmappableCount\tunmappableFraction\tlogFoldChange\n")
        for kmer in itertools.product("ATGC", repeat=self.kmerSize):
            kmer = "".join(kmer)
            if mappedSize > 0:
                mappedFraction = 1.0 * mappedKmers[kmer] / mappedSize
            else:
                mappedFraction = 0
            if unmappedSize > 0:
                unmappedFraction = 1.0 * unmappedKmers[kmer] / unmappedSize
            else:
                unmappedFraction = 0
            if unmappedFraction == 0:
                foldChange = "-Inf"
            elif mappedFraction == 0:
                foldChange = "Inf"
            else:
                foldChange = -log(mappedFraction / unmappedFraction)
            outf.write("\t".join(map(str, [kmer, mappedKmers[kmer], mappedFraction,
                                           unmappedKmers[kmer], unmappedFraction, foldChange])) + "\n")
        outf.close()
        system("Rscript nanopore/metaAnalyses/mappable_kmer_analysis.R {} {} {} {}".format(
            os.path.join(self.getLocalTempDir(), readType + "_kmer_counts.txt"),
            os.path.join(self.outputDir, readType + "_unmapped_kmer_counts.txt"),
            os.path.join(self.outputDir, readType + "_unmapped_top_bot_sigkmer_counts.txt"),
            os.path.join(self.outputDir, readType + "_volcano_plot.pdf")))
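# Hedged sketch of the countKmers helper used above (the real method is
# defined elsewhere in the class); string-keyed, unlike the tuple-keyed
# counter sketched for analyzeCounts().
def countKmers(self, seq):
    counts = Counter()
    seq = seq.upper()
    for i in xrange(len(seq) - self.kmerSize + 1):
        kmer = seq[i:i + self.kmerSize]
        if "N" not in kmer:  # assumption: wildcard bases are skipped
            counts[kmer] += 1
    return counts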
def run(self):
    AbstractAnalysis.run(self) #Call base method to do some logging
    refSequences = getFastaDictionary(self.referenceFastaFile) #Hash of names to sequences
    readSequences = getFastqDictionary(self.readFastqFile) #Hash of names to sequences
    sam = pysam.Samfile(self.samFile, "r")
    #The data we collect
    avgPosteriorMatchProbabilityInCigar = []
    alignedPairsInCigar = []
    posteriorMatchProbabilities = []
    for aR in samIterator(sam): #Iterate on the sam lines
        #Exonerate format Cigar string
        cigarString = getExonerateCigarFormatString(aR, sam)
        #Temporary files
        tempCigarFile = os.path.join(self.getLocalTempDir(), "rescoredCigar.cig")
        tempRefFile = os.path.join(self.getLocalTempDir(), "ref.fa")
        tempReadFile = os.path.join(self.getLocalTempDir(), "read.fa")
        tempPosteriorProbsFile = os.path.join(self.getLocalTempDir(), "probs.tsv")
        #Write the temporary files.
        fastaWrite(tempRefFile, sam.getrname(aR.rname), refSequences[sam.getrname(aR.rname)])
        fastaWrite(tempReadFile, aR.qname, aR.query)
        #Trained hmm file to use.
        hmmFile = os.path.join(pathToBaseNanoporeDir(), "nanopore", "mappers", "blasr_hmm_0.txt")
        #Call to cactus_realign
        system("echo %s | cactus_realign %s %s --rescoreByPosteriorProbIgnoringGaps --rescoreOriginalAlignment --diagonalExpansion=10 --splitMatrixBiggerThanThis=100 --outputPosteriorProbs=%s --loadHmm=%s > %s" %
               (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile, hmmFile, tempCigarFile))
        #Load the cigar and get the posterior prob
        assert len([pA for pA in cigarRead(open(tempCigarFile))]) > 0
        assert len([pA for pA in cigarRead(open(tempCigarFile))]) == 1
        pA = [i for i in cigarRead(open(tempCigarFile))][0]
        avgPosteriorMatchProbabilityInCigar.append(pA.score)
        #Calculate the number of aligned pairs in the cigar
        alignedPairsInCigar.append(sum([op.length for op in pA.operationList if op.type == PairwiseAlignment.PAIRWISE_MATCH]))
        assert alignedPairsInCigar[-1] == len([readPos for readPos, refPos in aR.aligned_pairs if readPos != None and refPos != None])
        #Get the posterior probs
        #posteriorMatchProbabilities += [ float(line.split()[2]) for line in open(tempPosteriorProbsFile) ]
    sam.close()
    #Write out the alignment uncertainty info
    node = ET.Element("alignmentUncertainty", {
        "averagePosteriorMatchProbabilityPerRead": str(self.formatRatio(sum(avgPosteriorMatchProbabilityInCigar), len(avgPosteriorMatchProbabilityInCigar))),
        "averagePosteriorMatchProbability": str(self.formatRatio(float(sum([avgMatchProb * alignedPairs for avgMatchProb, alignedPairs in zip(avgPosteriorMatchProbabilityInCigar, alignedPairsInCigar)])), sum(alignedPairsInCigar))),
        "averagePosteriorMatchProbabilitesPerRead": ",".join([str(i) for i in avgPosteriorMatchProbabilityInCigar]),
        "alignedPairsInCigar": ",".join([str(i) for i in alignedPairsInCigar])})
    open(os.path.join(self.outputDir, "alignmentUncertainty.xml"), "w").write(prettyXml(node))
    if len(avgPosteriorMatchProbabilityInCigar) > 0:
        outf = open(os.path.join(self.getLocalTempDir(), "tmp_uncertainty"), "w")
        outf.write("\t".join([str(i) for i in avgPosteriorMatchProbabilityInCigar]))
        outf.write("\n")
        outf.close()
        system("Rscript nanopore/analyses/match_hist.R {} {}".format(
            os.path.join(self.getLocalTempDir(), "tmp_uncertainty"),
            os.path.join(self.outputDir, "posterior_prob_hist.pdf")))
    #Indicate everything is all done
    self.finish()
def realignCigarTargetFn(target, exonerateCigarStringFile, referenceSequenceName,
                         referenceSequence, querySequenceFile, outputCigarFile, options):
    #Temporary files
    tempRefFile = os.path.join(target.getLocalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getLocalTempDir(), "read.fa")
    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)
    #For each cigar string
    for exonerateCigarString, (querySequenceName, querySequence) in \
            zip(open(exonerateCigarStringFile, "r"), fastaRead(querySequenceFile)):
        fastaWrite(tempReadFile, querySequenceName, querySequence)
        #Call to cPecanRealign
        loadHmm = nameValue("loadHmm", options.hmmFile)
        command = "echo %s | cPecanRealign %s %s --diagonalExpansion=10 " \
                  "--splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s >> %s" % \
                  (exonerateCigarString[:-1], tempRefFile, tempReadFile, loadHmm,
                   options.gapGamma, options.matchGamma, outputCigarFile)
        try:
            system(command)
            # target.logToMaster('[good] ' + command + '\n')
        except Exception, e:
            target.logToMaster('Caught an exception! qname = "%s"\n' % querySequenceName)
            target.logToMaster('len(exonerateCigarString[:-1]) = %d\n' % (len(exonerateCigarString[:-1])))
            target.logToMaster('[bad] Command that caused the exception:\n')
            target.logToMaster(command)
            target.logToMaster('\n' + str(e) + '\n')
            continue
def run(self):
    os.chdir(self.directory)
    if self.paramFile is not None:
        cmd = "%s %s -b \"%s\" -t %s -s same -n %s -i %s -e %s -d %s -p %s %s >& jt.err" % (
            sys.executable, prepareExec, self.disc, self.paramFile, self.nullBatches,
            self.inferSpec, self.paradigmExec, self.dogmaLib, self.pathwayLib, self.evidSpec)
    else:
        cmd = "%s %s -b \"%s\" -s same -n %s -i %s -e %s -d %s -p %s %s >& jt.err" % (
            sys.executable, prepareExec, self.disc, self.nullBatches, self.inferSpec,
            self.paradigmExec, self.dogmaLib, self.pathwayLib, self.evidSpec)
    system(cmd)
    self.setFollowOnTarget(jtParadigm(self.em, self.directory))
def downloadQuery(fastqPath, tempDir, key, queryString, uuid):
    """
    Downloads data from CGHub BAM Slicer
    """
    system("""curl --silent "{}" -u "{}" | samtools bamshuf -Ou /dev/stdin {} | samtools bam2fq /dev/stdin > {}""".format(
        queryString, "haussler:" + key, os.path.join(tempDir, "tmp"), fastqPath))
    if os.path.getsize(fastqPath) < 513:
        #an empty or header-only download is assumed to be under ~512 bytes
        raise RuntimeError("curl did not download a BAM for {}. exiting.".format(uuid))
def run(self):
    counts = defaultdict(list)
    for d in self.dict_iter():
        for x, y in d.iteritems():
            counts[x].append(y)
    G = pickle.load(open(self.graph))
    kmers = G.kmers
    added_counts = {}
    for k in kmers:
        added_counts[k] = sum(counts[k])
    with open(os.path.join(self.out_dir, "bad_kmers.fasta"), "w") as outf:
        for k in kmers:
            if added_counts[k] == 0:
                G.G.edge[k + "_L"][k + "_R"]['bad'] = True
                del added_counts[k]
                outf.write(">{0}\n{0}\n".format(k))
    filtered_kmers = sorted(added_counts.iterkeys())
    with open(os.path.join(self.out_dir, "combined_counts.txt"), "w") as outf:
        for k in filtered_kmers:
            outf.write("{}\t{}\n".format(k, G.weights[k] * added_counts[k]))
    variances = {}
    for k in filtered_kmers:
        variances[k] = np.var(np.asarray(counts[k]))
    with open(os.path.join(self.out_dir, "variances.txt"), "w") as outf:
        for k in filtered_kmers:
            outf.write("{}\t{}\n".format(k, variances[k]))
    weights = {}
    for k in filtered_kmers:
        input_sequences = G.G.edge[k + "_L"][k + "_R"]['positions'].keys()
        #avg_frac_dict is assumed to be defined at module scope, mapping each
        #input sequence to its average fraction of the counts
        weights[k] = 1.0 * len(self.count_files) * sum(avg_frac_dict[x] for x in input_sequences) / (added_counts[k] + 1)
    with open(os.path.join(self.out_dir, "weight_bad_kmers.fasta"), "w") as outf:
        for k in filtered_kmers:
            if weights[k] > 4.0 or weights[k] < 1.0:
                G.G.edge[k + "_L"][k + "_R"]['bad'] = True
                outf.write(">{}\n{}\n".format(weights[k], k))
    with open(os.path.join(self.out_dir, "weights.txt"), "w") as outf:
        for k in weights:
            outf.write("{}\t{}\n".format(k, weights[k]))
    weights = {x: y for x, y in weights.iteritems() if y <= 4.0 and y >= 1.0}
    G.weightKmers(weights)
    with open(self.new_graph, "w") as outf:
        pickle.dump(G, outf)
    system("Rscript src/weights.R {} {} {} {} {}".format(
        os.path.join(self.out_dir, "combined_counts.txt"),
        os.path.join(self.out_dir, "weights.txt"),
        os.path.join(self.out_dir, "variances.txt"),
        len(self.count_files), "weighting_metrics.pdf"))
def posteriorProbabilityCalculationTargetFn(target, exonerateCigarStringFile, referenceSequenceName,
                                            referenceSequence, querySequenceFile,
                                            outputPosteriorProbsFile, options):
    """Calculates the posterior probabilities of matches in a set of pairwise alignments
    between a reference sequence and a set of reads.
    """
    #Temporary files
    tempRefFile = os.path.join(target.getLocalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getLocalTempDir(), "read.fa")
    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)
    #Hash to store posterior probabilities in
    expectationsOfBasesAtEachPosition = {}
    #For each cigar string
    for exonerateCigarString, (querySequenceName, querySequence) in \
            zip(open(exonerateCigarStringFile, "r"), fastaRead(querySequenceFile)):
        fastaWrite(tempReadFile, querySequenceName, querySequence)
        #Call to cPecanRealign
        tempPosteriorProbsFile = os.path.join(target.getLocalTempDir(), "posteriorProbs.txt")
        if options.noMargin:
            #When we don't marginalize we just run cPecanRealign to get the list of aligned pairs
            #This runtime should be very fast
            system("echo %s | cPecanRealign %s %s --diagonalExpansion=0 "
                   "--splitMatrixBiggerThanThis=1 --rescoreOriginalAlignment --outputPosteriorProbs=%s" %
                   (exonerateCigarString[:-1], tempRefFile, tempReadFile, tempPosteriorProbsFile))
        else:
            system("echo %s | cPecanRealign %s %s --diagonalExpansion=10 "
                   "--splitMatrixBiggerThanThis=100 --outputAllPosteriorProbs=%s --loadHmm=%s" %
                   (exonerateCigarString[:-1], tempRefFile, tempReadFile, tempPosteriorProbsFile, options.alignmentModel))
        #Now collate the reference position expectations
        for refPosition, queryPosition, posteriorProb in \
                map(lambda x: map(float, x.split()), open(tempPosteriorProbsFile, 'r')):
            assert posteriorProb <= 1.01
            assert posteriorProb >= 0.0
            key = (referenceSequenceName, int(refPosition))
            if key not in expectationsOfBasesAtEachPosition:
                expectationsOfBasesAtEachPosition[key] = dict(zip(BASES, [0.0] * len(BASES)))
            queryBase = querySequence[int(queryPosition)].upper()
            if queryBase in BASES: #Could be an N or other wildcard character, which we ignore
                expectationsOfBasesAtEachPosition[key][queryBase] += 1.0 if options.noMargin else posteriorProb
    #Pickle the posterior probs
    fileHandle = open(outputPosteriorProbsFile, 'w')
    cPickle.dump(expectationsOfBasesAtEachPosition, fileHandle, cPickle.HIGHEST_PROTOCOL)
    fileHandle.close()
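# Hedged usage sketch (not in the original): load the pickled expectations
# produced above and call the highest-expectation base at each reference
# position.
import cPickle

def callMaxExpectationBases(posteriorProbsFile):
    with open(posteriorProbsFile) as f:
        expectations = cPickle.load(f)
    calls = {}
    for (refName, refPosition), baseExpectations in expectations.iteritems():
        calls[(refName, refPosition)] = max(baseExpectations, key=baseExpectations.get)
    return calls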
def download_query(fastq_tmp_path, tmp_dir, key_file, query_string, uuid):
    """
    Downloads data from CGHub BAM Slicer
    """
    key = open(key_file).readline().rstrip()
    system("""curl --silent "{}" -u "{}" | samtools bamshuf -Ou - {} | samtools bam2fq - > {}""".format(
        query_string, "haussler:" + key, os.path.join(tmp_dir, "tmp"), fastq_tmp_path))
    os.remove(os.path.join(tmp_dir, "tmp"))
    if os.path.getsize(fastq_tmp_path) < 513:
        raise RuntimeError("curl did not download a BAM for {}. exiting.".format(uuid))
def run(self):
    os.chdir(self.directory)
    evidList = zip(re.split("\s", self.evidSpec)[0::2], re.split("\s", self.evidSpec)[1::2])
    ## assert files are in data/
    for i in evidList:
        assert(re.split(":", i[1])[1].startswith("data"))
    ## check if new run
    if not os.path.exists("fold1"):
        ## find sample overlap
        dataSamples = None
        for i in evidList:
            if i[1].startswith("rawFile"):
                if dataSamples is None:
                    dataSamples = set(retRows(re.split(":", i[1])[1]))
                else:
                    dataSamples = dataSamples & set(retRows(re.split(":", i[1])[1]))
            else:
                if dataSamples is None:
                    dataSamples = set(retColumns(re.split(":", i[1])[1]))
                else:
                    dataSamples = dataSamples & set(retColumns(re.split(":", i[1])[1]))
        dataSamples = list(dataSamples)
        ## pick samples
        foldSamples = {}
        for f in range(1, self.mFolds + 1):
            foldSamples[f] = []
        selectSamples = deepcopy(dataSamples)
        while len(selectSamples) > 0:
            for f in range(1, self.mFolds + 1):
                if len(selectSamples) > 0:
                    foldSamples[f].append(selectSamples.pop(random.randint(0, len(selectSamples) - 1)))
        ## create directories and data
        for f in range(1, self.mFolds + 1):
            system("mkdir fold%s" % (f))
            system("mkdir fold%s/train" % (f))
            system("mkdir fold%s/train/data" % (f))
            system("mkdir fold%s/test" % (f))
            system("mkdir fold%s/test/data" % (f))
            trainSamples = list(set(dataSamples) - set(foldSamples[f]))
            testSamples = foldSamples[f]
            for i in evidList:
                if i[1].startswith("rawFile"):
                    rwCRSData("fold%s/train/%s" % (f, re.split(":", i[1])[1]), re.split(":", i[1])[1], useRows=trainSamples)
                    rwCRSData("fold%s/test/%s" % (f, re.split(":", i[1])[1]), re.split(":", i[1])[1], useRows=testSamples)
                else:
                    rwCRSData("fold%s/train/%s" % (f, re.split(":", i[1])[1]), re.split(":", i[1])[1], useCols=trainSamples)
                    rwCRSData("fold%s/test/%s" % (f, re.split(":", i[1])[1]), re.split(":", i[1])[1], useCols=testSamples)
    ## kick off runs
    for f in range(1, self.mFolds + 1):
        self.addChildTarget(branchTrain(self.evidSpec, self.disc, self.paramFile, self.paradigmExec,
                                        self.inferSpec, self.dogmaLib, self.pathwayLib, self.shuffleNode,
                                        self.nShuffle, "%s/fold%s" % (self.directory, f)))
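# Standalone sketch (not part of the original) of the round-robin fold
# assignment used above: samples are dealt into mFolds folds one at a time,
# drawn without replacement, so fold sizes differ by at most one.
import random
from copy import deepcopy

def assignFolds(samples, mFolds):
    foldSamples = dict((f, []) for f in range(1, mFolds + 1))
    remaining = deepcopy(samples)
    while len(remaining) > 0:
        for f in range(1, mFolds + 1):
            if len(remaining) > 0:
                foldSamples[f].append(remaining.pop(random.randint(0, len(remaining) - 1)))
    return foldSamples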
def testSort(self):
    for test in xrange(self.testNo):
        tempDir = getTempDirectory(os.getcwd())
        tempFile1 = getTempFile(rootDir=tempDir)
        makeFileToSort(tempFile1)
        lines1 = loadFile(tempFile1)
        lines1.sort()
        sort(tempFile1)
        lines2 = loadFile(tempFile1)
        checkEqual(lines1, lines2)
        system("rm -rf %s" % tempDir)
def run(self):
    os.chdir(self.directory)
    private_arg = ""
    if self.private_paradigm:
        private_arg = "-z"
    if self.paramFile is not None:
        cmd = "prepareParadigm.py -b \"%s\" -t %s -s same -n %s -i %s -e %s -d %s -p %s %s %s >& jt.err" % (
            self.disc, self.paramFile, self.nullBatches, self.inferSpec, self.paradigmExec,
            self.dogmaLib, self.pathwayLib, private_arg, self.evidSpec)
    else:
        cmd = "prepareParadigm.py -b \"%s\" -s same -n %s -i %s -e %s -d %s -p %s %s %s >& jt.err" % (
            self.disc, self.nullBatches, self.inferSpec, self.paradigmExec,
            self.dogmaLib, self.pathwayLib, private_arg, self.evidSpec)
    system(cmd)
    self.setFollowOnTarget(jtParadigm(self.em, self.directory))
def testGetMidPoint(self):
    for test in xrange(self.testNo):
        tempDir = getTempDirectory(os.getcwd())
        tempFile = getTempFile(rootDir=tempDir)
        makeFileToSort(tempFile)
        l = open(tempFile, 'r').read()
        fileSize = os.path.getsize(tempFile)
        midPoint = getMidPoint(tempFile, 0, fileSize)
        print "the midpoint is %i in a file of %i bytes" % (midPoint, fileSize)
        assert midPoint < fileSize
        assert l[midPoint] == '\n'
        assert midPoint >= 0
        system("rm -rf %s" % tempDir)
def runJobTree(command, jobTreeDir, logLevel="DEBUG", retryCount=0,
               batchSystem="single_machine", rescueJobFrequency=None):
    """A convenience function for running job tree from within a python script.
    """
    if rescueJobFrequency is not None:
        rescueJobFrequencyString = "--rescueJobsFrequency %s" % float(rescueJobFrequency)
    else:
        rescueJobFrequencyString = ""
    command = "jobTree --command \"%s\" --jobTree %s --logLevel %s " \
              "--retryCount %i --batchSystem %s %s" % \
              (command, jobTreeDir, logLevel, retryCount, batchSystem, rescueJobFrequencyString)
    logger.info("Running command : %s" % command)
    system(command)
    logger.info("Ran the job tree command okay")
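# Hedged usage example (the jobTree directory here is hypothetical; the
# wrapper script name is taken from the tests above):
if __name__ == "__main__":
    runJobTree("scriptTreeTest_Wrapper.py", "./testJobTree",
               logLevel="INFO", retryCount=10, batchSystem="single_machine")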
def run(self):
    os.chdir(self.directory)
    if paradigmPublic:
        for b in range(len(self.dataSamples)):
            system("cat outputFiles/%s_upstream_b%s_%s.fa >> %s_upstream.fa" % (self.mutatedGene, b, len(self.dataSamples), self.mutatedGene))
            system("cat outputFiles/%s_downstream_b%s_%s.fa >> %s_downstream.fa" % (self.mutatedGene, b, len(self.dataSamples), self.mutatedGene))
            for null in range(1, nNulls + 1):
                system("cat outputFiles/N%s_%s_upstream_b%s_%s.fa >> N%s_%s_upstream.fa" % (null, self.mutatedGene, b, len(self.dataSamples), null, self.mutatedGene))
                system("cat outputFiles/N%s_%s_downstream_b%s_%s.fa >> N%s_%s_downstream.fa" % (null, self.mutatedGene, b, len(self.dataSamples), null, self.mutatedGene))
        system("rm -rf outputFiles")
    shiftCV(self.mutatedGene, self.mutatedSamples, self.dataSamples, self.trainSamples,
            self.uPathway, self.dPathway, nNulls=nNulls)
def run(self):
    os.chdir(self.directory)
    ## cytoscape-web
    for mutatedGene in self.includeFeatures:
        if os.path.exists("analysis/%s/sig.tab" % (mutatedGene)):
            tableFiles = []
            tableFiles.append("analysis/%s/sig.tab" % (mutatedGene))
            tableFiles.append("msepPlot:analysis/%s/%s.msep.pdf" % (mutatedGene, mutatedGene))
            tableFiles.append("backgroundPlot:analysis/%s/%s.background.pdf" % (mutatedGene, mutatedGene))
            tableFiles.append("analysis/%s/avgAUC.tab" % (mutatedGene))
            tableFiles.append("analysis/%s/pshift.tab" % (mutatedGene))
            system("pathmark-report.py -t %s analysis/%s %s" % (",".join(tableFiles), mutatedGene, self.reportDir))
            system("cp analysis/%s/pshift* %s" % (mutatedGene, self.reportDir))
def realignCigarTargetFn(target, exonerateCigarString, referenceSequenceName, referenceSequence,
                         querySequenceName, querySequence, outputCigarFile, hmmFile, gapGamma, matchGamma):
    #Temporary files
    tempRefFile = os.path.join(target.getGlobalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getGlobalTempDir(), "read.fa")
    #Write the temporary files.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)
    fastaWrite(tempReadFile, querySequenceName, querySequence)
    #Call to cactus_realign
    loadHmm = nameValue("loadHmm", hmmFile)
    system("echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s > %s" %
           (exonerateCigarString, tempRefFile, tempReadFile, loadHmm, gapGamma, matchGamma, outputCigarFile))
    assert len([pA for pA in cigarRead(open(outputCigarFile))]) > 0
    assert len([pA for pA in cigarRead(open(outputCigarFile))]) == 1
def run(self):
    while True:
        command, logFile, jobID = self.inputQueue.get()
        #fnull = open(os.devnull, 'w') #Pipe the output to dev/null (it is caught by the slave and will be reported if there is an error)
        tempLogFile = getTempFile()
        fileHandle = open(tempLogFile, 'w')
        process = subprocess.Popen(command, shell=True, stdout=fileHandle, stderr=fileHandle)
        sts = os.waitpid(process.pid, 0)
        fileHandle.close()
        #fnull.close()
        if os.path.exists(tempLogFile):
            system("mv %s %s" % (tempLogFile, logFile))
        self.outputQueue.put((command, sts[1], jobID))
        self.inputQueue.task_done()
def testCopySubRangeOfFile(self):
    for test in xrange(self.testNo):
        tempDir = getTempDirectory(os.getcwd())
        tempFile = getTempFile(rootDir=tempDir)
        outputFile = getTempFile(rootDir=tempDir)
        makeFileToSort(tempFile)
        fileSize = os.path.getsize(tempFile)
        assert fileSize > 0
        fileStart = random.choice(xrange(0, fileSize))
        fileEnd = random.choice(xrange(fileStart, fileSize))
        copySubRangeOfFile(tempFile, fileStart, fileEnd, outputFile)
        l = open(outputFile, 'r').read()
        l2 = open(tempFile, 'r').read()[fileStart:fileEnd]
        checkEqual(l, l2)
        system("rm -rf %s" % tempDir)
def run(self):
    os.chdir(self.cwd)
    cmd = "%s -p outputFilesEM/*learn* " % collectParamsExec
    if os.path.exists("mask.expectations"):
        cmd += " mask.expectations "
    cmd += "| %s -o params%i.txt /dev/stdin " % (collectParamsExec, self.iteration + 1)
    if os.path.exists("mask.params"):
        cmd += " mask.params "
    system(cmd)
    if self.emHasTerminated():
        self.setFollowOnTarget(FinalRun(self.iteration + 1, self.cwd))
    else:
        self.setFollowOnTarget(ExpectationIteration(self.iteration + 1, self.tolerance, self.cwd))
def parasolIsInstalled():
    """Returns True if parasol is installed, else False.
    """
    try:
        return system("parasol status") == 0
    except RuntimeError:
        return False
def run(self):
    os.chdir(self.cwd)
    system("rm -f params.txt")
    system("ln -s params%i.txt params.txt" % self.iteration)
    system("mkdir -p outputFilesEM%i" % self.iteration)
    system("rm -f outputFilesEM")
    system("ln -s outputFilesEM%i outputFilesEM" % self.iteration)
    sys.stderr.write("Current directory: " + os.getcwd() + "\n")
    jfile = open("jobsEM.list", "r")
    for job in jfile:
        self.addChildTarget(ParadigmCmd(job, self.cwd))
    jfile.close()
    self.setFollowOnTarget(MaximizationIteration(self.iteration, self.tolerance, self.cwd))
def gridEngineIsInstalled():
    """Returns True if grid-engine is installed, else False.
    """
    try:
        return system("qstat -help") == 0
    except RuntimeError:
        return False
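# Hedged sketch (not in the original): fall back through the probes above to
# pick a batch system string for runJobTree(); only "single_machine" appears
# in the original source, the other two names are assumptions.
def pickBatchSystem():
    if parasolIsInstalled():
        return "parasol"
    if gridEngineIsInstalled():
        return "gridEngine"
    return "single_machine"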
def run(self):
    os.chdir(self.directory)
    ## branch genes
    htmlFeatures = []
    if not os.path.exists("analysis"):
        system("mkdir analysis")
    for mutatedGene in self.mutationMap.keys():
        if not os.path.exists("analysis/%s" % (mutatedGene)):
            system("mkdir analysis/%s" % (mutatedGene))
        htmlFeatures.append(mutatedGene)
        self.addChildTarget(branchFolds(mutatedGene, self.mutationMap[mutatedGene],
                                        self.dataSamples, self.dataFeatures, self.dataMap,
                                        self.gPathway, self.paradigmDir, self.paramMap,
                                        self.foldMap, self.directory))
    if os.path.exists(htmlDir):
        self.setFollowOnTarget(pshiftReport(htmlFeatures, "%s/%s" % (htmlDir, self.paramMap["cohortName"]), self.directory))
def run(self):
    AbstractAnalysis.run(self)
    readSequences = getFastqDictionary(self.readFastqFile)
    nr = re.compile(r"channel_[0-9]+_read_[0-9]+")
    per_channel_read_counts = Counter([int(x.split("_")[1]) for x in readSequences.iterkeys() if re.match(nr, x)])
    sam = pysam.Samfile(self.samFile, "r")
    mapped_read_counts = Counter([int(aR.qname.split("_")[1]) for aR in samIterator(sam) if re.match(nr, aR.qname) and aR.is_unmapped is False])
    if len(mapped_read_counts) > 0 and len(per_channel_read_counts) > 0:
        outf = open(os.path.join(self.outputDir, "channel_mappability.tsv"), "w")
        outf.write("Channel\tReadCount\tMappableReadCount\n")
        #channels run 1-512 today; the +1 keeps the highest-numbered channel
        #in range if there are ever more than 512 in the future
        max_channel = max(513, max(per_channel_read_counts.keys()) + 1)
        for channel in xrange(1, max_channel):
            outf.write("\t".join(map(str, [channel, per_channel_read_counts[channel], mapped_read_counts[channel]])))
            outf.write("\n")
        outf.close()
        system("Rscript nanopore/analyses/channel_plots.R {} {} {} {} {}".format(
            os.path.join(self.outputDir, "channel_mappability.tsv"),
            os.path.join(self.outputDir, "channel_mappability.pdf"),
            os.path.join(self.outputDir, "channel_mappability_sorted.png"),
            os.path.join(self.outputDir, "mappability_levelplot.png"),
            os.path.join(self.outputDir, "mappability_levelplot_percent.png")))
    self.finish()
def align_query(target, fastq, bam, uuid, index):
    """
    Aligns the extracted reads to the notch locus, filtering for unmapped reads
    and creating a custom reheadered BAM.
    """
    # align the extracted reads to the index
    tmp = os.path.join(target.getLocalTempDir(), "tmp")
    sorted_bam = os.path.join(target.getLocalTempDir(), "{}.sorted.bam".format(uuid))
    system("bwa mem -v 1 {} {} | samtools view -F 4 -bS - | samtools sort -O bam -T {} - > {}".format(index, fastq, tmp, sorted_bam))
    header = {"HD": {"VN": "1.3"}, "SQ": [{"LN": 248956422, "SN": "chr1"}]}
    outfile = pysam.Samfile(bam, "wb", header=header)
    bamfile = pysam.Samfile(sorted_bam, "rb")
    for record in bamfile:
        # reference names look like "chr1:start-end"; shift the 0-based pysam
        # position by the 1-based region start to lift back to chr1 coordinates
        chrom, span = bamfile.getrname(record.tid).split(":")
        start, end = map(int, span.split("-"))
        record.pos = record.pos + start - 1
        outfile.write(record)
    outfile.close()
    system("samtools index {}".format(bam))
def write_file_analyze(self, entries, name, multiple_read_types=False):
    path = os.path.join(self.outputDir, name + ".csv")
    outf = open(path, "w")
    outf.write(",".join(["Name", "Mapper", "ReadType", "ReadFile", "ReferenceFile",
                         "AvgReadCoverage", "AvgReferenceCoverage", "AvgIdentity",
                         "AvgMismatchesPerReadBase", "AvgDeletionsPerReadBase",
                         "AvgInsertionsPerReadBase", "NumberOfMappedReads",
                         "NumberOfUnmappedReads", "NumberOfReads"]))
    outf.write("\n")
    entries = sorted(entries, key=lambda x: (x.mapper, x.readType, x.readFastqFile))
    names = self.resolve_duplicate_rownames(entries, multiple_read_types)
    for entry, n in izip(entries, names):
        outf.write(",".join([n, entry.mapper, entry.readType, entry.readFastqFile,
                             entry.referenceFastaFile,
                             entry.XML.attrib["avgreadCoverage"],
                             entry.XML.attrib["avgreferenceCoverage"],
                             entry.XML.attrib["avgidentity"],
                             entry.XML.attrib["avgmismatchesPerReadBase"],
                             entry.XML.attrib["avgdeletionsPerReadBase"],
                             entry.XML.attrib["avginsertionsPerReadBase"],
                             entry.XML.attrib["numberOfMappedReads"],
                             entry.XML.attrib["numberOfUnmappedReads"],
                             entry.XML.attrib["numberOfReads"]]) + "\n")
    outf.close()
    path2 = os.path.join(self.outputDir, name + "_distribution.csv")
    outf = open(path2, "w")
    for entry, n in izip(entries, names):
        outf.write(",".join([n] + entry.XML.attrib["distributionidentity"].split()))
        outf.write("\n")
    outf.close()
    system("Rscript nanopore/metaAnalyses/coverageSummaryPlots.R {} {} {}".format(
        path, name, os.path.join(self.outputDir, name + "_summary_plots.pdf")))
    system("Rscript nanopore/metaAnalyses/coveragePlots.R {} {} {}".format(
        path2, name, os.path.join(self.outputDir, name + "_distribution.pdf")))
def writeScripts():
    """creates the R scripts necessary for plotting"""
    backgroundR = """#!/usr/bin/env Rscript
args = commandArgs(TRUE)
phenotype = args[1]
Real = read.table(paste("stats_", phenotype, ".tab", sep=""), header=TRUE)
Nulls = read.table(paste("stats_NULL_", phenotype, ".tab", sep=""), header=TRUE)
nbreaks = 60

zscore = c(as.character((Real$totNodes-mean(Nulls$totNodes))/sd(Nulls$totNodes)),
           as.character((Real$totLinks-mean(Nulls$totLinks))/sd(Nulls$totLinks)),
           as.character((Real$largest_netNodes-mean(Nulls$largest_netNodes))/sd(Nulls$largest_netNodes)),
           as.character((Real$largest_netLinks-mean(Nulls$largest_netLinks))/sd(Nulls$largest_netLinks)))
fileConn = file(paste(phenotype, ".stats", sep=""))
writeLines(zscore, fileConn)
close(fileConn)

xrange = c(min(Nulls$totNodes, Real$totNodes)-50, max(Nulls$totNodes, Real$totNodes)+50)
png(paste(phenotype, "_total_netNodes.png", sep=""), height=720, width=1280)
hist(Nulls$totNodes, breaks=nbreaks, xlim=xrange, xlab="Number", main=paste("Number of Nodes for Subnet, z = ", zscore[1], sep=""))
abline(v = Real$totNodes, col="red", lty = 2)
dev.off()

xrange = c(min(Nulls$totLinks, Real$totLinks)-50, max(Nulls$totLinks, Real$totLinks)+50)
png(paste(phenotype, "_total_netLinks.png", sep=""), height=720, width=1280)
hist(Nulls$totLinks, breaks=nbreaks, xlim=xrange, xlab="Number", main=paste("Number of Links for Subnet, z = ", zscore[2], sep=""))
abline(v = Real$totLinks, col="red", lty = 2)
dev.off()

xrange = c(min(Nulls$largest_netNodes, Real$largest_netNodes)-50, max(Nulls$largest_netNodes, Real$largest_netNodes)+50)
png(paste(phenotype, "_largest_netNodes.png", sep=""), height=720, width=1280)
hist(Nulls$largest_netNodes, breaks=nbreaks, xlim=xrange, xlab="Number", main=paste("Number of Nodes for Largest Component, z = ", zscore[3], sep=""))
abline(v = Real$largest_netNodes, col="red", lty = 2)
dev.off()

xrange = c(min(Nulls$largest_netLinks, Real$largest_netLinks)-50, max(Nulls$largest_netLinks, Real$largest_netLinks)+50)
png(paste(phenotype, "_largest_netLinks.png", sep=""), height=720, width=1280)
hist(Nulls$largest_netLinks, breaks=nbreaks, xlim=xrange, xlab="Number", main=paste("Number of Links for Largest Component, z = ", zscore[4], sep=""))
abline(v = Real$largest_netLinks, col="red", lty = 2)
dev.off()
"""
    f = open("background.R", "w")
    f.write(backgroundR)
    f.close()
    system("chmod 755 *.R")
def alignQuery(fastqPath, remappedBamPath, tempDir, uuid, index):
    """
    Aligns to the notch locus
    """
    # align the extracted reads to the index
    sortedBamPath = os.path.join(tempDir, "{}.sorted".format(uuid))
    system("bwa mem -v 1 {} {} | samtools view -F 4 -bS - | samtools sort - {}".format(index, fastqPath, sortedBamPath))
    # samtools appends .bam to sorted bam files
    sortedBamPath += ".bam"
    header = {"HD": {"VN": "1.3"}, "SQ": [{"LN": 248956422, "SN": "chr1"}]}
    outfile = pysam.Samfile(remappedBamPath, "wb", header=header)
    bamfile = pysam.Samfile(sortedBamPath, "rb")
    for record in bamfile:
        chrom, span = bamfile.getrname(record.tid).split(":")
        start, end = map(int, span.split("-"))
        record.pos = record.pos + start - 1
        outfile.write(record)
    outfile.close()
    system("samtools index {}".format(remappedBamPath))
def run_jellyfish(target, jf_counts, k_plus1_mer_counts, fastq, uuid, kmer_size):
    """
    Runs jellyfish twice: the first run counts canonical (-C) kmers at kmer_size;
    this is the raw data for the ILP model. The second run uses kmer_size + 1 and a
    Bloom filter, which removes most kmers with counts of one. This will be used to
    add individual nodes to the graph.
    """
    jf_file = os.path.join(target.getLocalTempDir(), uuid + ".jf")
    system("jellyfish count -C -m {} -s 200M -o {} {}".format(kmer_size, jf_file, fastq))
    system("jellyfish dump {} > {}".format(jf_file, jf_counts))
    os.remove(jf_file)
    system("jellyfish count -C -m {} --bf-size 1G -s 200M -o {} {}".format(kmer_size + 1, jf_file, fastq))
    system("jellyfish dump {} > {}".format(jf_file, k_plus1_mer_counts))
    os.remove(jf_file)
def realignCigarTargetFn(target, exonerateCigarStringFile, referenceSequenceName,
                         referenceSequence, querySequenceFile, outputCigarFile, options):
    #Temporary files
    tempRefFile = os.path.join(target.getLocalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getLocalTempDir(), "read.fa")
    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)
    #For each cigar string
    for exonerateCigarString, (querySequenceName, querySequence) in \
            zip(open(exonerateCigarStringFile, "r"), fastaRead(querySequenceFile)):
        fastaWrite(tempReadFile, querySequenceName, querySequence)
        #Call to cPecanRealign
        loadHmm = nameValue("loadHmm", options.hmmFile)
        system("echo %s | cPecanRealign %s %s --diagonalExpansion=10 "
               "--splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s >> %s" %
               (exonerateCigarString[:-1], tempRefFile, tempReadFile, loadHmm,
                options.gapGamma, options.matchGamma, outputCigarFile))
def delete(self):
    """Removes from disk atomically, can not then subsequently call read(), write() or addChildren()
    """
    os.remove(self.getJobFileName()) #This is the atomic operation, if this file is not present the job is deleted.
    dirToRemove = self.jobDir
    while 1:
        head, tail = os.path.split(dirToRemove)
        if re.match("t[0-9]+$", tail):
            command = "rm -rf %s" % dirToRemove
        else:
            command = "rm -rf %s/*" % dirToRemove #We're at the root
        try:
            system(command)
        except RuntimeError:
            pass #This is not a big deal, as we expect collisions
        dirToRemove = head
        try:
            if len(os.listdir(dirToRemove)) != 0:
                break
        except os.error: #In case stuff went wrong, but as this is not critical we let it slide
            break
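# Hedged illustration (not in the original source) of the name test above:
# directories whose basenames look like "t0", "t1", ... appear to be jobTree's
# own nested temp dirs and are removed whole; anything else is treated as the
# root and only emptied.
import re
assert re.match("t[0-9]+$", "t12") is not None
assert re.match("t[0-9]+$", "jobs") is None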
def run(self, kmer=5):
    AbstractAnalysis.run(self) #Call base method to do some logging
    refSequences = getFastaDictionary(self.referenceFastaFile) #Hash of names to sequences
    readSequences = getFastqDictionary(self.readFastqFile) #Hash of names to sequences
    sM = SubstitutionMatrix() #The thing to store the counts in
    sam = pysam.Samfile(self.samFile, "r")
    for aR in samIterator(sam): #Iterate on the sam lines
        for aP in AlignedPair.iterator(aR, refSequences[sam.getrname(aR.rname)],
                                       readSequences[aR.qname]): #Walk through the matches and mismatches
            sM.addAlignedPair(aP.getRefBase(), aP.getReadBase())
    sam.close()
    #Write out the substitution info
    open(os.path.join(self.outputDir, "substitutions.xml"), 'w').write(prettyXml(sM.getXML()))
    bases = "ACGT"
    outf = open(os.path.join(self.outputDir, "subst.tsv"), "w")
    outf.write("A\tC\tG\tT\n")
    for x in bases:
        freqs = sM.getFreqs(x, bases)
        outf.write("{}\t{}\n".format(x, "\t".join(map(str, freqs))))
    outf.close()
    analysis = self.outputDir.split("/")[-2].split("_")[-1] + "_Substitution_Levels"
    system("Rscript nanopore/analyses/substitution_plot.R {} {} {}".format(
        os.path.join(self.outputDir, "subst.tsv"),
        os.path.join(self.outputDir, "substitution_plot.pdf"),
        analysis))
    self.finish()
def run(self):
    os.chdir(self.directory)
    system("rm -rf real* null* OCCAM__* background.R LAYOUT/*.params LAYOUT/real_results.* LAYOUT/null_results.* LAYOUT/*.tab LAYOUT/NULL_*")
    if self.outputZip is not None:
        system("zip -r LAYOUT.zip LAYOUT")
        system("mv -f LAYOUT.zip %s" % (self.outputZip))
def run(self):
    layoutDir = "%s/LAYOUT" % (self.directory)
    os.chdir(layoutDir)
    ## aggregate null scores
    if self.nNulls > 0:
        phenotypeName = re.split("/", self.phenotypeFile)[-1]
        if not os.path.exists("null_results.%s.tab" % (self.occamPhenotype)):
            nullScores = {}
            for null in range(1, self.nNulls + 1):
                if len(retColumns("../OCCAM__%s__null_%s.tab/results.tab" % (phenotypeName, null))) == 0:
                    ## this is an error right now
                    continue
                nullScores["N%s" % (null)] = rCRSData("../OCCAM__%s__null_%s.tab/results.tab" % (phenotypeName, null))[self.occamPhenotype]
            wCRSData("null_results.%s.tab" % (self.occamPhenotype), nullScores)
    ## run pathmark
    system("%s %s -l %s.params -b \"%s\" -f %s -n real_results.all.tab" % (sys.executable, pathmarkExec, self.occamPhenotype, self.filterParams, self.occamPhenotype))
    if self.nNulls > 0:
        system("%s %s -b \"%s\" -s %s.params -d NULL_%s null_results.%s.tab" % (sys.executable, pathmarkExec, self.filterParams, self.occamPhenotype, self.occamPhenotype, self.occamPhenotype))
        self.setFollowOnTarget(backgroundPATHMARK(self.occamPhenotype, self.nNulls, self.directory))
def run(self):
    os.chdir(self.cwd)
    system("rm -f params.txt")
    system("ln -s params%i.txt params.txt" % self.iteration)
    system("mkdir -p outputFiles")
    jfile = open("jobs.list", "r")
    for job in jfile:
        self.addChildTarget(ParadigmCmd(job, self.cwd))
    jfile.close()
    self.setFollowOnTarget(Merge(self.cwd))
def run(self):
    layoutDir = "%s/LAYOUT" % (self.directory)
    os.chdir(layoutDir)
    system("ls %s/*_nodrug.sif | %s %s > stats_%s.tab" % (self.occamPhenotype, sys.executable, statisticsExec, self.occamPhenotype))
    system("ls NULL_%s/*_nodrug.sif | %s %s -c counts_NULL_%s.tab > stats_NULL_%s.tab" % (self.occamPhenotype, sys.executable, statisticsExec, self.occamPhenotype, self.occamPhenotype))
    system("../background.R %s" % (self.occamPhenotype))