def preprocess(self, lib):
    """
    The method preprocess determines which preprocessing steps have to be
    executed for a given library.

    Steps, in order:
    * expand lib.forward and lib.reversed with DirUtils.fileRegexToList
    * for the "sff" format, replace every forward file by the result of the
      SffToFastqConverter
    * create a small fastq report of the input files
    * when there is more than one forward file, merge the files of each
      direction with FastqCommands.MergeCommand; otherwise take the single file
    * store element 2 of the first report entry as lib.avgReadlength
    * run the platform specific step ("illumina" or "454")
    * filter contamination

    :param lib: the library to preprocess. Its forward, reversed and
        avgReadlength attributes are overwritten by this method.
    """
    logging.info("Preprocessing of " + lib.libName)
    LaTeX.ltxSection("Preprocessing of " + lib.libName)

    lib.forward = DirUtils.fileRegexToList(lib.forward)
    lib.reversed = DirUtils.fileRegexToList(lib.reversed)

    if lib.format == "sff":
        for idx, sffFile in enumerate(lib.forward):
            converter = SffToFastqConverter.SffToFastqConverter(lib.outputDir, sffFile=sffFile)
            lib.forward[idx] = converter.execute()

    # This report describes the files before merging; it is the one used
    # for the average read length below.
    smallReport = FastqSmallReport.FastqSmallReport()
    smallReport.createSmallReport(lib.forward, lib.reversed)

    if len(lib.forward) > 1:
        lib.forward = FastqCommands.MergeCommand(
            lib.outputDir, direction="forward", fastqFiles=lib.forward
        ).execute()
        if lib.reversed is not None:
            # Merge the reversed files themselves. lib.forward already holds
            # the merged forward output at this point and must not be reused.
            lib.reversed = FastqCommands.MergeCommand(
                lib.outputDir, direction="reversed", fastqFiles=lib.reversed
            ).execute()
        FastqSmallReport.FastqSmallReport().createSmallReport(lib.forward, lib.reversed)
    else:
        lib.forward = lib.forward[0]
        if lib.reversed is not None:
            lib.reversed = lib.reversed[0]

    # list(...)[0] instead of keys()[0] so the lookup does not depend on
    # keys() returning a list.
    firstFastq = list(smallReport.fastqInfo.keys())[0]
    lib.avgReadlength = float(smallReport.fastqInfo[firstFastq][2])

    if lib.sequencingPlatform == "illumina":
        self.illuminaPreprocess(lib)
    elif lib.sequencingPlatform == "454":
        lib.forward = FastqMcfTrimming.FastqTrimmer(
            lib.outputDir, forward=lib.forward, noTrim=True
        ).execute()
        FastqSmallReport.FastqSmallReport().createSmallReport(lib.forward, lib.reversed)

    self.filterContamination(lib)
def createStatistics(self, pool):
    """
    Create the assembly statistics, the fastq reports of every library and
    the final report.

    An AssemblyStatistics object generates the statistics of the assembly in
    the "statistics/" directory of the pool. Afterwards a fastq report is
    created for the raw and for the fully preprocessed data of each library,
    and the Reporter writes the report to the "report/" directory of the pool.

    :param pool: the pool whose libraries (pool.libs) and output directory
        (pool.outputDir) are used.
    """
    logging.info("Creating assembly stats")
    AssemblyControl.AssemblyStatistics().AssemblyStatisticsOfPipeline(
        pool.outputDir + "statistics/", pool, self.assembly
    )

    LaTeX.ltxPart("Supplementary materials")

    # (title prefix, forward attribute, reversed attribute, sub directory)
    reportSpecs = (
        ("raw ", "rawForward", "rawReversed", "raw_qc/"),
        ("preprocessed ", "forward", "reversed", "preprocessed/"),
    )
    for lib in pool.libs:
        for prefix, forwardAttr, reversedAttr, subDir in reportSpecs:
            self.createFastqReport(
                prefix + lib.libName,
                getattr(lib, forwardAttr),
                getattr(lib, reversedAttr),
                lib.outputDir + subDir,
            )

    Reporter.instance.createReport(pool.outputDir + "report/")
def getLaTeXReport(self):
    """
    Convert the genome size estimation results into LaTeX.

    :return: the text of a two column table with the estimation values,
        followed by the text of the image in self.genSizeHistoPlot.
    """
    table = LaTeX.ltxTable(2)
    rows = [
        ["Total bp: ", "{:,}".format(self.totalBp)],
        ["Peak: ", str(self.peak)],
        ["Mean base coverage: ", "{:.2f}".format(self.coverage)],
        ["", ""],  # empty row as visual separator
        ["Unique good kmers: ", "{:,}".format(self.unique_gkmers)],
        ["BGI genome size estimation: ", "{:,}".format(int(round(self.bgi)))],
        ["GSE kmers/peak: ", "{:,}".format(self.kmersPerPeak)],
    ]
    for row in rows:
        table.addRow(row)
    return table.getText() + LaTeX.ltxImage(self.genSizeHistoPlot).getText()
def getLaTeXReport(self):
    """
    Convert the html report in self.outputFile into LaTeX.

    The file is read line by line:
    * every line with a closing h2 tag, except the summary, becomes an
      unnumbered subsection
    * the lines between <table> and </table> are collected, parsed as xml and
      written as a LaTeX table of at most 15 rows; rows whose first cell
      contains "Filename" are left out
    * every indented image is included, with a page break after each
      second image

    :return: the LaTeX text, starting with a section named after self.status.
    """
    txt = "\\section{Fastqc of " + self.status.replace("_", " ") + "}\n"
    insideTable = False
    imageCount = 0
    with open(self.outputFile) as reportReader:
        for line in reportReader:
            if "</h2>" in line and "Summary" not in line:
                heading = re.findall(r"]\">(.*?)</h2>", line)[0]
                txt += "\\subsection*{" + heading + "}\n"
            elif "<table>" in line:
                xml = "<table>"
                insideTable = True
            elif "</table>" in line:
                xml += "</table>"
                insideTable = False
                tableNode = minidom.parseString(xml).firstChild
                # The number of columns is taken from the first row.
                latexTable = LaTeX.ltxTable(len(tableNode.firstChild.childNodes))
                rowCount = 0
                for rowCount, row in enumerate(tableNode.childNodes, 1):
                    # Keep counting after the limit: the total is reported below.
                    if rowCount > 15:
                        continue
                    cells = [
                        cell.firstChild.nodeValue.replace("%", " percent")
                        for cell in row.childNodes
                    ]
                    if "Filename" in cells[0]:
                        continue
                    latexTable.addRow(cells)
                txt += latexTable.getText()
                if rowCount > 15:
                    txt += "\\\\Total length of this table is " + str(rowCount) + ". The table is cut after 15 rows..."
            elif insideTable:
                xml += line.strip()
            elif "<img class=\"indented\"" in line:
                imageCount += 1
                imageName = re.findall("src=\"(.*)\" alt=", line)[0]
                # Image paths in the report are relative to the report itself.
                latexImage = LaTeX.ltxImage(os.path.dirname(self.outputFile) + "/" + imageName)
                txt += latexImage.getText()
                if imageCount % 2 == 0:
                    txt += "\\clearpage\n"
    return txt
def doAssembly(self, pool):
    """
    The method doAssembly creates all objects to execute an assembly.
    Afterwards the insert sizes of all libraries with reversed reads are
    estimated.

    The assembler is selected with the global option "assembler": no value or
    "wgs" runs the WgsAssembler, "allpaths" runs the AllpathsAssembler. Any
    other value is reported with a warning and leaves self.assembly untouched.

    :param pool: the pool to assemble; pool.outputDir and pool.libs are used.
    """
    logging.info("Executing assembly")
    LaTeX.ltxSection("Assembly")

    # Look the option up once instead of once per comparison.
    assemblerName = Configuration.instance.getGlobalOption("assembler")
    if assemblerName is None or assemblerName == "wgs":
        assembler = WgsAssembler.WgsAssembler()
        self.assembly = assembler.doAssembly(pool.outputDir + "assembly/", pool)
    elif assemblerName == "allpaths":
        assembler = AllpathsAssembler.AllpathsAssembler()
        self.assembly = assembler.doAssembly(pool.outputDir + "allPathsAssembly/", pool)
    else:
        # Control flow is unchanged for unknown values; the warning only
        # makes a misconfigured assembler visible.
        logging.warning("Unknown assembler '%s', no assembly executed", assemblerName)

    for lib in pool.libs:
        # Libraries without reversed reads have no insert size to estimate.
        if lib.reversed is None:
            continue
        logging.info("Calculating insert sizes for " + lib.libName)
        insertSizeChecker = InsertSizeChecker.InsertSizeChecker()
        insertSizeChecker.checkInsertSize(
            lib.outputDir,
            lib.rawForward,
            lib.rawReversed,
            self.assembly,
            lib.libName,
            lib.insertSize,
        )
def doGenomeSizeEstimation(self, outputDir, pool):
    """
    The method doGenomeSizeEstimation contains the mainflow of the genomesize
    estimation. This mainflow contains the following methods:
    * Execute Jellyfish count
    * Execute Jellyfish stats
    * Create a histogram of the unique kmers with Jellyfish histo
    * Draw a histogram of the unique kmers
    * Estimate the genome size with the BGI method

    The produced files, the plot path and the peak are stored on self, and
    self is appended to the objects of the Reporter.

    :param outputDir: directory for all output; it is created when it does
        not exist. NOTE(review): assumed to end with a path separator, since
        the plot path is built by plain concatenation -- confirm with callers.
    :param pool: the pool that is passed to the Jellyfish count and to the
        genome size calculation.
    """
    logging.info("Starting genome size estimation")
    if not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    # Plain title without a closing brace, like the other ltxSection calls.
    LaTeX.ltxSection("Genome size estimation")

    self.jellyFishCountsFile = JellyFish.JellyFishCount(outputDir, pool=pool).execute()
    self.jellyFishStatsFile = JellyFish.JellyFishStats(
        outputDir, jellyFishCountsFile=self.jellyFishCountsFile
    ).execute()
    self.jellyfishHistoFile = JellyFish.JellyFishHisto(
        outputDir, jellyFishCountsFile=self.jellyFishCountsFile
    ).execute()

    self.genSizeHistoPlot = outputDir + "kmer_graph.png"
    self.peak = int(self.drawHisto(self.jellyfishHistoFile, self.genSizeHistoPlot))
    self.calculateGenomeSize(pool, self.jellyFishStatsFile, self.jellyfishHistoFile)

    Reporter.instance.objects.append(self)
def getLaTeXReport(self):
    """
    Convert the collected fastq information into a LaTeX table.

    The table has one column more than the first entry of self.fastqInfo has
    values; the extra column holds the key of the entry. Every entry of
    self.fastqInfo becomes one row below the header row.

    :return: the text of the table followed by a LaTeX line break.
    """
    header = [
        "Fastq file",
        "Total bases",
        "number of reads",
        "AVG read length",
        "Percentage high quality bases ($qv > 30$)",
    ]
    firstEntry = list(self.fastqInfo.values())[0]
    table = LaTeX.ltxTable(len(firstEntry) + 1)
    table.addRow(header)
    for fastqName, stats in self.fastqInfo.items():
        table.addRow([fastqName] + stats)
    return table.getText() + "\\\\"
def getLaTeXReport(self):
    """
    Convert all previously calculated statistics into LaTeX with this method.

    The basic sequence statistics are always written. The cegma scores, the
    DNA mapping statistics and the RNA mapping statistics are only written
    when the corresponding attribute (cegmaScore, rawDnaMappingStats,
    rnaMappingStats) exists on this object.

    :return: the LaTeX text of a "Statistics" subsection with a two column
        table, followed by a figure that includes self.a50Plot.
    """
    percent = "\\%"  # escaped percent sign for LaTeX
    separator = ["", ""]  # empty row between groups of statistics

    table = LaTeX.ltxTable(2)
    table.addRow(["Total sequences: ", str(self.totalSeqs)])
    table.addRow(["Total length: ", "{:,}".format(self.totalLen)])
    table.addRow(["GC perc: ", "{:.2f}".format(self.gcPerc) + percent])
    table.addRow(["Longest sequence: ", "{:,}".format(self.longestSeq)])
    table.addRow(["N50 index: ", "{:,}".format(self.n50Index)])
    table.addRow(["N50: ", "{:,}".format(self.n50)])
    table.addRow(separator)
    table.addRow(["N90 index: ", "{:,}".format(self.n90Index)])
    table.addRow(["N90: ", "{:,}".format(self.n90)])

    if hasattr(self, "cegmaScore"):
        table.addRow(separator)
        table.addRow(["Cegma complete: ", self.cegmaScore[0] + percent])
        table.addRow(["Cegma partial: ", self.cegmaScore[1] + percent])
        for name, value in self.otherCegmaScores.items():
            table.addRow(separator)
            table.addRow([name + " complete: ", value[0] + percent])
            table.addRow([name + " partial: ", value[1] + percent])

    if hasattr(self, "rawDnaMappingStats"):
        table.addRow(separator)
        table.addRow(["DNA reads: ", "{:,}".format(int(self.rawDnaMappingStats["total"]))])
        table.addRow(["Mapped: ", self.rawDnaMappingStats["mapped"] + percent])
        if "propPair" in self.rawDnaMappingStats:
            table.addRow(["Properly paired: ", self.rawDnaMappingStats["propPair"] + percent])
        table.addRow(["Error rate: ", "{:.2f}".format(self.errorRate) + " SNPs per 10kb"])
        table.addRow(["SNP density: ", "{:.2f}".format(self.snpDensity) + " SNPs per 10kb"])

    if hasattr(self, "rnaMappingStats"):
        table.addRow(separator)
        table.addRow(["RNA reads: ", "{:,}".format(int(self.rnaMappingStats["total"]))])
        table.addRow(["Mapped: ", self.rnaMappingStats["mapped"] + percent])
        if "propPair" in self.rnaMappingStats:
            table.addRow(["Properly paired: ", self.rnaMappingStats["propPair"] + percent])

    txt = "\\subsection{Statistics}\n"
    txt = txt + table.getText()
    txt = txt + "\\begin{figure}[h]\n"
    txt = txt + "\\includegraphics[scale=0.7]{" + self.a50Plot + "}\n"
    txt = txt + "\\end{figure}\n"
    return txt