def add_per_lane_statistics(self):
    """
    Add a section reporting the per-lane statistics

    The per-lane statistics file is tabulated with one row per
    line of the file, plus an extra 'Assigned/unassigned' column
    holding a stacked bar plot of the assigned versus unassigned
    read counts. The new section is also added to the table of
    contents.

    Nothing is added if the statistics file doesn't exist.
    """
    stats_file = self._per_lane_stats_file
    if not os.path.exists(stats_file):
        logger.debug("No per-lane statistics file found")
        return
    section = self.add_section("Per-lane statistics",
                               name="per_lane_stats")
    stats = TabFile(stats_file, first_line_is_header=True)
    # Columns holding read counts are pretty-printed, everything
    # else is copied across verbatim
    read_count_columns = ("Total reads",
                          "Assigned reads",
                          "Unassigned reads")
    tbl = Table(columns=stats.header())
    tbl.append_columns("Assigned/unassigned")
    for line in stats:
        row = tbl.add_row()
        for column in stats.header():
            value = line[column]
            if column in read_count_columns:
                value = pretty_print_reads(value)
            tbl.set_value(row, column, value)
        # Stacked bar showing assigned vs unassigned reads
        barplot = ustackedbar((line["Assigned reads"],
                               line["Unassigned reads"]),
                              length=100,
                              height=15,
                              colors=('red', 'white'),
                              inline=True)
        tbl.set_value(row, "Assigned/unassigned", Img(barplot))
    section.add(tbl)
    self.add_to_toc("Per-lane statistics", section)
def __init__(self, screen_file):
    """
    Create a new FastqscreenData instance

    Reads the screen file line by line, storing the version
    string (from the '#Fastq_screen version:' line), the value
    from the '%Hit_no_libraries:' (or '%Hit_no_genomes:') line,
    and the rows of the data table. The table rows are then
    copied into this object under the 'Library'-style column
    names, regardless of whether the file used the 'Library'
    or the 'Genome' terminology.

    Arguments:
      screen_file (str): path to the screen file to read the
        data from

    Raises:
      Exception: if a data row is found before the table header,
        if no table header is found at all, or if the first
        column of the header is neither 'Library' nor 'Genome'.
    """
    TabFile.__init__(self,
                     column_names=('Library',
                                   '%Unmapped',
                                   '%One_hit_one_library',
                                   '%Multiple_hits_one_library',
                                   '%One_hit_multiple_libraries',
                                   '%Multiple_hits_multiple_libraries',))
    self._screen_file = os.path.abspath(screen_file)
    self._version = None
    self._no_hits = None
    # Read in data
    tabfile = None
    with open(self._screen_file, 'r') as fp:
        for line in fp:
            line = line.strip()
            if line.startswith('#Fastq_screen version:'):
                self._version = line.split()[2]
            elif line.startswith(('Library', 'Genome')):
                # Header line for the table of data
                tabfile = TabFile(column_names=line.split())
            elif line.startswith(('%Hit_no_libraries:',
                                  '%Hit_no_genomes:')):
                self._no_hits = float(line.split()[-1])
            elif not line or line.startswith(('#', '%')):
                # Blank line, comment or other summary line
                continue
            elif tabfile is None:
                # Data can't be interpreted without the header
                raise Exception("Data line found before table header "
                                "in %s" % self._screen_file)
            else:
                tabfile.append(tabdata=line)
    if tabfile is None:
        raise Exception("Unable to extract fastq_screen data from %s"
                        % self._screen_file)
    # Handle different terminology for different versions
    first_column = tabfile.header()[0]
    if first_column == 'Library':
        columns = ('Library',
                   '%Unmapped',
                   '%One_hit_one_library',
                   '%Multiple_hits_one_library',
                   '%One_hit_multiple_libraries',
                   '%Multiple_hits_multiple_libraries')
    elif first_column == 'Genome':
        columns = ('Genome',
                   '%Unmapped',
                   '%One_hit_one_genome',
                   '%Multiple_hits_one_genome',
                   '%One_hit_multiple_genomes',
                   '%Multiple_hits_multiple_genomes')
    else:
        raise Exception("Unrecognised table header '%s' in %s"
                        % (first_column, self._screen_file))
    # Copy data to main object (the column order in 'columns'
    # matches the column order of this object)
    for line in tabfile:
        self.append(data=[line[c] for c in columns])
def __init__(self, filen=None, fp=None, name=None):
    """Create a new MacsXLS instance

    Header lines (blank lines and lines starting with '#') are
    stored and scanned for the MACS version, the name and the
    command line. The first non-header line is taken to be the
    column names, and all subsequent non-header lines are stored
    as data with an additional leading 'order' column.

    Arguments:
      filen: name of the file to read the MACS output from.
        If None then fp argument must be supplied instead.
      fp: file-like object opened for reading. If None then
        filen argument must be supplied instead. If both filen
        and fp are supplied then fp will be used preferentially.
      name: optional name; if None then the name is taken from
        the '# name = ' header line, if there is one.

    Raises:
      Exception: if no MACS version line was found in the header.

    """
    # Store data
    self.__filen = filen
    self.__name = name
    self.__macs_version = None
    self.__command_line = None
    self.__header = []
    self.__data = None
    # Open file, if necessary
    opened_here = (fp is None)
    if opened_here:
        fp = open(filen, 'r')
    try:
        for line in fp:
            line = line.strip()
            if line.startswith('#') or line == '':
                # Header line
                self.__header.append(line)
                # Detect/extract data from header
                if line.startswith(
                        "# This file is generated by MACS version "):
                    # Look for MACS version
                    self.__macs_version = line.split()[8]
                elif self.__name is None and \
                     line.startswith("# name = "):
                    # Look for 'name' if none set
                    self.__name = line[len("# name = "):]
                elif line.startswith("# Command line: "):
                    # Look for command line
                    self.__command_line = line[16:]
            elif self.__data is None:
                # First line of actual data should be the column names
                columns = line.split('\t')
                # Insert an additional column called 'order'
                columns.insert(0, "order")
                # Set up TabFile to handle actual data
                self.__data = TabFile(column_names=columns)
            else:
                # Assume it's actual data and store it; the leading
                # tab leaves the 'order' column empty for now
                self.__data.append(tabdata="\t%s" % line)
    finally:
        # Close the file handle if we opened it, even when
        # reading the data failed part way through
        if opened_here:
            fp.close()
    # Check that we actually got a version line
    if self.macs_version is None:
        raise Exception("Failed to extract MACS version, not a MACS "
                        "output file?")
    # Populate the 'order' column
    self.update_order()
def __init__(self, well_list_file):
    """
    Create a new ICell8WellList instance.

    The well list file is loaded as tabular data, with its
    first line treated as the header.

    Arguments:
      well_list_file (str): path to the well list file.
    """
    well_list_data = TabFile(filen=well_list_file,
                             first_line_is_header=True)
    self._data = well_list_data
def __init__(self, fastq_strand_out):
    """
    Create a new Fastqstrand instance

    Reads the version string (from the '#fastq_strand version:'
    line) and the table of data (introduced by the '#Genome'
    header line) from the output file. For each genome in the
    table the 'forward' and 'reverse' values are stored along
    with the forward/reverse 'ratio' and a 'strandedness'
    assignment derived from the ratio:

    - ratio < 0.2: "reverse"
    - ratio > 5 (including infinite ratio): "forward"
    - otherwise: "unstranded?"
    - no ratio (forward and reverse both zero): "undetermined"

    Arguments:
      fastq_strand_out (str): path to the fastq_strand output
        file to read the data from

    Raises:
      Exception: if the table of data can't be extracted (no
        '#Genome' header line, or data appearing before it).
    """
    self._fastq_strand_out = os.path.abspath(fastq_strand_out)
    self._version = None
    self._genomes = AttributeDictionary()
    # Read in data
    tabfile = None
    with open(self._fastq_strand_out, 'r') as fp:
        for line in fp:
            line = line.strip()
            if line.startswith('#fastq_strand version:'):
                self._version = line.split()[2]
            elif line.startswith('#Genome'):
                # Header line (drop the leading '#')
                tabfile = TabFile(column_names=line[1:].split('\t'))
            elif not line:
                # Blank lines carry no data
                continue
            elif tabfile is None:
                # Data can't be interpreted without the header
                raise Exception("Unable to extract fastq_strand data "
                                "from %s" % self._fastq_strand_out)
            else:
                tabfile.append(tabdata=line)
    # Check there is some data
    if tabfile is None:
        raise Exception("Unable to extract fastq_strand data from %s" %
                        self._fastq_strand_out)
    # Copy data to main object
    for line in tabfile:
        # Store the data
        data = AttributeDictionary()
        self._genomes[line['Genome']] = data
        data['forward'] = line['1st forward']
        data['reverse'] = line['2nd reverse']
        # Additional processing
        if data.reverse > 0.0:
            ratio = float(data.forward)/float(data.reverse)
        elif data.forward > 0.0:
            # Forward reads but no reverse reads
            ratio = float("+inf")
        else:
            ratio = None
        if ratio is None:
            strandedness = "undetermined"
        elif ratio < 0.2:
            strandedness = "reverse"
        elif ratio > 5:
            # Also covers the infinite ratio case
            strandedness = "forward"
        else:
            strandedness = "unstranded?"
        data['ratio'] = ratio
        data['strandedness'] = strandedness
def report_basic_stats(self, out_file=None, fp=None):
    """
    Report the 'basic' statistics

    For each FASTQ file, report the following information:

    - Project name
    - Sample name
    - FASTQ file name (without leading directory)
    - Size (human-readable)
    - Nreads (number of reads)
    - Paired_end ('Y' for paired-end, 'N' for single-end)

    Arguments:
      out_file (str): name of file to write report to (used if
        'fp' is not supplied)
      fp (File): File-like object open for writing (defaults to
        stdout if 'out_file' also not supplied)
    """
    # Determine output stream
    if fp is not None:
        fpp = fp
    elif out_file is not None:
        fpp = open(out_file, 'w')
    else:
        fpp = sys.stdout
    # Only close the stream if it was opened here
    close_stream = (fp is None and out_file is not None)
    try:
        # Report
        stats = TabFile(column_names=('Project',
                                      'Sample',
                                      'Fastq',
                                      'Size',
                                      'Nreads',
                                      'Paired_end'))
        for line in self._stats:
            stats.append(data=[line[c] for c in stats.header()])
        stats.write(fp=fpp, include_header=True)
    finally:
        # Close file, even if the report couldn't be written
        if close_stream:
            fpp.close()
def report_processing_qc(analysis_dir, html_file):
    """
    Generate HTML report for processing statistics

    The report has a table of contents followed by up to three
    sections, each of which is only added if the corresponding
    statistics file exists:

    - Per-lane statistics
    - Per-lane statistics by sample
    - Per-file statistics by project

    Statistics files given as relative paths are looked up
    relative to the current working directory.

    Arguments:
      analysis_dir (AnalysisDir): analysis directory to report
        the processing from (the 'analysis_dir' and 'params'
        attributes are used)
      html_file (str): destination path and file name for
        HTML report
    """
    # Initialise the HTML report
    processing_qc = Document("Processing report for %s" %
                             os.path.basename(analysis_dir.analysis_dir))
    processing_qc.add_css_rule(css_rules.QC_REPORT_CSS_RULES)
    processing_qc.add_css_rule("table { font-size: 80%;\n"
                               " font-family: sans-serif; }")
    processing_qc.add_css_rule("td { text-align: right; }")
    # Add table of contents
    toc = processing_qc.add_section("Contents", name="toc")
    toc_list = List()
    toc.add(toc_list)
    # Per-lane statistics
    per_lane_stats_file = analysis_dir.params.per_lane_stats_file
    if per_lane_stats_file is None:
        per_lane_stats_file = "per_lane_statistics.info"
    if os.path.exists(per_lane_stats_file):
        per_lane_stats = processing_qc.add_section(
            "Per-lane statistics",
            name="per_lane_stats")
        stats = TabFile(per_lane_stats_file,
                        first_line_is_header=True)
        tbl = Table(columns=stats.header())
        tbl.append_columns("Assigned/unassigned")
        for line in stats:
            n = tbl.add_row()
            for c in stats.header():
                if c in ("Total reads",
                         "Assigned reads",
                         "Unassigned reads"):
                    value = pretty_print_reads(line[c])
                else:
                    value = line[c]
                tbl.set_value(n, c, value)
            # Stacked bar showing assigned vs unassigned reads
            tbl.set_value(n, "Assigned/unassigned",
                          Img(ustackedbar((line["Assigned reads"],
                                           line["Unassigned reads"]),
                                          length=100,
                                          height=15,
                                          colors=('red', 'white'),
                                          inline=True)))
        per_lane_stats.add(tbl)
        toc_list.add_item(Link("Per-lane statistics", per_lane_stats))
    # Per lane by sample statistics
    per_lane_sample_stats_file = "per_lane_sample_stats.info"
    if os.path.exists(per_lane_sample_stats_file):
        per_lane_sample_stats = processing_qc.add_section(
            "Per-lane statistics by sample",
            name="per_lane_sample_stats")
        lane_toc_list = List()
        per_lane_sample_stats.add(lane_toc_list)
        # Store the data for each lane
        lane_data = []
        with open(per_lane_sample_stats_file) as stats:
            for line in stats:
                if line.startswith("Lane "):
                    # Start of the data for a new lane
                    lane = int(line.split(' ')[-1])
                    lane_data.append({'lane': lane,
                                      'total_reads': None,
                                      'samples': []})
                elif line.startswith("Total reads = "):
                    total_reads = int(line.split('=')[-1].strip())
                    lane_data[-1]['total_reads'] = total_reads
                elif line.startswith("- "):
                    # Sample line: "- PROJECT/SAMPLE NREADS %READS"
                    fields = line.split()
                    pname = fields[1].split('/')[0]
                    sname = fields[1].split('/')[1]
                    nreads = int(fields[2])
                    percreads = fields[3]
                    lane_data[-1]['samples'].append(
                        {'pname': pname,
                         'sname': sname,
                         'nreads': nreads,
                         'percreads': percreads})
        # Create a section and table for each lane
        for data in lane_data:
            lane = data['lane']
            s = per_lane_sample_stats.add_subsection(
                "Lane %d" % lane,
                name="per_lane_sample_stats_lane%d" % lane)
            lane_toc_list.add_item(Link("Lane %d" % lane, s))
            if not data['samples']:
                # Nothing to tabulate for this lane (and taking
                # the maximum of no values would fail)
                continue
            max_reads = max([d['nreads'] for d in data['samples']])
            total_reads = data['total_reads']
            current_project = None
            tbl = Table(columns=('pname', 'sname',
                                 'nreads', 'percreads',
                                 'barplot'),
                        pname='Project',
                        sname='Sample',
                        nreads='Nreads',
                        percreads='%reads',
                        barplot='',)
            s.add(tbl)
            for sample in data['samples']:
                pname = sample['pname']
                sname = sample['sname']
                nreads = sample['nreads']
                percreads = sample['percreads']
                # Only show the project name on the first of
                # a run of samples from the same project
                if pname == current_project:
                    pname = " "
                else:
                    current_project = pname
                barplot = ustackedbar((nreads, max_reads - nreads),
                                      length=100,
                                      height=5,
                                      colors=('black', 'lightgrey'),
                                      bbox=False,
                                      inline=True)
                tbl.add_row(pname=pname,
                            sname=sname,
                            nreads=pretty_print_reads(nreads),
                            percreads=percreads,
                            barplot=Img(barplot))
            tbl.add_row(pname="Total reads for lane %d" % lane,
                        nreads=pretty_print_reads(total_reads))
        toc_list.add_item(Link("Per-lane statistics by sample",
                               per_lane_sample_stats),
                          lane_toc_list)
    # Per fastq statistics
    stats_file = "statistics_full.info"
    if not os.path.exists(stats_file):
        if analysis_dir.params.stats_file is not None:
            stats_file = analysis_dir.params.stats_file
        else:
            stats_file = "statistics.info"
    if os.path.exists(stats_file):
        per_file_stats = processing_qc.add_section(
            "Per-file statistics by project",
            name="per_file_stats")
        project_toc_list = List()
        per_file_stats.add(project_toc_list)
        stats = TabFile(stats_file, first_line_is_header=True)
        projects = sorted(list(set([d['Project'] for d in stats])))
        # Lists (not 'filter' objects) are built explicitly so
        # that they can be traversed more than once
        lanes = [c for c in stats.header() if c.startswith('L')]
        for project in projects:
            # Get subset of lines for this project
            subset = [d for d in stats if d['Project'] == project]
            # Lanes with a non-empty value for at least one line
            subset_lanes = [l for l in lanes
                            if any(d[l] for d in subset)]
            s = per_file_stats.add_subsection(
                "%s" % project,
                name="per_file_stats_%s" % project)
            project_toc_list.add_item(Link("%s" % project, s))
            tbl = Table(columns=('Sample', 'Fastq', 'Size'))
            if subset_lanes:
                tbl.append_columns(*subset_lanes)
            tbl.append_columns('Barplot', 'Nreads')
            s.add(tbl)
            # Reset for each project so that the first row of
            # each table always shows the sample name
            sample = None
            for line in subset:
                if sample == line['Sample']:
                    sname = " "
                else:
                    sample = line['Sample']
                    sname = sample
                data = {'Sample': sname,
                        'Fastq': line['Fastq'],
                        'Size': line['Size'],
                        'Nreads': (pretty_print_reads(line['Nreads'])
                                   if line['Nreads'] != '' else '')}
                for l in subset_lanes:
                    data[l] = (pretty_print_reads(line[l])
                               if line[l] != '' else '')
                barplot = ustackedbar([line[l] for l in subset_lanes
                                       if line[l] != ''],
                                      length=100,
                                      height=10,
                                      colors=('grey', 'lightgrey'),
                                      bbox=True,
                                      inline=True)
                data['Barplot'] = Img(barplot)
                tbl.add_row(**data)
        toc_list.add_item(Link("Per-file statistics by project",
                               per_file_stats),
                          project_toc_list)
    # Write the processing QC summary file
    processing_qc.write(html_file)
# line then use "XLS_<input_name>.xls" if len(args) == 2: xls_out = args[1] else: # MACS output file might already have an .xls extension # but we'll add an explicit .xls extension xls_out = "XLS_" + os.path.splitext( os.path.basename(macs_in))[0] + ".xls" print "Input file: %s" % macs_in print "Output XLS: %s" % xls_out # Extract the header from the MACS and feed actual data to # TabFile object header = [] data = TabFile(column_names=[ 'chr', 'start', 'end', 'length', 'summit', 'tags', '-10*log10(pvalue)', 'fold_enrichment', 'FDR(%)' ]) fp = open(macs_in, 'r') for line in fp: if line.startswith('#') or line.strip() == '': # Header line header.append(line.strip()) else: # Data data.append(tabdata=line.strip()) fp.close() # Temporarily remove first line header_line = str(data[0]) del (data[0])
def _get_data(self, filen=None):
    """
    Collect statistics for FASTQ outputs from an Illumina run

    Statistics are collected for each Fastq file in each sample
    of each project (plus the undetermined reads, if present)
    and stored in 'self._stats', with one column of read counts
    for each lane. The list of lane numbers is stored in
    'self._lanes'.

    Arguments:
      filen (str): optional, path to an existing statistics
        file; if supplied then its contents are copied into
        the new data first, and entries matching a Fastq file
        from this run (same project, sample and Fastq name)
        are overwritten with the newly collected values.
    """
    # Collect FASTQ files
    fastqstats = []
    for project in self._illumina_data.projects:
        for sample in project.samples:
            for fastq in sample.fastq:
                fastqstats.append(
                    FastqStats(os.path.join(sample.dirn, fastq),
                               project.name,
                               sample.name))
    # Gather same information for undetermined reads (if present)
    if self._illumina_data.undetermined is not None:
        for lane in self._illumina_data.undetermined.samples:
            for fastq in lane.fastq:
                fastqstats.append(
                    FastqStats(os.path.join(lane.dirn, fastq),
                               self._illumina_data.undetermined.name,
                               lane.name))
    # Collect the data for each file
    if self._n_processors > 1:
        # Multiple cores
        pool = Pool(self._n_processors)
        try:
            results = pool.map(collect_fastq_data, fastqstats)
        finally:
            # Always shut down the worker processes
            pool.close()
            pool.join()
    else:
        # Single core
        # (explicit list as the results are traversed several times)
        results = [collect_fastq_data(f) for f in fastqstats]
    # Set up tabfile to hold pre-existing data
    if filen is not None:
        existing_stats = TabFile(filen, first_line_is_header=True)
    else:
        existing_stats = None
    # Set up class to hold all collected data
    self._stats = TabFile(column_names=('Project',
                                        'Sample',
                                        'Fastq',
                                        'Size',
                                        'Nreads',
                                        'Paired_end',
                                        'Read_number'))
    # Split result sets into R1 and R2
    results_r1 = [f for f in results if f.read_number == 1]
    results_r2 = [f for f in results if f.read_number == 2]
    # Determine which lanes are present and append
    # columns for each
    lanes = set()
    for fastq in results_r1:
        logger.debug("-- %s: lanes %s" %
                     (fastq.name,
                      ','.join([str(l) for l in fastq.lanes])))
        for lane in fastq.lanes:
            lanes.add(lane)
    # Add lane numbers from pre-existing stats file
    if existing_stats is not None:
        for c in existing_stats.header():
            if c.startswith('L'):
                lanes.add(int(c[1:]))
    self._lanes = sorted(list(lanes))
    logger.debug("Lanes found: %s" %
                 ','.join([str(l) for l in self._lanes]))
    for lane in self._lanes:
        self._stats.appendColumn("L%s" % lane)
    # NB when building rows of data the lanes must be taken in
    # the same (sorted) order as the lane columns were appended
    # above; iterating over the 'lanes' set isn't guaranteed
    # to give that order
    # Copy pre-existing stats into new tabfile
    if existing_stats is not None:
        for line in existing_stats:
            data = [line['Project'],
                    line['Sample'],
                    line['Fastq'],
                    line['Size'],
                    line['Nreads'],
                    line['Paired_end'],
                    line['Read_number']]
            for lane in self._lanes:
                try:
                    data.append(line["L%s" % lane])
                except Exception:
                    # Lane not present in the pre-existing data
                    data.append('')
            self._stats.append(data=data)
    # Copy reads per lane from R1 FASTQs into R2
    for r2_fastq in results_r2:
        # Get corresponding R1 name
        logger.debug("-- Fastq R2: %s" % r2_fastq.name)
        r1_fastq_name = IlluminaFastq(r2_fastq.name)
        r1_fastq_name.read_number = 1
        r1_fastq_name = str(r1_fastq_name)
        logger.debug("--    -> R1: %s" % r1_fastq_name)
        # Locate corresponding data
        r1_fastq = [f for f in results_r1
                    if f.name.startswith(r1_fastq_name)][0]
        r2_fastq.reads_by_lane = dict(r1_fastq.reads_by_lane)
    # Write the data into the tabfile
    paired_end = ('Y' if self._illumina_data.paired_end else 'N')
    for fastq in results:
        # Check for existing entry
        existing_line = None
        for line in self._stats:
            if (line['Project'] == fastq.project and
                line['Sample'] == fastq.sample and
                line['Fastq'] == fastq.name):
                existing_line = line
                break
        # Write the data
        if existing_line is None:
            # Append new entry
            data = [fastq.project,
                    fastq.sample,
                    fastq.name,
                    bcf_utils.format_file_size(fastq.fsize),
                    fastq.nreads,
                    paired_end,
                    fastq.read_number]
            for lane in self._lanes:
                try:
                    data.append(fastq.reads_by_lane[lane])
                except Exception:
                    # No reads recorded for this lane
                    data.append('')
            self._stats.append(data=data)
        else:
            # Overwrite existing entry
            logger.warning("Overwriting exisiting entry for "
                           "%s/%s/%s" % (fastq.project,
                                         fastq.sample,
                                         fastq.name))
            existing_line['Size'] = bcf_utils.format_file_size(fastq.fsize)
            existing_line['Nreads'] = fastq.nreads
            existing_line['Paired_end'] = paired_end
            existing_line['Read_number'] = fastq.read_number
            for lane in self._lanes:
                lane_name = "L%d" % lane
                try:
                    existing_line[lane_name] = fastq.reads_by_lane[lane]
                except Exception:
                    # No reads recorded for this lane
                    existing_line[lane_name] = ''
def report_per_lane_summary_stats(self, out_file=None, fp=None):
    """
    Report summary of total and unassigned reads per-lane

    For each lane the total, assigned and unassigned numbers
    of reads are reported, along with the assigned and
    unassigned reads as percentages of the total. Only the
    read counts from R1 Fastqs which aren't index reads are
    used, and the reads from the 'Undetermined_indices'
    project are counted as unassigned.

    Arguments:
      out_file (str): name of file to write report to (used if
        'fp' is not supplied)
      fp (File): File-like object open for writing (defaults to
        stdout if 'out_file' also not supplied)
    """
    # Determine output stream
    if fp is not None:
        fpp = fp
    elif out_file is not None:
        fpp = open(out_file, 'w')
    else:
        fpp = sys.stdout
    # Only close the stream if it was opened here
    close_stream = (fp is None and out_file is not None)
    try:
        # Set up TabFile to hold the data collected
        per_lane_stats = TabFile(column_names=('Lane',
                                               'Total reads',
                                               'Assigned reads',
                                               'Unassigned reads',
                                               '%assigned',
                                               '%unassigned'))
        # Initialise counts for each lane
        assigned = {}
        unassigned = {}
        for lane in self.lane_names:
            assigned[lane] = 0
            unassigned[lane] = 0
        # Count assigned and unassigned (= undetermined) reads
        for line in self._stats:
            # Only use the R1 Fastqs which aren't index reads
            if line['Read_number'] != 1:
                continue
            if IlluminaFastq(line['Fastq']).is_index_read:
                continue
            if line['Project'] != 'Undetermined_indices':
                counts = assigned
            else:
                counts = unassigned
            for lane in self.lane_names:
                if line[lane]:
                    counts[lane] = counts.get(lane, 0) + line[lane]
        # Write out data for each lane
        for lane in self.lane_names:
            lane_number = int(lane[1:])
            assigned_reads = assigned[lane]
            unassigned_reads = unassigned.get(lane, 0)
            total_reads = assigned_reads + unassigned_reads
            if total_reads > 0:
                percent_assigned = float(assigned_reads)/ \
                                   float(total_reads)*100.0
                percent_unassigned = float(unassigned_reads)/ \
                                     float(total_reads)*100.0
            else:
                # Avoid division by zero for lanes with no reads
                percent_assigned = 0.0
                percent_unassigned = 0.0
            per_lane_stats.append(data=("Lane %d" % lane_number,
                                        total_reads,
                                        assigned_reads,
                                        unassigned_reads,
                                        "%.2f" % percent_assigned,
                                        "%.2f" % percent_unassigned))
        # Write to file
        per_lane_stats.write(fp=fpp, include_header=True)
    finally:
        # Close file, even if the report couldn't be written
        if close_stream:
            fpp.close()
# Remove the working directory shutil.rmtree(working_dir) # Report the stats if args.stats_file is not None: # Output column names stats_file = os.path.abspath(args.stats_file) nreads_col = "Nreads%s" % ('' if args.suffix is None else args.suffix) umis_col = "Distinct_UMIs%s" % ('' if args.suffix is None else args.suffix) if not (os.path.isfile(stats_file) and args.append): # Create new stats file if well_list is not None: # Initialise barcode and sample names from well list stats_data = TabFile(column_names=('Barcode', 'Sample')) for barcode in well_list.barcodes(): stats_data.append(data=(barcode, well_list.sample(barcode))) else: # Barcodes from collected data stats_data = TabFile(column_names=('Barcode', )) for barcode in stats.barcodes(): stats_data.append(data=(barcode, )) else: # Append to an existing file stats_data = TabFile(filen=stats_file, first_line_is_header=True) # Add new columns of data stats_data.appendColumn(nreads_col) stats_data.appendColumn(umis_col) # Populate columns
def report_processing_qc(analysis_dir, html_file):
    """
    Generate HTML report for processing statistics

    The report has a table of contents and a warnings banner,
    followed by up to three sections, each of which is only
    added if the corresponding statistics file exists:

    - Per-lane statistics
    - Per-lane statistics by sample
    - Per-file statistics by project

    Warnings are flagged for lanes with no samples, samples
    with no reads, and Fastqs with zero (or no) read counts
    in one or more lanes. The warnings banner is hidden if
    there are no warnings, and a hidden 'status' section is
    added which records 'OK' or 'WARNINGS'.

    Arguments:
      analysis_dir (AutoProcess): AutoProcess instance for the
        directory to report the processing from
      html_file (str): destination path and file name for
        HTML report
    """
    # Initialise the HTML report
    processing_qc = Document("Processing report for %s" %
                             os.path.basename(analysis_dir.analysis_dir))
    processing_qc.add_css_rule(css_rules.QC_REPORT_CSS_RULES)
    processing_qc.add_css_rule("table { font-size: 80%;\n"
                               " font-family: sans-serif; }")
    processing_qc.add_css_rule("td { text-align: right; }")
    # NB hexadecimal colours need the leading '#' to be valid CSS
    processing_qc.add_css_rule("p.warning { padding: 5px;\n"
                               " border: solid 1px red;\n"
                               " background-color: #F5BCA9;\n"
                               " color: red;\n"
                               " font-weight: bold;\n"
                               " border-radius: 10px;\n"
                               " display: inline-block; }")
    processing_qc.add_css_rule(".warnings { padding: 2px;\n"
                               " border: solid 3px red;\n"
                               " background-color: #F5BCA9;\n"
                               " color: red;\n"
                               " font-weight: bold;\n"
                               " margin: 10px;\n"
                               " border-radius: 10px;\n"
                               " display: inline-block; }")
    processing_qc.add_css_rule("img { vertical-align: middle; }")
    processing_qc.add_css_rule(".hide { display: none; }")
    # Add table of contents
    toc = processing_qc.add_section("Contents", name="toc")
    toc_list = List()
    toc.add(toc_list)
    # Add warnings section
    # This will be hidden if there are no issues
    status = True
    warnings_section = processing_qc.add_section(
        css_classes=("warnings",))
    warnings_section.add(
        Para(WarningIcon(size=50),
             "There are issues with one or more lanes or samples"))
    # Per-lane statistics
    per_lane_stats_file = analysis_dir.params.per_lane_stats_file
    if per_lane_stats_file is None:
        per_lane_stats_file = "per_lane_statistics.info"
    per_lane_stats_file = get_absolute_file_path(
        per_lane_stats_file,
        base=analysis_dir.analysis_dir)
    if os.path.exists(per_lane_stats_file):
        per_lane_stats = processing_qc.add_section(
            "Per-lane statistics",
            name="per_lane_stats")
        stats = TabFile(per_lane_stats_file,
                        first_line_is_header=True)
        tbl = Table(columns=stats.header())
        tbl.append_columns("Assigned/unassigned")
        for line in stats:
            n = tbl.add_row()
            for c in stats.header():
                if c in ("Total reads",
                         "Assigned reads",
                         "Unassigned reads"):
                    value = pretty_print_reads(line[c])
                else:
                    value = line[c]
                tbl.set_value(n, c, value)
            # Stacked bar showing assigned vs unassigned reads
            tbl.set_value(n, "Assigned/unassigned",
                          Img(ustackedbar((line["Assigned reads"],
                                           line["Unassigned reads"]),
                                          length=100,
                                          height=15,
                                          colors=('red', 'white'),
                                          inline=True)))
        per_lane_stats.add(tbl)
        toc_list.add_item(Link("Per-lane statistics", per_lane_stats))
    # Per lane by sample statistics
    per_lane_sample_stats_file = get_absolute_file_path(
        "per_lane_sample_stats.info",
        base=analysis_dir.analysis_dir)
    if os.path.exists(per_lane_sample_stats_file):
        per_lane_sample_stats = processing_qc.add_section(
            "Per-lane statistics by sample",
            name="per_lane_sample_stats")
        lane_toc_list = List()
        per_lane_sample_stats.add(lane_toc_list)
        # Store the data for each lane
        lane_data = list()
        with open(per_lane_sample_stats_file, 'r') as stats:
            for line in stats:
                if line.startswith("Lane "):
                    # Start of the data for a new lane
                    lane = int(line.split(' ')[-1])
                    lane_data.append({'lane': lane,
                                      'total_reads': None,
                                      'samples': []})
                elif line.startswith("Total reads = "):
                    total_reads = int(line.split('=')[-1].strip())
                    lane_data[-1]['total_reads'] = total_reads
                elif line.startswith("- "):
                    # Sample line: "- PROJECT/SAMPLE NREADS %READS"
                    fields = line.split()
                    pname = fields[1].split('/')[0]
                    sname = fields[1].split('/')[1]
                    nreads = int(fields[2])
                    percreads = fields[3]
                    lane_data[-1]['samples'].append(
                        {'pname': pname,
                         'sname': sname,
                         'nreads': nreads,
                         'percreads': percreads})
        # Create a section and table for each lane
        for data in lane_data:
            lane = data['lane']
            s = per_lane_sample_stats.add_subsection(
                "Lane %d" % lane,
                name="per_lane_sample_stats_lane%d" % lane)
            # Check for problems
            has_warnings = False
            if not data['samples']:
                # No samples reported
                s.add(Para(WarningIcon(),
                           "No samples reported for this lane",
                           css_classes=('warning',)))
                has_warnings = True
            elif min([d['nreads'] for d in data['samples']]) == 0:
                # There are samples with no reads
                s.add(Para(WarningIcon(),
                           "One or more samples with no reads",
                           css_classes=('warning',)))
                has_warnings = True
            # Add link to lane for lane ToC
            link = Link("Lane %d" % lane, s)
            if not has_warnings:
                lane_toc_list.add_item(link)
            else:
                lane_toc_list.add_item(WarningIcon(), link)
                status = False
            # Write out the data, if there is any
            if not data['samples']:
                continue
            max_reads = max([d['nreads'] for d in data['samples']])
            total_reads = data['total_reads']
            current_project = None
            tbl = Table(columns=('pname', 'sname',
                                 'nreads', 'percreads',
                                 'barplot'),
                        pname='Project',
                        sname='Sample',
                        nreads='Nreads',
                        percreads='%reads',
                        barplot='',)
            s.add(tbl)
            # Sort the sample data into order of sample name
            samples = sorted(data['samples'],
                             key=lambda d: split_sample_name(d['sname']))
            # Write the table
            for sample in samples:
                pname = sample['pname']
                sname = sample['sname']
                nreads = sample['nreads']
                percreads = sample['percreads']
                # Only show the project name on the first of
                # a run of samples from the same project
                if pname == current_project:
                    pname = " "
                else:
                    current_project = pname
                barplot = ustackedbar((nreads, max_reads - nreads),
                                      length=100,
                                      height=5,
                                      colors=('black', 'lightgrey'),
                                      bbox=False,
                                      inline=True)
                if nreads == 0:
                    # Flag samples with no reads
                    sname = Para(WarningIcon(), sname)
                tbl.add_row(pname=pname,
                            sname=sname,
                            nreads=pretty_print_reads(nreads),
                            percreads=percreads,
                            barplot=Img(barplot))
            tbl.add_row(pname="Total reads for lane %d" % lane,
                        nreads=pretty_print_reads(total_reads))
        # Add link to section from main ToC
        toc_list.add_item(Link("Per-lane statistics by sample",
                               per_lane_sample_stats),
                          lane_toc_list)
    # Per fastq statistics
    stats_file = get_absolute_file_path(
        "statistics_full.info",
        base=analysis_dir.analysis_dir)
    if not os.path.exists(stats_file):
        if analysis_dir.params.stats_file is not None:
            stats_file = analysis_dir.params.stats_file
        else:
            stats_file = "statistics.info"
        stats_file = get_absolute_file_path(
            stats_file,
            base=analysis_dir.analysis_dir)
    if os.path.exists(stats_file):
        per_file_stats = processing_qc.add_section(
            "Per-file statistics by project",
            name="per_file_stats")
        project_toc_list = List()
        per_file_stats.add(project_toc_list)
        stats = TabFile(stats_file, first_line_is_header=True)
        projects = sorted(list(set([d['Project'] for d in stats])))
        # Lists (not 'filter' objects) are built explicitly so
        # that they can be tested for emptiness and traversed
        # more than once
        lanes = [c for c in stats.header() if c.startswith('L')]
        for project in projects:
            # Get subset of lines for this project
            subset = sorted([d for d in stats
                             if d['Project'] == project],
                            key=lambda l: split_sample_name(l['Sample']))
            # Work out which lanes are included
            subset_lanes = [l for l in lanes
                            if any(d[l] for d in subset)]
            # Add a new section for this project
            s = per_file_stats.add_subsection(
                "%s" % project,
                name="per_file_stats_%s" % project)
            # Check for problems
            has_warnings = False
            for line in subset:
                nreads = [line[l] for l in subset_lanes
                          if line[l] != '']
                if not nreads or min(nreads) == 0:
                    s.add(Para(WarningIcon(),
                               "One or more Fastqs with zero "
                               "read counts in one or more lanes",
                               css_classes=('warning',)))
                    has_warnings = True
                    break
            # Add link to project from ToC
            link = Link("%s" % project, s)
            if not has_warnings:
                project_toc_list.add_item(link)
            else:
                project_toc_list.add_item(WarningIcon(), link)
                status = False
            # Build the table of data
            tbl = Table(columns=('Sample', 'Fastq', 'Size'))
            if subset_lanes:
                tbl.append_columns(*subset_lanes)
            tbl.append_columns('Barplot', 'Nreads')
            s.add(tbl)
            # Reset for each project so that the first row of
            # each table always shows the sample name
            sample = None
            for line in subset:
                if sample == line['Sample']:
                    sname = " "
                else:
                    sample = line['Sample']
                    sname = sample
                data = {'Sample': sname,
                        'Fastq': line['Fastq'],
                        'Size': line['Size'],
                        'Nreads': (pretty_print_reads(line['Nreads'])
                                   if line['Nreads'] != '' else '')}
                for l in subset_lanes:
                    data[l] = (pretty_print_reads(line[l])
                               if line[l] != '' else '')
                nreads = [line[l] for l in subset_lanes
                          if line[l] != '']
                if not nreads:
                    nreads = [0,]
                if min(nreads) == 0:
                    # Add warning icon to Fastq with no reads in
                    # at least one lane
                    data['Fastq'] = Para(WarningIcon(), data['Fastq'])
                barplot = ustackedbar(nreads,
                                      length=100,
                                      height=10,
                                      colors=('grey', 'lightgrey'),
                                      bbox=True,
                                      inline=True)
                data['Barplot'] = Img(barplot)
                tbl.add_row(**data)
        toc_list.add_item(Link("Per-file statistics by project",
                               per_file_stats),
                          project_toc_list)
    # Set the visibility of the warning header
    if status:
        warnings_section.add_css_classes("hide")
    # Add an non-visible section that the publisher can
    # read to determine if there were problems
    s = processing_qc.add_section(name="status", css_classes=("hide",))
    s.add("Status: %s" % ('OK' if status else 'WARNINGS',))
    # Write the processing QC summary file
    processing_qc.write(html_file)
def add_per_fastq_statistics(self):
    """
    Add a section with the per-Fastq statistics

    The section has a subsection for each project found in
    the statistics file, with a table listing the sample,
    Fastq name, size, read counts for each lane that the
    project appears in, a bar plot of the per-lane read
    counts and the total number of reads for each Fastq.

    A warning is added to the subsection for any project
    which has a Fastq with zero (or no) read counts in one
    or more lanes, and warnings are flagged for the report.
    The section is also added to the table of contents.

    Nothing is added if the statistics file doesn't exist.
    """
    # Per fastq statistics
    if not os.path.exists(self._stats_file):
        logger.debug("No per-Fastq statistics file found")
        return
    per_file_stats = self.add_section(
        "Per-file statistics by project",
        name="per_file_stats")
    project_toc_list = List()
    per_file_stats.add(project_toc_list)
    stats = TabFile(self._stats_file, first_line_is_header=True)
    projects = sorted(list(set([d['Project'] for d in stats])))
    lanes = [c for c in stats.header() if c.startswith('L')]
    for project in projects:
        # Get subset of lines for this project
        subset = sorted([d for d in stats if d['Project'] == project],
                        key=lambda l: split_sample_name(l['Sample']))
        # Determine which lanes this project appears in
        subset_lanes = [l for l in lanes
                        if any(d[l] for d in subset)]
        # Add a new section for this project
        s = per_file_stats.add_subsection(
            "%s" % project,
            name="per_file_stats_%s" % project)
        # Check for problems
        has_warnings = False
        for line in subset:
            nreads = [line[l] for l in subset_lanes if line[l] != '']
            if not nreads or min(nreads) == 0:
                s.add(self.warning("One or more Fastqs with zero read "
                                   "counts in one or more lanes"))
                has_warnings = True
                break
        # Add link to project from ToC
        link = Link("%s" % project, s)
        if not has_warnings:
            project_toc_list.add_item(link)
        else:
            project_toc_list.add_item(WarningIcon(), link)
            self.flag_warnings()
        # Build the table of data
        tbl = Table(columns=('Sample', 'Fastq', 'Size'))
        if subset_lanes:
            tbl.append_columns(*subset_lanes)
        tbl.append_columns('Barplot', 'Nreads')
        s.add(tbl)
        # Reset for each project so that the first row of
        # each table always shows the sample name
        sample = None
        for line in subset:
            if sample == line['Sample']:
                sname = " "
            else:
                sample = line['Sample']
                sname = sample
            data = {'Sample': sname,
                    'Fastq': line['Fastq'],
                    'Size': line['Size'],
                    'Nreads': (pretty_print_reads(line['Nreads'])
                               if line['Nreads'] != '' else '')}
            for l in subset_lanes:
                data[l] = (pretty_print_reads(line[l])
                           if line[l] != '' else '')
            nreads = [line[l] for l in subset_lanes if line[l] != '']
            if not nreads:
                nreads = [0,]
            if min(nreads) == 0:
                # Add warning icon to Fastq with no reads in
                # at least one lane
                data['Fastq'] = Para(WarningIcon(), data['Fastq'])
            barplot = ustackedbar(nreads,
                                  length=100,
                                  height=10,
                                  colors=('grey', 'lightgrey'),
                                  bbox=True,
                                  inline=True)
            data['Barplot'] = Img(barplot)
            tbl.add_row(**data)
    # Add to table of contents
    self.add_to_toc("Per-file statistics by project",
                    per_file_stats,
                    project_toc_list)