def make_key_genes_cov_report(experiment_by_key):
    """Build a per-gene coverage table with one column group per experiment.

    For every experiment three metrics are declared (average depth, percent
    covered at the experiment's depth cutoff, CNV call).  One row is then
    written per gene name, rows being ordered by gene name.

    :param experiment_by_key: mapping ``key -> experiment``.  The code reads
        ``sample``, ``ave_depth``, ``depth_cutoff``, ``key`` and
        ``key_gene_by_name`` from each experiment.  Metric names are built
        from the mapping key while records are written under
        ``experiment.key``, so the two are presumably equal -- confirm
        against the caller.
    :return: the populated ``PerRegionSampleReport``.
    :raises IndexError: if ``experiment_by_key`` is empty.
    """
    info('Making key genes coverage report...')

    ms = [
        Metric('Gene'),
        Metric('Chr', with_heatmap=False, max_width=20, align='right'),
    ]
    for i, (k, e) in enumerate(experiment_by_key.items()):
        ms.extend([
            Metric(k + ' Ave depth',
                   short_name=k + '\nave depth',
                   med=e.ave_depth,
                   class_='shifted_column' if i == 0 else ''),
            Metric(k + ' % cov at {}x'.format(e.depth_cutoff),
                   short_name='% at {}x'.format(e.depth_cutoff),
                   unit='%',
                   med=1,
                   low_inner_fence=0.5,
                   low_outer_fence=0.1),
            # short name is hack for IE9 who doesn't have "text-align: left"
            # and tries to stick "CNV" to the previous col header
            Metric(k + ' CNV', short_name=' CNV'),
        ])

    clinical_cov_metric_storage = MetricStorage(
        sections=[ReportSection(metrics=ms)])

    # list(...) keeps this working when values() is a view (Python 3) while
    # still raising IndexError on an empty mapping, as the original did.
    first_experiment = list(experiment_by_key.values())[0]
    key_genes_report = PerRegionSampleReport(
        sample=first_experiment.sample,
        metric_storage=clinical_cov_metric_storage)

    # Writing records: group each experiment's gene objects by gene name.
    hits_by_gene_by_experiment = OrderedDefaultDict(OrderedDict)
    for e in experiment_by_key.values():
        for gene in e.key_gene_by_name.values():
            hits_by_gene_by_experiment[gene.name][e] = gene

    # Sort on the gene name only.  A plain lambda replaces the Python-2-only
    # tuple-parameter form ``lambda (gname, h): gname``.
    for gname, hit_by_experiment in sorted(
            hits_by_gene_by_experiment.items(), key=lambda item: item[0]):
        # Any non-None hit supplies the gene-level fields (name, chrom).
        gene = next(
            (m for m in hit_by_experiment.values() if m is not None), None)

        row = key_genes_report.add_row()
        row.add_record('Gene', gene.name)
        row.add_record('Chr', gene.chrom.replace('chr', ''))

        for e, hit in hit_by_experiment.items():
            row.add_record(e.key + ' Ave depth', hit.ave_depth)

            m = clinical_cov_metric_storage.find_metric(
                e.key + ' % cov at {}x'.format(e.depth_cutoff))
            # Coverage at exactly this experiment's cutoff, or None if that
            # threshold was not recorded for the gene.
            row.add_record(
                m.name,
                next((cov for cutoff, cov in hit.cov_by_threshs.items()
                      if cutoff == e.depth_cutoff), None))

            # Only amplifications and deletions are reported as CNV.
            if hit.seq2c_event and (hit.seq2c_event.is_amp()
                                    or hit.seq2c_event.is_del()):
                row.add_record(
                    e.key + ' CNV',
                    hit.seq2c_event.amp_del + ', ' + hit.seq2c_event.fragment)

    return key_genes_report
def _prep_comb_report(metric_storage, samples, shared_general_metrics,
                      shared_metrics):
    """Create the 'Combined' report skeleton shared by all samples.

    The shared metric lists are copied (never mutated) and extended with one
    average-depth metric and one depths metric per sample.  The general
    section of the new report is then filled with a description of the
    samples and with each sample's 'Average sample depth' value.

    :param metric_storage: storage of the single-sample reports; used to look
        up the 'Average sample depth' metric.
    :param samples: samples to combine; ``name`` and ``report.records`` are
        read from each.
    :param shared_general_metrics: general-section metrics common to reports.
    :param shared_metrics: per-region metrics common to reports.
    :return: the new ``PerRegionSampleReport``.
    """
    # General section: shared metrics, a caption metric, one depth per sample.
    general_metrics = shared_general_metrics[:]
    general_metrics.append(Metric('For each sample'))
    general_metrics.extend(
        Metric(smp.name + ' ave depth') for smp in samples)

    # Per-region section: shared metrics plus one depths column per sample.
    region_metrics = shared_metrics[:]
    region_metrics.extend(
        DepthsMetric(smp.name + ' hotspots depths/norm depths',
                     short_name=smp.name)
        for smp in samples)

    general_section = ReportSection('general_section',
                                    metrics=general_metrics)
    regions_section = ReportSection(metrics=region_metrics)
    combined_storage = MetricStorage(general_section=general_section,
                                     sections=[regions_section])

    combined = PerRegionSampleReport(sample='Combined',
                                     metric_storage=combined_storage)

    joined_names = ', '.join(smp.name for smp in samples)
    combined.add_record('Sample',
                        'contains values from all samples: ' + joined_names)
    combined.add_record('For each sample',
                        'Depths and normalized depths for each hotspot.')

    # Copy every sample's average depth out of its own report.
    depth_metric = metric_storage.find_metric('Average sample depth')
    for smp in samples:
        depth_record = BaseReport.find_record(smp.report.records,
                                              depth_metric.name)
        combined.add_record(smp.name + ' ave depth', depth_record.value)

    return combined
def _get_targqc_metric_storage(metric_storages_by_report_type):
    """Merge several metric storages into a single TargQC metric storage.

    Sections that share a name are merged, keeping the title of the first
    storage that declares the section.  A metric is kept only when
    ``_get_targqc_metric`` returns a metric equal to it.

    :param metric_storages_by_report_type: iterable of
        ``(report_type, metric_storage)`` pairs.  It must contain a
        ``'targetcov'`` entry, which is passed to ``_get_targqc_metric`` for
        every metric that is tested.
    :return: a ``MetricStorage`` with the merged general section and sections.
    :raises KeyError: if a metric is tested and there is no ``'targetcov'``
        entry.
    :raises AttributeError: if the input is empty (no general section seen).
    """
    class SectionId:
        """Section key that hashes and compares by name only."""

        def __init__(self, name, title):
            self.name = name
            self.title = title

        def __hash__(self):
            # return hash((self.name, self.title))
            return hash(self.name)  # use title from the first metric_storage

        def __eq__(self, other):
            # return (self.name, self.title) == (other.name, other.title)
            return self.name == other.name  # use title from the first metric_storage

    # Materialise once: the pairs are both iterated and turned into a lookup
    # table, which would not work on a one-shot iterable.  Building the dict
    # here also avoids rebuilding it for every single metric.
    storages = list(metric_storages_by_report_type)
    storage_by_report_type = dict(storages)

    def _is_targqc_metric(metric, report_type):
        # The 'targetcov' lookup stays lazy so a missing entry only raises
        # when a metric is actually tested, as before.
        return metric == _get_targqc_metric(
            metric, storage_by_report_type['targetcov'], report_type)

    metrics_by_sections = OrderedDict()
    general_section_id = None
    general_section_metric_list = []

    for report_type, metric_storage in storages:
        for section in metric_storage.sections:
            section_id = SectionId(section.name, section.title)
            # setdefault keeps the first-seen key, hence the first title.
            section_metrics = metrics_by_sections.setdefault(section_id, [])
            section_metrics += [
                metric for metric in metric_storage.get_metrics(
                    sections=[section], skip_general_section=True)
                if _is_targqc_metric(metric, report_type)
            ]

        # specific behaviour for general section
        general_section_metric_list += [
            metric for metric in metric_storage.general_section.metrics
            if _is_targqc_metric(metric, report_type)
        ]
        if general_section_id is None:
            general_section_id = SectionId(
                metric_storage.general_section.name,
                metric_storage.general_section.title)

    sections = [
        ReportSection(section_id.name, section_id.title, metric_list)
        for section_id, metric_list in metrics_by_sections.items()
    ]

    return MetricStorage(
        general_section=ReportSection(general_section_id.name,
                                      general_section_id.title,
                                      general_section_metric_list),
        sections=sections)
def make_expression_heatmap(bcbio_structure, gene_counts):
    """Build an expandable per-gene expression table with one column per sample.

    Genes are processed in sorted key order.  When the first record of a gene
    is flagged ``is_hidden_row``, a summary row holding the per-sample sum of
    all the gene's records is written first.  Every record then gets its own
    row, limited to samples known to ``bcbio_structure``.

    :param bcbio_structure: project structure; only ``samples`` (objects with
        a ``name``) is read.
    :param gene_counts: mapping ``gene -> non-empty list of records``.  The
        code reads ``name``, ``gene_name``, ``is_hidden_row`` and ``counts``
        (a mapping ``sample name -> count``) from each record.
    :return: the populated ``PerRegionSampleReport``.
    """
    samples_names = [sample.name for sample in bcbio_structure.samples]
    # Set copy for O(1) membership tests; the list keeps the column order.
    known_samples = set(samples_names)

    metrics = [Metric('Gene')] + [
        Metric(sample_name, max_width=40) for sample_name in samples_names
    ]
    metric_storage = MetricStorage(
        sections=[ReportSection(metrics=metrics, name='samples')])
    report = PerRegionSampleReport(metric_storage=metric_storage,
                                   expandable=True,
                                   unique=True,
                                   heatmap_by_rows=True,
                                   keep_order=True,
                                   large_table=True,
                                   vertical_sample_names=True)

    # Writing records
    for gene in sorted(gene_counts):
        records = gene_counts[gene]
        first_record = records[0]

        if first_record.is_hidden_row:
            # Collapsible summary row: per-sample total over all records.
            row = report.add_row()
            row.add_record('Gene', first_record.gene_name)
            for sample in samples_names:
                row.add_record(
                    sample, sum(record.counts[sample] for record in records))
            row.class_ = ' expandable_gene_row collapsed'

        for record in records:
            row = report.add_row()
            if record.is_hidden_row:
                row_class = ' row_to_hide row_hidden'
            else:
                row_class = ' expandable_gene_row collapsed'

            row.add_record('Gene', record.name)
            # .items() works on both Python 2 and 3 mappings.
            for sample, count in record.counts.items():
                if sample in known_samples:
                    row.add_record(sample, count)
            row.class_ = row_class

    return report
Metric('Strand'), Metric('Feature'), Metric('Biotype'), Metric('ID'), Metric('Hotspots num', short_name='#HS'), VariantsMetric('Hotspots list', short_name='Hotspots') ] shared_general_metrics = [Metric('Sample', short_name='Sample', common=True)] single_report_metric_storage = MetricStorage( general_section=ReportSection( 'general_section', '', metrics=shared_general_metrics + [Metric('Average sample depth', short_name='Ave depth', common=True)]), sections=[ ReportSection(metrics=shared_metrics + [ DepthsMetric('Hotspots depths/norm depths', short_name='DP/Norm_DP') ]) ]) tricky_regions_fnames_d = { 'bad_promoter': 'Bad promoter', 'gc0to15': 'GC 0-15%', 'gc15to20': 'Low GC 15-20%', 'gc20to25': 'Low GC 20-25%', 'gc25to30': 'Low GC 25-30%', 'gc65to70': 'High GC 65-70%', 'gc70to75': 'High GC 70-75%', 'gc75to80': 'High GC 75-80%',
def _generate_summary_flagged_regions_report(cnf, bcbio_structure, samples, mutations, key_or_target_genes):
    """Write one static HTML page of flagged (low/high coverage) regions per region type.

    For each region type ('exons', 'target') and each coverage type
    ('low', 'high') the per-sample flagged-region files are merged by gene,
    turned into an expandable ``PerRegionSampleReport`` and passed to
    ``create_section``.  The two resulting sections are rendered into
    ``flagged_<region_type>.html`` inside
    ``bcbio_structure.flagged_regions_dirpath``.

    :param cnf: configuration object, passed through to the helpers.
    :param bcbio_structure: only ``flagged_regions_dirpath`` is read.
    :param samples: samples to merge; ``name``, ``flagged_regions_dirpath``
        and ``flagged_tsv`` are read from each.
    :param mutations: indexable by sample name; each value is iterated and
        ``gene.name``, ``pos``, ``ref`` and ``alt`` are read from its items.
    :param key_or_target_genes: 'target' selects the plain 'genes'
        description, anything else the cancer-gene description; the value is
        also passed to the HTML template.

    NOTE(review): the nesting of several statements below was reconstructed
    from token order; the spots where more than one nesting is syntactically
    valid are marked -- confirm against the intended logic.
    """
    region_types = ['exons', 'target']
    coverage_types = ['low', 'high']
    flagged_regions_metrics = [
        Metric('Gene', min_width=50, max_width=70),
        Metric('Chr', with_heatmap=False, max_width=20, align='right'),
        Metric('Position', td_class='td_position', min_width=70, max_width=120),
        Metric('Ave depth', td_class='long_expanded_line right_aligned', max_width=100, with_heatmap=False),
        Metric('#HS', quality='Less is better', align='right', max_width=30),
        Metric('Hotspots & Deleterious', td_class='long_expanded_line', min_width=100, max_width=150),
        Metric('Found mutations', td_class='long_expanded_line', min_width=150, max_width=200),
        Metric('Samples', td_class='long_expanded_line', min_width=100, max_width=120),
        Metric('Possible reasons', td_class='long_expanded_line', max_width=120)
    ]
    flagged_regions_metric_storage = MetricStorage(
        sections=[ReportSection(metrics=flagged_regions_metrics)])
    flagged_regions_report_dirpath = bcbio_structure.flagged_regions_dirpath
    safe_mkdir(flagged_regions_report_dirpath)
    if key_or_target_genes == 'target':
        genes_description = 'genes'
    else:
        genes_description = 'genes that have been previously implicated in various cancers'

    for region_type in region_types:
        # Per region type: one section per coverage type and a running total.
        regions_dict = {}
        total_regions = 0
        info()
        info('Preparing report for ' + region_type)
        for coverage_type in coverage_types:
            # gene name -> list of merged Region objects (across all samples)
            regions_by_gene = {}
            for sample in samples:
                selected_regions_bed_fpath = join(
                    sample.flagged_regions_dirpath,
                    coverage_type + '_cov_' + region_type + '.bed')
                # Keyed by an indexable pair compared against (start, end)
                # below; stays empty when the BED file is missing.
                regions_by_reasons = {}
                if verify_file(selected_regions_bed_fpath, is_critical=False):
                    intersection_fpath = _intersect_with_tricky_regions(
                        cnf, selected_regions_bed_fpath, sample.name)
                    regions_by_reasons = _parse_intersection_with_tricky_regions(
                        cnf, intersection_fpath)

                # --- Pass 1: the sample's full flagged-regions TSV ---
                total_report_fpath = add_suffix(
                    add_suffix(sample.flagged_tsv, region_type), coverage_type)
                if verify_file(total_report_fpath, is_critical=False):
                    with open(total_report_fpath) as f:
                        for l in f:
                            l = l.strip()
                            # Skip blank lines and '#' comment/header lines.
                            if not l or l.startswith('#'):
                                continue
                            fs = l.split('\t')
                            # First ten tab-separated columns of the record.
                            (chrom, start, end, size, gene, strand, feature,
                             biotype, min_depth, avg_depth) = fs[:10]
                            start, end = int(start), int(end)
                            regions_by_gene.setdefault(gene, [])
                            # sample_name and avg_depth are parallel lists:
                            # one entry per sample the region was seen in.
                            cur_region = Region(sample_name=[sample.name],
                                                avg_depth=[avg_depth],
                                                gene_name=gene,
                                                strand=strand,
                                                feature=feature,
                                                biotype=biotype,
                                                chrom=chrom,
                                                start=start,
                                                end=end)
                            # Attach the reasons of an interval that fully
                            # contains this region (the last match wins).
                            for r in regions_by_reasons:
                                if r[0] <= start and end <= r[1]:
                                    cur_region.extra_fields = regions_by_reasons[r]
                            # NOTE(review): assumed to run for every region,
                            # not only for those matching a reason interval.
                            cur_region.missed_by_db = []
                            was_added = False
                            # A region lying inside an already stored region
                            # of the gene is merged into it, not added again.
                            for r in regions_by_gene[gene]:
                                if r.start <= cur_region.start <= r.end and r.start <= cur_region.end <= r.end:
                                    was_added = True
                                    if sample.name not in r.sample_name:
                                        r.sample_name.append(sample.name)
                                        r.avg_depth.append(avg_depth)
                            if not was_added:
                                regions_by_gene[gene].append(cur_region)

                # --- Pass 2: the sample's '.oncomine.tsv' hotspot file ---
                report_fpath = join(
                    sample.flagged_regions_dirpath,
                    coverage_type + '_cov_' + region_type + '.oncomine.tsv')
                if verify_file(report_fpath, is_critical=False):
                    with open(report_fpath) as f:
                        for l in f:
                            l = l.strip()
                            if not l or l.startswith('#'):
                                continue
                            fs = l.split('\t')
                            hotspots = []
                            (gene, chrom, start, end, strand, feature,
                             biotype, id_, num_hotspots) = fs[:9]
                            start, end = int(start), int(end)
                            # The tenth column holds the whitespace-separated
                            # hotspots; read only when the count is non-zero.
                            if int(num_hotspots) != 0:
                                hotspots = fs[9].split()
                            regions_by_gene.setdefault(gene, [])
                            # Used only for the containment test below; it is
                            # never appended to regions_by_gene here.
                            cur_region = Region(sample_name=[sample.name],
                                                gene_name=gene,
                                                strand=strand,
                                                feature=feature,
                                                biotype=biotype,
                                                chrom=chrom,
                                                start=start,
                                                end=end)
                            for r in regions_by_gene[gene]:
                                if r.start <= cur_region.start <= r.end and r.start <= cur_region.end <= r.end:
                                    if sample.name not in r.sample_name:
                                        r.sample_name.append(sample.name)
                                        # '.' marks "no depth for this sample".
                                        r.avg_depth.append('.')
                                    # NOTE(review): assumed to apply to every
                                    # containing region, whether or not the
                                    # sample was new -- confirm.
                                    new_hotspots = [
                                        hs for hs in hotspots
                                        if hs not in r.missed_by_db
                                    ]
                                    r.missed_by_db.extend(new_hotspots)

            # --- Build the report for this (region_type, coverage_type) ---
            flagged_regions_report = PerRegionSampleReport(
                name='Flagged regions',
                metric_storage=flagged_regions_metric_storage)
            num_regions = 0
            non_hs_class = ' no_hotspots'
            # Separator for the per-sample depth values; the literal is a
            # slash followed by an invisible character (copied verbatim).
            slash_with_zero_space = '/​'
            for gene in regions_by_gene.keys():
                if regions_by_gene[gene]:
                    num_regions += len(regions_by_gene[gene])
                    row_class = ' expandable_row collapsed'
                    # Genes with several regions get a summary row first.
                    if len(regions_by_gene[gene]) > 1:
                        reg = flagged_regions_report.add_row()
                        reg.class_ = ' expandable_gene_row collapsed'
                        chr = regions_by_gene[gene][0].chrom
                        num_hotspots = [
                            len(r.missed_by_db) for r in regions_by_gene[gene]
                        ]
                        all_samples = [
                            sample for r in regions_by_gene[gene]
                            for sample in r.sample_name
                        ]
                        # Order-preserving de-duplication: the comprehension
                        # appends to the empty list bound on the previous
                        # line (append() returns None, so "not ..." is True)
                        # and tests membership against that same list.
                        all_unique_samples = []
                        all_unique_samples = [
                            sample for sample in all_samples
                            if sample not in all_unique_samples
                            and not all_unique_samples.append(sample)
                        ]
                        all_tricky_regions = sorted(
                            set([
                                tricky_region for r in regions_by_gene[gene]
                                for tricky_region in r.extra_fields
                            ]))
                        # One list of numeric depths per unique sample.
                        all_depths = [[] for x in range(len(all_unique_samples))]
                        for r in regions_by_gene[gene]:
                            for sample_num, sample in enumerate(
                                    all_unique_samples):
                                if sample in r.sample_name:
                                    cur_sample_index = r.sample_name.index(
                                        sample)
                                    if r.avg_depth[cur_sample_index] != '.':
                                        all_depths[sample_num].append(
                                            float(
                                                r.avg_depth[cur_sample_index]))
                        # Mean depth per sample; 0 when the sample has no
                        # numeric depth at all.
                        avg_depth_per_samples = [
                            sum(all_depths[i]) / len(all_depths[i])
                            if len(all_depths[i]) > 0 else 0
                            for i in range(len(all_depths))
                        ]
                        reg.add_record('Gene', gene)
                        reg.add_record('Chr', chr.replace('chr', ''))
                        reg.add_record('#HS', sum(num_hotspots))
                        reg.add_record(
                            'Position',
                            str(len(regions_by_gene[gene])) + ' regions')
                        reg.add_record(
                            'Ave depth',
                            slash_with_zero_space.join([
                                format(depth, '.2f') if depth != '.' else '.'
                                for depth in avg_depth_per_samples
                            ]),
                            num=sum(avg_depth_per_samples) /
                            len(avg_depth_per_samples))
                        reg.add_record('Hotspots & Deleterious', '')
                        reg.add_record('Possible reasons',
                                       ', '.join(all_tricky_regions))
                        reg.add_record('Samples',
                                       ',\n'.join(all_unique_samples))
                        reg.add_record('Found mutations', '')
                        if sum(num_hotspots) == 0:
                            reg.class_ += non_hs_class
                        # NOTE(review): the nesting of the next three
                        # statements is ambiguous.  Here child rows of a
                        # multi-region gene are hidden and single-region rows
                        # are marked 'not_to_hide'.  The other valid reading
                        # puts the 'row_to_hide' line inside the
                        # "sum(num_hotspots) == 0" branch with the else bound
                        # to that if -- confirm.
                        row_class += ' row_to_hide row_hidden'
                    else:
                        row_class += ' not_to_hide'

                    # One row per region of the gene.
                    for r in regions_by_gene[gene]:
                        reg = flagged_regions_report.add_row()
                        reg.class_ = row_class
                        reg.add_record('Gene', r.gene_name)
                        reg.add_record('Chr', r.chrom.replace('chr', ''))
                        # Numeric depths only; '.' placeholders are dropped.
                        # NOTE(review): an empty result makes the division
                        # below raise ZeroDivisionError.
                        avg_depths = [
                            float(depth) for depth in r.avg_depth
                            if depth != '.'
                        ]
                        reg.add_record(
                            'Ave depth',
                            slash_with_zero_space.join([
                                format(depth, '.2f') if depth != '.' else depth
                                for depth in avg_depths
                            ]),
                            num=sum(avg_depths) / len(avg_depths))
                        reg.add_record(
                            'Position',
                            Metric.format_value(
                                r.start, human_readable=True, is_html=True) +
                            '-' + Metric.format_value(
                                r.end, human_readable=True, is_html=True))
                        reg.add_record('#HS', len(r.missed_by_db))
                        if len(r.missed_by_db) == 0:
                            reg.class_ += non_hs_class
                        # Hotspots are split on ':'; the first part is used
                        # as the position and the second is grouped per
                        # position.
                        uniq_hs_positions = sorted(
                            set([
                                hotspot.split(':')[0]
                                for hotspot in r.missed_by_db
                            ]))
                        hs_by_pos = {
                            pos: [
                                h.split(':')[1] for h in r.missed_by_db
                                if h.split(':')[0] == pos
                            ]
                            for pos in uniq_hs_positions
                        }
                        # One line per position: the formatted position
                        # wrapped by gray(), then the comma-joined changes
                        # with '/' replaced by the separator defined above.
                        hs_breakable = [
                            gray(
                                Metric.format_value(int(pos.replace(',', '')),
                                                    human_readable=True,
                                                    is_html=True)) + ': ' +
                            ','.join([
                                h.replace('/', slash_with_zero_space)
                                for h in hs_by_pos[pos]
                            ]) for pos in uniq_hs_positions
                        ]
                        reg.add_record('Hotspots & Deleterious',
                                       '\n'.join(hs_breakable))
                        reg.add_record('Possible reasons',
                                       ', '.join(r.extra_fields))
                        reg.add_record('Samples', ',\n'.join(r.sample_name))
                        # Mutations of the region's samples that fall in the
                        # same gene and inside [start, end].
                        found_mutations = []
                        for sample in samples:
                            if sample.name in r.sample_name:
                                for mut in mutations[sample.name]:
                                    if mut.gene.name == r.gene_name and r.start <= mut.pos <= r.end:
                                        found_mutations.append(
                                            gray(
                                                Metric.format_value(
                                                    mut.pos,
                                                    human_readable=True,
                                                    is_html=True)) + ':' +
                                            mut.ref + '>' + mut.alt + ' (' +
                                            sample.name + ')')
                        reg.add_record('Found mutations',
                                       '\n'.join(found_mutations))

            flagged_regions_report.expandable = True
            flagged_regions_report.unique = True
            regions_dict[coverage_type] = create_section(
                flagged_regions_report, num_regions, regions_by_gene.keys(),
                region_type)
            total_regions += num_regions

        # Both coverage types are done: render the page for this region type.
        flagged_report_fpath = join(flagged_regions_report_dirpath,
                                    'flagged_' + region_type + '.html')
        write_static_html_report(cnf, {
            'key_or_target': key_or_target_genes,
            'region_type': region_type,
            'genes_description': genes_description,
            'flagged_low': regions_dict['low'],
            'flagged_high': regions_dict['high'],
        },
                                 flagged_report_fpath,
                                 tmpl_fpath=join(
                                     dirname(abspath(__file__)),
                                     'template_flagged_regions.html'),
                                 extra_js_fpaths=[
                                     join(dirname(abspath(__file__)), 'static',
                                          'flagged_regions.js')
                                 ],
                                 extra_css_fpaths=[
                                     join(dirname(abspath(__file__)), 'static',
                                          'flagged_regions.css')
                                 ])
        #BaseReport.save_html(flagged_regions_report, cnf, flagged_report_fpath, caption='Flagged regions')
        info('')
        info('Flagged regions (total ' + str(total_regions) + ' ' +
             region_type + ') saved into:')
        info(' ' + flagged_report_fpath)
# Metric layout of the variant statistics report: a single 'basic' section.
# Positional Metric arguments are presumably (name, short_name, description)
# -- confirm against the Metric constructor.  Names are used as keys and are
# left untouched; only description typos were corrected.
metric_storage = MetricStorage(sections=[
    ReportSection(
        'basic',
        '',
        [
            # NOTE(review): this description ends with a dangling "with" and
            # looks truncated; the intended wording is not known.
            Metric('Total variants', 'Total', 'Total number of passed variants with'),
            Metric('SNPs', 'SNP', 'SNPs'),
            Metric('Insertions', 'Ins', 'Insertions'),
            Metric('Deletions', 'Del', 'Deletions'),
            Metric('Novel', 'Novel', 'Novel (not in dbSNP or Cosmic)'),
            Metric('Novel, %', '%', '% novel variants', unit='%'),
            # Metric('dbsnp_loci', 'Loci in dnSNP', 'Loci in dbSNP (just CHROM:POS matches, regardless if allele is the same)'),
            # Metric('dbsnp_loci_percent', '%', '% loci in dbSNP (just CHROM:POS matches, regardless if allele is the same)', unit='%'),
            Metric('In dbSNP', 'dbSNP', 'Variants in dbSNP'),
            Metric('In dbSNP, %', '%', '% variants in dbSNP', unit='%'),
            # Metric('cosmic_loci', 'Loci in Cosmic', 'Loci in Cosmic (just CHROM:POS matches, regardless if allele is the same)'),
            # Metric('cosmic_loci_percent', '%', '% loci in Cosmic (just CHROM:POS matches, regardless if allele is the same)', unit='%'),
            Metric('In Cosmic', 'Cosmic', 'Variants in Cosmic'),
            Metric('In Cosmic, %', '%', '% variants in Cosmic', unit='%'),
            # Metric('bases_per_variant', 'Bp/var', 'Reference bases per variant', quality='Less is better'),
            Metric('Het/hom', 'Het/hom', 'Heterozygosity to homozygosity ratio'),
            Metric(
                'Ti/tv', 'Ti/tv',
                'Transition (T<->C, A<->G) to transversion (A<->C, C<->G, G<->T, T<->A) ratio. Should be 2 to 3 or higher (depending on the species and region)'
            ),
            Metric('Total with rejected', 'Total with rejected', 'Total number of records in VCF, regardless FILTER column'),
        ])
])
# Metric layout with a general section (reference / regions size) and three
# table sections: 'depth_metrics', 'reads' and 'qualimap'.
# Positional Metric arguments are presumably (name, short_name, description)
# -- confirm against the Metric constructor.
metric_storage = MetricStorage(
    general_section=ReportSection('general_section', '', [
        Metric('Reference size', short_name='Reference size', common=True),
        # NOTE(review): the next two metrics share one short_name -- confirm
        # that this is intended.
        Metric('Regions size/percentage of reference (on target)', short_name='Regions size/percentage of reference', common=True),
        Metric('Regions size/percentage of reference (on target) %', short_name='Regions size/percentage of reference', common=True),
    ]),
    sections=[
        # ReportSection('basic_metrics', 'General', [
        #     Metric('Number of reads', 'Reads', 'Total number of reads'),
        #     Metric('Mapped reads', 'Mapped', 'Number of mapped reads'),
        #     Metric('Mapped reads %', 'Mapped %', 'Number of mapped reads'),
        #     Metric('Unmapped reads', 'Unmapped ', 'Number of unmapped reads', quality='Less is better'),
        #     Metric('Unmapped reads %', 'Unmapped %', 'Number of unmapped reads', quality='Less is better'),
        # ]),
        # ReportSection('on_off_metrics', 'ON/OFF target', [
        #     Metric('Mapped reads, only first in pair', 'Mapped, 1st', 'Number of mapped reads, only first in pair'),
        #     Metric('Mapped reads, only second in pair', 'Mapped, 2nd', 'Number of mapped reads, only second in pair'),
        #     Metric('Mapped reads, both in pair', 'Mapped, both', 'Number of mapped reads, both in pair'),
        #     Metric('Mapped reads, singletons', 'Mapped, single', 'Number of mapped reads, singletons'),
        #     Metric('Mapped reads, only first in pair (on target)', 'Mapped, 1st (on trg)', 'Number of mapped reads inside of regions, only first in pair'),
        #     Metric('Mapped reads, only second in pair (on target)', 'Mapped, 2nd (on trg)', 'Number of mapped reads inside of regions, only second in pair'),
        #     Metric('Mapped reads, both in pair (on target)', 'Mapped, both (on trg)', 'Number of mapped reads inside of regions, both in pair'),
        #     Metric('Mapped reads, singletons (on target)', 'Mapped, single (on trg)', 'Number of mapped reads inside of regions, singletons')
        # ]),
        ReportSection('depth_metrics', 'Target coverage depth', [
            Metric('Coverage Mean', 'Cov. mean', 'Coverage mean'),
            Metric('Coverage Mean (on target)', 'Cov. mean (on trg)', 'Coverage mean, inside of regions'),
            Metric('Coverage Standard Deviation', 'Cov. std. dev.', 'Coverage std. dev.', quality='Less is better'),
            Metric('Coverage Standard Deviation (on target)', 'Cov. std. dev. (on trg)', 'Coverage std. dev., inside of regions', quality='Less is better')
        ]),
        # Most metrics of this section are declared without a description.
        ReportSection('reads', 'Reads', [
            Metric(
                'Reference size',
                'Reference size',
            ),
            Metric('Number of reads', 'Reads', 'Total number of reads'),
            Metric(
                'Mapped reads',
                'Mapped',
            ),
            Metric(
                'Mapped reads %',
                'Mapped %',
            ),
            Metric(
                'Unmapped reads',
                'Unmapped',
            ),
            Metric(
                'Unmapped reads %',
                'Unmapped %',
            ),
            Metric(
                'Mapped reads (on target)',
                'Mapped (on trg)',
            ),
            Metric(
                'Mapped reads (on target) %',
                'Mapped % (on trg)',
            ),
            Metric(
                'Mapped paired reads',
                'Mapped paired reads',
            ),
            Metric(
                'Mapped paired reads %',
                'Mapped paired reads %',
            ),
            Metric(
                'Paired reads',
                'Paired reads',
            ),
            Metric(
                'Paired reads %',
                'Paired reads %',
            ),
            Metric(
                'Duplicated reads (flagged)',
                'Dup rate',
            ),
            Metric(
                'Duplicated reads (flagged) %',
                'Dup rate %',
            ),
            # NOTE(review): the on-target duplicate metrics reuse the short
            # names of the two metrics above -- confirm.
            Metric(
                'Duplicated reads (flagged) (on target)',
                'Dup rate',
            ),
            Metric(
                'Duplicated reads (flagged) (on target) %',
                'Dup rate %',
            ),
            Metric('Read min length', 'Min len', 'Read min length'),
            Metric('Read max length', 'Max len', 'Read max length'),
            Metric('Read mean length', 'Ave len', 'Read mean length'),
        ]),
        ReportSection(
            'qualimap',
            'Qualimap metrics',
            [
                Metric('Mean Mapping Quality (on target)', 'Mean MQ (on trg)', 'Mean mapping quality, inside of regions'),
                Metric('Mismatches (on target)', 'Mismatches (on trg)', 'Mismatches, inside of regions', quality='Less is better'),  # added in Qualimap v.2.0
                Metric('Insertions (on target)', 'Insertions (on trg)', 'Insertions, inside of regions', quality='Less is better'),
                Metric('Deletions (on target)', 'Deletions (on trg)', 'Deletions, inside of regions', quality='Less is better'),
                Metric('Homopolymer indels (on target)', 'Homopol indels (on trg)', 'Percentage of homopolymer indels, inside of regions', quality='Less is better'),
                # NOTE(review): the five metrics below are not "(on target)"
                # yet their descriptions still say "inside of regions"; this
                # looks copied from the block above -- confirm before
                # changing the strings.
                Metric('Mean Mapping Quality', 'Mean MQ', 'Mean mapping quality, inside of regions'),
                Metric('Mismatches', 'Mismatches', 'Mismatches, inside of regions', quality='Less is better'),  # added in Qualimap v.2.0
                Metric('Insertions', 'Insertions', 'Insertions, inside of regions', quality='Less is better'),
                Metric('Deletions', 'Deletions', 'Deletions, inside of regions', quality='Less is better'),
                Metric('Homopolymer indels', 'Homopol indels', 'Percentage of homopolymer indels, inside of regions', quality='Less is better'),
            ])
    ])
# Metric layout whose metric names all come from module-level constants
# (PRE_FASTQC_NAME, FASTQC_NAME, ...).  The general section and the single
# table section both list PRE_FASTQC_NAME, FASTQC_NAME and GENE_COUNTS_NAME.
metric_storage = MetricStorage(
    # General (report-wide) section.
    general_section=ReportSection(metrics=[
        Metric(PRE_FASTQC_NAME),
        Metric(FASTQC_NAME),
        Metric(EXAC_NAME),
        Metric(MUTATIONS_NAME),
        Metric(MUTATIONS_SINGLE_NAME),
        Metric(MUTATIONS_PAIRED_NAME),
        Metric(ABNORMAL_NAME),
        Metric(GENE_COUNTS_NAME),
        Metric(EXON_COUNTS_NAME),
        Metric(GENE_TPM_NAME),
        Metric(ISOFORM_TPM_NAME),
    ]),
    # Table section.
    sections=[
        ReportSection(metrics=[
            Metric(PRE_FASTQC_NAME),
            Metric(FASTQC_NAME),
            # Metric('BAM'),
            # Metric(MUTATIONS_NAME),
            Metric(
                GENDER,
                description=
                'If not defined, means that the target does not contain key male Y genes that we could check'
            ),
            Metric(CLINICAL_NAME),
            Metric(CNV_NAME),
            Metric(GENE_COUNTS_NAME),
            Metric(PHENOTYPE),
            Metric(NORM_MATCH),
        ])
    ])