def __mutations_section(self, mutations_report, experiment_by_key):
    """Collect the template values for the mutations section.

    Returns an empty dict when ``self.mutations_report`` is missing or has no
    rows. Otherwise the dict holds the rendered HTML table, per-experiment
    totals, the experiment headers and the plot data stored on ``self``.

    Note that the emptiness check reads ``self.mutations_report`` while the
    table is rendered from the ``mutations_report`` argument, and the
    experiment headers come from ``self.experiment_by_key`` while the totals
    come from the ``experiment_by_key`` argument.
    """
    section = dict()
    if not (self.mutations_report and self.mutations_report.rows):
        return section

    # if cnf.debug:
    #     mutations_report.regions = mutations_report.regions[::20]
    section['table'] = build_report_html(mutations_report, sortable=True)

    # One "<count> <label>" entry per experiment; the label is the second
    # element of the experiment key.
    variant_totals = []
    for key, experiment in experiment_by_key.items():
        variant_totals.append(
            Metric.format_value(experiment.total_variants, is_html=True) +
            ' ' + key[1])
    section['total_variants'] = ', '.join(variant_totals)

    # Identical "<count> <label>" strings are merged by the set.
    key_gene_totals = set()
    for key, experiment in experiment_by_key.items():
        key_gene_totals.add(
            Metric.format_value(
                len(experiment.key_gene_by_name_chrom.values()),
                is_html=True) + ' ' + key[1])
    section['total_key_genes'] = '/'.join(key_gene_totals)

    headers = []
    for key in self.experiment_by_key.keys():
        headers.append(dict(header=key[1], key=key[1].lower()))
    section['experiments'] = headers

    section['plot_data'] = self.mutations_plot_data
    section['substitutions_plot_data'] = self.substitutions_plot_data
    return section
def make_key_genes_cov_report(experiment_by_key):
    """Build the per-gene coverage report across several experiments.

    For every experiment three columns are declared (average depth, percent
    covered at the experiment's depth cutoff, CNV); one row is written per
    gene name, sorted by name.

    :param experiment_by_key: mapping of experiment key to experiment object.
        Keys are concatenated into metric names, so they must be strings.
        Each experiment is read for ``ave_depth``, ``depth_cutoff``,
        ``sample``, ``key`` and ``key_gene_by_name``.
    :return: the populated ``PerRegionSampleReport``.
    :raises IndexError: if ``experiment_by_key`` is empty (the report sample
        is taken from the first experiment).
    """
    info('Making key genes coverage report...')

    ms = [
        Metric('Gene'),
        Metric('Chr', with_heatmap=False, max_width=20, align='right'),
    ]
    for i, (k, e) in enumerate(experiment_by_key.items()):
        ms.extend([
            Metric(k + ' Ave depth',
                   short_name=k + '\nave depth',
                   med=e.ave_depth,
                   class_='shifted_column' if i == 0 else ''),
            Metric(k + ' % cov at {}x'.format(e.depth_cutoff),
                   short_name='% at {}x'.format(e.depth_cutoff),
                   unit='%',
                   med=1,
                   low_inner_fence=0.5,
                   low_outer_fence=0.1),
            # short name is hack for IE9 who doesn't have "text-align: left"
            # and tries to stick "CNV" to the previous col header
            Metric(k + ' CNV', short_name=' CNV'),
        ])

    clinical_cov_metric_storage = MetricStorage(
        sections=[ReportSection(metrics=ms)])
    # list(...) keeps this working when values() is a view (Python 3) and
    # still raises IndexError on an empty mapping, as before.
    first_experiment = list(experiment_by_key.values())[0]
    key_genes_report = PerRegionSampleReport(
        sample=first_experiment.sample,
        metric_storage=clinical_cov_metric_storage)

    # Writing records
    hits_by_gene_by_experiment = OrderedDefaultDict(OrderedDict)
    for e in experiment_by_key.values():
        for gene in e.key_gene_by_name.values():
            hits_by_gene_by_experiment[gene.name][e] = gene

    # Tuple-parameter lambdas were removed in Python 3; index the
    # (gene name, hits) pair instead.
    for gname, hit_by_experiment in sorted(
            hits_by_gene_by_experiment.items(), key=lambda item: item[0]):
        gene = next(
            (m for m in hit_by_experiment.values() if m is not None), None)

        row = key_genes_report.add_row()
        row.add_record('Gene', gene.name)
        row.add_record('Chr', gene.chrom.replace('chr', ''))

        for e, hit in hit_by_experiment.items():
            row.add_record(e.key + ' Ave depth', hit.ave_depth)
            m = clinical_cov_metric_storage.find_metric(
                e.key + ' % cov at {}x'.format(e.depth_cutoff))
            # Coverage at exactly this experiment's cutoff, or None when the
            # gene has no value recorded for it.
            row.add_record(m.name, hit.cov_by_threshs.get(e.depth_cutoff))
            if hit.seq2c_event and (hit.seq2c_event.is_amp() or
                                    hit.seq2c_event.is_del()):
                row.add_record(
                    e.key + ' CNV',
                    hit.seq2c_event.amp_del + ', ' + hit.seq2c_event.fragment)

    return key_genes_report
def _prep_comb_report(metric_storage, samples, shared_general_metrics,
                      shared_metrics):
    """Create the 'Combined' report skeleton covering all samples.

    The shared metric lists are copied (never mutated) and extended with
    per-sample metrics: an '<sample> ave depth' general metric and a
    ``DepthsMetric`` column per sample. The general records are then filled
    in, taking each sample's average depth from its own report via the
    'Average sample depth' metric looked up in ``metric_storage``.

    :return: the new ``PerRegionSampleReport``.
    """
    general_metrics = shared_general_metrics[:]
    general_metrics.append(Metric('For each sample'))
    general_metrics.extend(
        [Metric(sample.name + ' ave depth') for sample in samples])

    region_metrics = shared_metrics[:]
    region_metrics.extend([
        DepthsMetric(sample.name + ' hotspots depths/norm depths',
                     short_name=sample.name) for sample in samples
    ])

    storage = MetricStorage(
        general_section=ReportSection('general_section',
                                      metrics=general_metrics),
        sections=[ReportSection(metrics=region_metrics)])
    report = PerRegionSampleReport(sample='Combined', metric_storage=storage)

    sample_names = ', '.join([sample.name for sample in samples])
    report.add_record('Sample',
                      'contains values from all samples: ' + sample_names)
    report.add_record('For each sample',
                      'Depths and normalized depths for each hotspot.')

    depth_metric = metric_storage.find_metric('Average sample depth')
    for sample in samples:
        depth_record = BaseReport.find_record(sample.report.records,
                                              depth_metric.name)
        report.add_record(sample.name + ' ave depth', depth_record.value)

    return report
def create_section(report, num_regions, genes, region_type):
    """Return the template values for one flagged-regions section.

    An empty dict is returned when ``report`` has no rows. Otherwise the dict
    carries the rendered table plus the HTML-formatted region count, gene
    count (``len(genes)``) and region type.
    """
    if not report.rows:
        return dict()

    # if cnf.debug:
    #     mutations_report.regions = mutations_report.regions[::20]
    return {
        'table': build_report_html(report, sortable=True),
        'total_regions': Metric.format_value(num_regions, is_html=True),
        'total_key_genes': Metric.format_value(len(genes), is_html=True),
        'region_type': Metric.format_value(region_type, is_html=True),
    }
def find_other_occurences(row, mut_by_experiment, cur_group_num, samples_data,
                          parameters_info):
    """Add an 'Other occurrences' record to ``row`` and return the row.

    When ``cur_group_num`` is truthy, every experiment whose group number
    differs from it contributes (a) its group number, bucketed under the
    tuple of shortened sample parameters, and (b) a tooltip line linking to
    the sample's clinical report with the mutation frequency and depth.
    The record text is '<number of distinct groups><joined prefixes>' per
    bucket, comma-separated, wrapped with the tooltips sorted by sample name.

    :param parameters_info: ordered mapping whose i-th value exposes a
        ``prefixes`` dict keyed by lower-cased parameter value; position i
        corresponds to the i-th combined sample parameter.
    """
    num_by_samples = defaultdict(set)
    tooltips = []

    if cur_group_num:
        # dict.items() is not indexable on Python 3; materialise the values
        # once instead of rebuilding the list for every parameter.
        infos_by_position = list(parameters_info.values())

        for e, m in mut_by_experiment.items():
            group_num = get_group_num(e.key)
            if group_num == cur_group_num:
                continue

            sample_parameters = get_sample_info(e.sample.name,
                                                e.sample.dirpath, samples_data)
            sample_parameters = remove_parameters_to_combine(
                sample_parameters)
            short_parameters = [
                infos_by_position[i].prefixes[p.lower()]
                for i, p in enumerate(sample_parameters)
            ]
            num_by_samples[tuple(short_parameters)].add(group_num)

            report_link = '<a href="' + basename(
                e.sample.clinical_html
            ) + '" target="_blank">' + e.sample.name + '</a>'
            freq = Metric.format_value(m.freq, is_html=True, unit='%')
            tooltip = report_link + ': ' + str(freq) + ' ' + str(
                m.depth) + '<br>'
            tooltips.append((e.sample.name, tooltip))

    # Sort by sample name, then keep only the tooltip text.
    tooltips = [tooltip[1] for tooltip in sorted(tooltips)]

    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    other_occurences = ', '.join(
        [str(len(v)) + ''.join(k) for k, v in num_by_samples.items()])
    other_occurences = add_tooltip(other_occurences, ''.join(tooltips))
    row.add_record('Other occurrences', other_occurences)
    return row
def format(self, value, human_readable=True):
    """Render an iterable of variants as space-separated 'pos:ref/alt' items.

    Each variant is read for ``pos``, ``ref`` and ``alt``. The position is
    always formatted with ``human_readable=True``; the ``human_readable``
    argument itself is accepted but not consulted.
    """
    rendered = []
    for variant in value:
        pos_text = Metric.format_value(int(variant.pos), human_readable=True)
        rendered.append('{pos}:{var.ref}/{var.alt}'.format(pos=pos_text,
                                                           var=variant))
    return ' '.join(rendered)
def format(self, value, human_readable=True):
    """Render (depth, norm_depth) pairs as space-separated 'depth/norm' items.

    Depths are converted to ``int`` and normalized depths to ``float`` before
    formatting; a ``None`` component is passed to ``Metric.format_value``
    unchanged. Formatting always uses ``human_readable=True``; the
    ``human_readable`` argument itself is accepted but not consulted.
    """

    def _render(number):
        return Metric.format_value(number, human_readable=True)

    pieces = []
    for depth, norm_depth in value:
        if depth is None:
            depth_text = _render(None)
        else:
            depth_text = _render(int(depth))
        if norm_depth is None:
            norm_text = _render(None)
        else:
            norm_text = _render(float(norm_depth))
        pieces.append('{depth}/{norm_depth}'.format(depth=depth_text,
                                                    norm_depth=norm_text))
    return ' '.join(pieces)
def make_expression_heatmap(bcbio_structure, gene_counts):
    """Build the expandable expression heatmap report.

    One column is declared per sample in ``bcbio_structure.samples``. Genes
    are processed in sorted key order. When the first record of a gene is a
    hidden row, a summary row is emitted first holding, per sample, the sum
    of that sample's counts over all records of the gene. Then one row is
    written per record, classed as hidden or as an expandable gene row.

    :param gene_counts: mapping of gene key to a non-empty list of records;
        each record is read for ``is_hidden_row``, ``gene_name``, ``name``
        and ``counts`` (a mapping of sample name to count).
    :return: the populated ``PerRegionSampleReport``.
    """
    samples_names = [sample.name for sample in bcbio_structure.samples]
    # Set copy for the membership test in the inner loop; the list keeps the
    # column order.
    known_samples = set(samples_names)

    metrics = [Metric('Gene')] + [
        Metric(sample_name, max_width=40) for sample_name in samples_names
    ]
    metric_storage = MetricStorage(
        sections=[ReportSection(metrics=metrics, name='samples')])
    report = PerRegionSampleReport(metric_storage=metric_storage,
                                   expandable=True,
                                   unique=True,
                                   heatmap_by_rows=True,
                                   keep_order=True,
                                   large_table=True,
                                   vertical_sample_names=True)

    # Writing records
    for gene in sorted(gene_counts):
        records = gene_counts[gene]
        first_record = records[0]
        if first_record.is_hidden_row:
            row = report.add_row()
            row.add_record('Gene', first_record.gene_name)
            for sample in samples_names:
                row.add_record(
                    sample, sum(record.counts[sample] for record in records))
            row.class_ = ' expandable_gene_row collapsed'

        for record in records:
            row = report.add_row()
            if record.is_hidden_row:
                row_class = ' row_to_hide row_hidden'
            else:
                row_class = ' expandable_gene_row collapsed'
            row.add_record('Gene', record.name)
            # .items() works on both Python 2 and 3 (iteritems is 2 only).
            for sample, count in record.counts.items():
                if sample in known_samples:
                    row.add_record(sample, count)
            row.class_ = row_class

    return report
def _mutations_records(general_section, bcbio_structure, base_dirpath):
    """Create link records for the mutation files and the Seq2C file.

    The mutation file paths are derived from the 'vardict' (or, failing
    that, 'vardict-java') caller name inside ``bcbio_structure.date_dirpath``.
    For every path that ``verify_file`` accepts, and for the Seq2C file when
    it exists, a common metric is registered on ``general_section`` and a
    record linking to the file (relative to ``base_dirpath``) is collected.

    :return: list of the created records.
    """
    records = []

    def _register(metric_name, fpath):
        # Build the record before touching general_section, as the records
        # list must only grow once the metric is registered.
        metric = Metric(metric_name, common=True)
        rec = Record(metric=metric,
                     value=basename(fpath),
                     url=relpath(fpath, base_dirpath))
        general_section.add_metric(metric)
        records.append(rec)

    callers = bcbio_structure.variant_callers
    caller = callers.get('vardict') or callers.get('vardict-java')

    base_fname = variant_filtering.mut_fname_template.format(
        caller_name=caller.name)
    base_fpath = join(bcbio_structure.date_dirpath, base_fname)

    all_fpath = add_suffix(base_fpath, variant_filtering.mut_pass_suffix)
    single_fpath = add_suffix(
        add_suffix(base_fpath, variant_filtering.mut_single_suffix),
        variant_filtering.mut_pass_suffix)
    paired_fpath = add_suffix(
        add_suffix(base_fpath, variant_filtering.mut_paired_suffix),
        variant_filtering.mut_pass_suffix)

    candidates = [
        (verify_file(all_fpath, silent=True), MUTATIONS_NAME),
        (verify_file(single_fpath, silent=True), MUTATIONS_SINGLE_NAME),
        (verify_file(paired_fpath, silent=True), MUTATIONS_PAIRED_NAME),
    ]
    for verified_fpath, metric_name in candidates:
        if verified_fpath:
            _register(metric_name, verified_fpath)

    seq2c_fpath = bcbio_structure.seq2c_fpath
    if seq2c_fpath and isfile(seq2c_fpath):
        _register(CNV_NAME, seq2c_fpath)

    return records
def __repr__(self):
    """Return '<position> <ref>/<alt> <first element of cls>'.

    The position is converted to ``int`` and formatted through
    ``Metric.format_value`` with ``human_readable=True``.
    """
    pos_text = Metric.format_value(int(self.pos), human_readable=True)
    return '{pos} {var.ref}/{var.alt} {var.cls[0]}'.format(pos=pos_text,
                                                          var=self)
for var in variants) class DepthsMetric(Metric): def format(self, value, human_readable=True): depth_tuples = value fmt = lambda dp: Metric.format_value(dp, human_readable=True) return ' '.join('{depth}/{norm_depth}'.format( depth=fmt(int(depth) if depth is not None else None), norm_depth=fmt( float(norm_depth) if norm_depth is not None else None)) for (depth, norm_depth) in depth_tuples) shared_metrics = [ Metric('Gene'), Metric('Chr'), Metric('Start'), Metric('End'), Metric('Strand'), Metric('Feature'), Metric('Biotype'), Metric('ID'), Metric('Hotspots num', short_name='#HS'), VariantsMetric('Hotspots list', short_name='Hotspots') ] shared_general_metrics = [Metric('Sample', short_name='Sample', common=True)] single_report_metric_storage = MetricStorage( general_section=ReportSection(
def _generate_summary_flagged_regions_report(cnf, bcbio_structure, samples,
                                             mutations, key_or_target_genes):
    """Write one static HTML flagged-regions report per region type.

    For each region type ('exons', 'target') and each coverage type ('low',
    'high') the per-sample flagged-region files are merged by gene into a
    ``PerRegionSampleReport``; the two coverage sections are then rendered
    into ``flagged_<region_type>.html`` under
    ``bcbio_structure.flagged_regions_dirpath``.

    :param cnf: passed through to the helper functions and the HTML writer.
    :param bcbio_structure: read for ``flagged_regions_dirpath``.
    :param samples: iterable of samples; each is read for ``name``,
        ``flagged_regions_dirpath`` and ``flagged_tsv``.
    :param mutations: mapping of sample name to mutations; each mutation is
        read for ``gene.name``, ``pos``, ``ref`` and ``alt``.
    :param key_or_target_genes: 'target' selects the short genes description;
        any other value selects the cancer-genes description.
    """
    region_types = ['exons', 'target']
    coverage_types = ['low', 'high']

    flagged_regions_metrics = [
        Metric('Gene', min_width=50, max_width=70),
        Metric('Chr', with_heatmap=False, max_width=20, align='right'),
        Metric('Position', td_class='td_position', min_width=70,
               max_width=120),
        Metric('Ave depth',
               td_class='long_expanded_line right_aligned',
               max_width=100,
               with_heatmap=False),
        Metric('#HS', quality='Less is better', align='right', max_width=30),
        Metric('Hotspots & Deleterious',
               td_class='long_expanded_line',
               min_width=100,
               max_width=150),
        Metric('Found mutations',
               td_class='long_expanded_line',
               min_width=150,
               max_width=200),
        Metric('Samples',
               td_class='long_expanded_line',
               min_width=100,
               max_width=120),
        Metric('Possible reasons',
               td_class='long_expanded_line',
               max_width=120)
    ]
    flagged_regions_metric_storage = MetricStorage(
        sections=[ReportSection(metrics=flagged_regions_metrics)])
    flagged_regions_report_dirpath = bcbio_structure.flagged_regions_dirpath
    safe_mkdir(flagged_regions_report_dirpath)
    if key_or_target_genes == 'target':
        genes_description = 'genes'
    else:
        genes_description = 'genes that have been previously implicated in various cancers'

    for region_type in region_types:
        # Rendered section per coverage type, filled below and read back as
        # regions_dict['low'] / regions_dict['high'] when writing the HTML.
        regions_dict = {}
        total_regions = 0
        info()
        info('Preparing report for ' + region_type)
        for coverage_type in coverage_types:
            # gene name -> list of merged Region objects (across all samples)
            regions_by_gene = {}
            for sample in samples:
                selected_regions_bed_fpath = join(
                    sample.flagged_regions_dirpath,
                    coverage_type + '_cov_' + region_type + '.bed')
                # Keys are indexed as r[0] / r[1] and compared with region
                # start / end below; values become Region.extra_fields.
                regions_by_reasons = {}
                if verify_file(selected_regions_bed_fpath, is_critical=False):
                    intersection_fpath = _intersect_with_tricky_regions(
                        cnf, selected_regions_bed_fpath, sample.name)
                    regions_by_reasons = _parse_intersection_with_tricky_regions(
                        cnf, intersection_fpath)

                # First pass: the per-sample flagged TSV creates the regions.
                total_report_fpath = add_suffix(
                    add_suffix(sample.flagged_tsv, region_type),
                    coverage_type)
                if verify_file(total_report_fpath, is_critical=False):
                    with open(total_report_fpath) as f:
                        for l in f:
                            l = l.strip()
                            if not l or l.startswith('#'):
                                continue
                            fs = l.split('\t')
                            # Only the first ten tab-separated columns are
                            # used; size and min_depth are unpacked but not
                            # read afterwards.
                            (chrom, start, end, size, gene, strand, feature,
                             biotype, min_depth, avg_depth) = fs[:10]
                            start, end = int(start), int(end)
                            regions_by_gene.setdefault(gene, [])
                            cur_region = Region(sample_name=[sample.name],
                                                avg_depth=[avg_depth],
                                                gene_name=gene,
                                                strand=strand,
                                                feature=feature,
                                                biotype=biotype,
                                                chrom=chrom,
                                                start=start,
                                                end=end)
                            # Attach the reasons of any tricky interval that
                            # fully contains this region; a later match
                            # overwrites an earlier one.
                            for r in regions_by_reasons:
                                if r[0] <= start and end <= r[1]:
                                    cur_region.extra_fields = regions_by_reasons[
                                        r]
                            cur_region.missed_by_db = []
                            was_added = False
                            # A region lying entirely inside an already known
                            # region of the same gene is folded into it: only
                            # the sample name and its depth are recorded.
                            for r in regions_by_gene[gene]:
                                if r.start <= cur_region.start <= r.end and r.start <= cur_region.end <= r.end:
                                    was_added = True
                                    if sample.name not in r.sample_name:
                                        r.sample_name.append(sample.name)
                                        r.avg_depth.append(avg_depth)
                            if not was_added:
                                regions_by_gene[gene].append(cur_region)

                # Second pass: the '.oncomine.tsv' file only annotates
                # regions that already exist with their hotspots; it never
                # adds a new region to regions_by_gene.
                report_fpath = join(
                    sample.flagged_regions_dirpath,
                    coverage_type + '_cov_' + region_type + '.oncomine.tsv')
                if verify_file(report_fpath, is_critical=False):
                    with open(report_fpath) as f:
                        for l in f:
                            l = l.strip()
                            if not l or l.startswith('#'):
                                continue
                            fs = l.split('\t')
                            hotspots = []
                            (gene, chrom, start, end, strand, feature,
                             biotype, id_, num_hotspots) = fs[:9]
                            start, end = int(start), int(end)
                            # The tenth column holds whitespace-separated
                            # hotspots and is read only for a non-zero count.
                            if int(num_hotspots) != 0:
                                hotspots = fs[9].split()

                            regions_by_gene.setdefault(gene, [])
                            cur_region = Region(sample_name=[sample.name],
                                                gene_name=gene,
                                                strand=strand,
                                                feature=feature,
                                                biotype=biotype,
                                                chrom=chrom,
                                                start=start,
                                                end=end)
                            for r in regions_by_gene[gene]:
                                if r.start <= cur_region.start <= r.end and r.start <= cur_region.end <= r.end:
                                    if sample.name not in r.sample_name:
                                        r.sample_name.append(sample.name)
                                        # '.' marks "no depth known" for this
                                        # sample; it is filtered out wherever
                                        # depths are averaged below.
                                        r.avg_depth.append('.')
                                    new_hotspots = [
                                        hs for hs in hotspots
                                        if hs not in r.missed_by_db
                                    ]
                                    r.missed_by_db.extend(new_hotspots)

            flagged_regions_report = PerRegionSampleReport(
                name='Flagged regions',
                metric_storage=flagged_regions_metric_storage)
            num_regions = 0
            non_hs_class = ' no_hotspots'
            # '/' followed by a zero-width space (U+200B), per the variable
            # name; used as the separator in the 'Ave depth' and hotspot
            # cells.
            slash_with_zero_space = '/​'
            for gene in regions_by_gene.keys():
                if regions_by_gene[gene]:
                    num_regions += len(regions_by_gene[gene])
                    row_class = ' expandable_row collapsed'
                    # Genes with several regions get a summary row first;
                    # their per-region rows are then marked as hidden.
                    if len(regions_by_gene[gene]) > 1:
                        reg = flagged_regions_report.add_row()
                        reg.class_ = ' expandable_gene_row collapsed'
                        chr = regions_by_gene[gene][0].chrom
                        num_hotspots = [
                            len(r.missed_by_db) for r in regions_by_gene[gene]
                        ]
                        all_samples = [
                            sample for r in regions_by_gene[gene]
                            for sample in r.sample_name
                        ]
                        # Order-preserving de-duplication: the comprehension
                        # reads the name while it is still bound to the empty
                        # list created here, and list.append() returns None,
                        # so `not ...append(sample)` is always true.
                        all_unique_samples = []
                        all_unique_samples = [
                            sample for sample in all_samples
                            if sample not in all_unique_samples
                            and not all_unique_samples.append(sample)
                        ]
                        all_tricky_regions = sorted(
                            set([
                                tricky_region for r in regions_by_gene[gene]
                                for tricky_region in r.extra_fields
                            ]))
                        # One list of numeric depths per unique sample,
                        # gathered over all regions of the gene.
                        all_depths = [[] for x in range(len(all_unique_samples))]
                        for r in regions_by_gene[gene]:
                            for sample_num, sample in enumerate(
                                    all_unique_samples):
                                if sample in r.sample_name:
                                    cur_sample_index = r.sample_name.index(
                                        sample)
                                    if r.avg_depth[cur_sample_index] != '.':
                                        all_depths[sample_num].append(
                                            float(
                                                r.avg_depth[cur_sample_index]))
                        # Mean depth per sample; 0 when the sample has no
                        # numeric depth at all.
                        avg_depth_per_samples = [
                            sum(all_depths[i]) / len(all_depths[i])
                            if len(all_depths[i]) > 0 else 0
                            for i in range(len(all_depths))
                        ]
                        reg.add_record('Gene', gene)
                        reg.add_record('Chr', chr.replace('chr', ''))
                        reg.add_record('#HS', sum(num_hotspots))
                        reg.add_record(
                            'Position',
                            str(len(regions_by_gene[gene])) + ' regions')
                        # NOTE(review): avg_depth_per_samples holds numbers
                        # only, so the `depth != '.'` test looks always true
                        # here - confirm before simplifying.
                        reg.add_record(
                            'Ave depth',
                            slash_with_zero_space.join([
                                format(depth, '.2f') if depth != '.' else '.'
                                for depth in avg_depth_per_samples
                            ]),
                            num=sum(avg_depth_per_samples) /
                            len(avg_depth_per_samples))
                        reg.add_record('Hotspots & Deleterious', '')
                        reg.add_record('Possible reasons',
                                       ', '.join(all_tricky_regions))
                        reg.add_record('Samples',
                                       ',\n'.join(all_unique_samples))
                        reg.add_record('Found mutations', '')
                        if sum(num_hotspots) == 0:
                            reg.class_ += non_hs_class
                        row_class += ' row_to_hide row_hidden'
                    else:
                        row_class += ' not_to_hide'

                    # One row per region of the gene.
                    for r in regions_by_gene[gene]:
                        reg = flagged_regions_report.add_row()
                        reg.class_ = row_class
                        reg.add_record('Gene', r.gene_name)
                        reg.add_record('Chr', r.chrom.replace('chr', ''))
                        avg_depths = [
                            float(depth) for depth in r.avg_depth
                            if depth != '.'
                        ]
                        # NOTE(review): avg_depths is already filtered to
                        # floats, so the `else depth` branch looks
                        # unreachable - confirm. The division assumes at
                        # least one numeric depth per region.
                        reg.add_record(
                            'Ave depth',
                            slash_with_zero_space.join([
                                format(depth, '.2f') if depth != '.' else depth
                                for depth in avg_depths
                            ]),
                            num=sum(avg_depths) / len(avg_depths))
                        reg.add_record(
                            'Position',
                            Metric.format_value(
                                r.start, human_readable=True, is_html=True) +
                            '-' + Metric.format_value(
                                r.end, human_readable=True, is_html=True))
                        reg.add_record('#HS', len(r.missed_by_db))
                        if len(r.missed_by_db) == 0:
                            reg.class_ += non_hs_class
                        # Hotspot strings are split on ':' into a position
                        # part and a change part, then grouped by position.
                        uniq_hs_positions = sorted(
                            set([
                                hotspot.split(':')[0]
                                for hotspot in r.missed_by_db
                            ]))
                        hs_by_pos = {
                            pos: [
                                h.split(':')[1] for h in r.missed_by_db
                                if h.split(':')[0] == pos
                            ]
                            for pos in uniq_hs_positions
                        }
                        # Commas are stripped from the position text before
                        # int() and the number is re-formatted for HTML.
                        hs_breakable = [
                            gray(
                                Metric.format_value(int(pos.replace(',', '')),
                                                    human_readable=True,
                                                    is_html=True)) + ': ' +
                            ','.join([
                                h.replace('/', slash_with_zero_space)
                                for h in hs_by_pos[pos]
                            ]) for pos in uniq_hs_positions
                        ]
                        reg.add_record('Hotspots & Deleterious',
                                       '\n'.join(hs_breakable))
                        reg.add_record('Possible reasons',
                                       ', '.join(r.extra_fields))
                        reg.add_record('Samples', ',\n'.join(r.sample_name))
                        # Mutations of the region's samples that belong to
                        # the same gene and fall within [start, end].
                        found_mutations = []
                        for sample in samples:
                            if sample.name in r.sample_name:
                                for mut in mutations[sample.name]:
                                    if mut.gene.name == r.gene_name and r.start <= mut.pos <= r.end:
                                        found_mutations.append(
                                            gray(
                                                Metric.format_value(
                                                    mut.pos,
                                                    human_readable=True,
                                                    is_html=True)) + ':' +
                                            mut.ref + '>' + mut.alt + ' (' +
                                            sample.name + ')')
                        reg.add_record('Found mutations',
                                       '\n'.join(found_mutations))

            flagged_regions_report.expandable = True
            flagged_regions_report.unique = True
            regions_dict[coverage_type] = create_section(
                flagged_regions_report, num_regions, regions_by_gene.keys(),
                region_type)
            total_regions += num_regions

        flagged_report_fpath = join(flagged_regions_report_dirpath,
                                    'flagged_' + region_type + '.html')
        # Template, JS and CSS are resolved relative to this module's file.
        write_static_html_report(cnf, {
            'key_or_target': key_or_target_genes,
            'region_type': region_type,
            'genes_description': genes_description,
            'flagged_low': regions_dict['low'],
            'flagged_high': regions_dict['high'],
        },
                                 flagged_report_fpath,
                                 tmpl_fpath=join(
                                     dirname(abspath(__file__)),
                                     'template_flagged_regions.html'),
                                 extra_js_fpaths=[
                                     join(dirname(abspath(__file__)), 'static',
                                          'flagged_regions.js')
                                 ],
                                 extra_css_fpaths=[
                                     join(dirname(abspath(__file__)), 'static',
                                          'flagged_regions.css')
                                 ])
        #BaseReport.save_html(flagged_regions_report, cnf, flagged_report_fpath, caption='Flagged regions')
        info('')
        info('Flagged regions (total ' + str(total_regions) + ' ' +
             region_type + ') saved into:')
        info(' ' + flagged_report_fpath)
from source.variants import vcf_parser import source from source.logger import step_greetings, warn from source.file_utils import open_gzipsafe from source.reporting.reporting import Metric, MetricStorage, ReportSection, SampleReport from source.utils import get_db_path from source.variants.vcf_processing import get_sample_column_index import source.variants.vcf_processing as vcf_processing metric_storage = MetricStorage(sections=[ ReportSection( 'basic', '', [ Metric('Total variants', 'Total', 'Total number of passed variants with'), Metric('SNPs', 'SNP', 'SNPs'), Metric('Insertions', 'Ins', 'Insertions'), Metric('Deletions', 'Del', 'Deletions'), Metric('Novel', 'Novel', 'Novel (not in dbSNP or Cosmic'), Metric('Novel, %', '%', '% novel varinats', unit='%'), # Metric('dbsnp_loci', 'Loci in dnSNP', 'Loci in dbSNP (just CHROM:POS matches, regardless if allele is the same)'), # Metric('dbsnp_loci_percent', '%', '% loci in dbSNP (just CHROM:POS matches, regardless if allele is the same)', unit='%'), Metric('In dbSNP', 'dbSNP', 'Variants in dbSNP'), Metric('In dbSNP, %', '%', '% variants in dbSNP', unit='%'), # Metric('cosmic_loci', 'Loci in Cosmic', 'Loci in Cosmic (just CHROM:POS matches, regardless if allele is the same)'), # Metric('cosmic_loci_percent', '%', '% loci in Cosmic (just CHROM:POS matches, regardless if allele is the same)', unit='%'), Metric('In Cosmic', 'Cosmic', 'Variants in Cosmic'), Metric('In Cosmic, %', '%', '% variants in Cosmic', unit='%'), # Metric('bases_per_variant', 'Bp/var', 'Reference bases per variant', quality='Less is better'), Metric('Het/hom', 'Het/hom', 'Heterozygosity to homozygosity ratio'
from source.reporting.reporting import Metric, Record, MetricStorage, ReportSection metric_storage = MetricStorage( general_section=ReportSection('general_section', '', [ Metric('Reference size', short_name='Reference size', common=True), Metric('Regions size/percentage of reference (on target)', short_name='Regions size/percentage of reference', common=True), Metric('Regions size/percentage of reference (on target) %', short_name='Regions size/percentage of reference', common=True), ]), sections=[ # ReportSection('basic_metrics', 'General', [ # Metric('Number of reads', 'Reads', 'Total number of reads'), # Metric('Mapped reads', 'Mapped', 'Number of mapped reads'), # Metric('Mapped reads %', 'Mapped %', 'Number of mapped reads'), # Metric('Unmapped reads', 'Unmapped ', 'Number of unmapped reads', quality='Less is better'), # Metric('Unmapped reads %', 'Unmapped %', 'Number of unmapped reads', quality='Less is better'), # ]), # ReportSection('on_off_metrics', 'ON/OFF target', [ # Metric('Mapped reads, only first in pair', 'Mapped, 1st', 'Number of mapped reads, only first in pair'), # Metric('Mapped reads, only second in pair', 'Mapped, 2nd', 'Number of mapped reads, only second in pair'), # Metric('Mapped reads, both in pair', 'Mapped, both', 'Number of mapped reads, both in pair'), # Metric('Mapped reads, singletons', 'Mapped, single', 'Number of mapped reads, singletons'), # Metric('Mapped reads, only first in pair (on target)', 'Mapped, 1st (on trg)', 'Number of mapped reads inside of regions, only first in pair'), # Metric('Mapped reads, only second in pair (on target)', 'Mapped, 2nd (on trg)', 'Number of mapped reads inside of regions, only second in pair'), # Metric('Mapped reads, both in pair (on target)', 'Mapped, both (on trg)', 'Number of mapped reads inside of regions, both in pair'), # Metric('Mapped reads, singletons (on target)', 'Mapped, single (on trg)', 'Number of mapped reads inside of regions, singletons')
GENE_TPM_NAME = 'Gene TPM' ISOFORM_TPM_NAME = 'Isoform TPM' mutation_names = [ MUTATIONS_NAME, MUTATIONS_SINGLE_NAME, MUTATIONS_PAIRED_NAME, CNV_NAME ] expression_names = [ GENE_COUNTS_NAME, EXON_COUNTS_NAME, GENE_TPM_NAME, ISOFORM_TPM_NAME, ] metric_storage = MetricStorage( general_section=ReportSection(metrics=[ Metric(PRE_FASTQC_NAME), Metric(FASTQC_NAME), Metric(EXAC_NAME), Metric(MUTATIONS_NAME), Metric(MUTATIONS_SINGLE_NAME), Metric(MUTATIONS_PAIRED_NAME), Metric(ABNORMAL_NAME), Metric(GENE_COUNTS_NAME), Metric(EXON_COUNTS_NAME), Metric(GENE_TPM_NAME), Metric(ISOFORM_TPM_NAME), ]), sections=[ ReportSection(metrics=[ Metric(PRE_FASTQC_NAME), Metric(FASTQC_NAME),