def main():
    info(' '.join(sys.argv))
    info()
    cnf, bcbio_structure = bcbio_summary_script_proc_params(
            'expression', BCBioStructure.expression_dir)

    step_greetings('Gene expression heatmaps summary for all samples')
    report_caption_names = ['Gene counts', 'Exon counts', 'Gene TPM', 'Isoform TPM']
    genes_dict, transcripts_dict = _get_gene_transcripts_id(cnf)
    for counts_fname, report_caption_name in zip(bcbio_structure.counts_names, report_caption_names):
        counts_fpath = join(bcbio_structure.expression_dirpath, counts_fname)
        if not verify_file(counts_fpath, silent=True):
            raw_counts_fpath = join(bcbio_structure.expression_dirpath, 'raw', 'combined.' + counts_fname.replace('.tsv', ''))
            info('Annotating ' + report_caption_name + ' from ' + raw_counts_fpath)
            annotate_gene_counts(cnf, raw_counts_fpath, counts_fpath, genes_dict)
        verify_file(counts_fpath, is_critical=True, description=counts_fname)

        isoforms_found = counts_fname == 'isoform.sf.tpm'
        used_dict = transcripts_dict if isoforms_found else genes_dict
        report_fpath = join(safe_mkdir(join(bcbio_structure.expression_dirpath, 'html')),
                            counts_fname.replace('.tsv', '') + '.html')

        make_gene_expression_heatmaps(cnf, bcbio_structure, counts_fpath, used_dict, report_fpath,
                                      report_caption_name, keep_gene_names=isoforms_found)
    info('Done')
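# Note on the loop above: zip pairs the two lists positionally, so
# bcbio_structure.counts_names is assumed to follow the same ordering as
# report_caption_names. Illustrative pairing:
#   zip(['gene.counts.tsv', 'isoform.sf.tpm'], ['Gene counts', 'Isoform TPM'])
#   -> [('gene.counts.tsv', 'Gene counts'), ('isoform.sf.tpm', 'Isoform TPM')]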
Example #2
def _rename_fields(cnf, inp_tsv_fpath, field_map):
    if cnf.get('keep_intermediate'):
        step_greetings('Renaming fields.')

    with open(inp_tsv_fpath) as f:
        first_line = f.readline()
    fields = first_line.split()
    new_fields = [field_map.get(f) or f for f in fields]
    new_first_line = '\t'.join(new_fields)

    if cnf.get('keep_intermediate'):
        out_tsv_fpath = intermediate_fname(cnf, inp_tsv_fpath, 'renamed')
    else:
        out_tsv_fpath = inp_tsv_fpath

    with file_transaction(cnf.work_dir, out_tsv_fpath) as tx_out_fpath:
        with open(tx_out_fpath, 'w') as out:
            out.write(new_first_line + '\n')
            with open(inp_tsv_fpath) as f:
                for i, l in enumerate(f):
                    if i >= 1:
                        out.write(l)

    if not cnf.get('keep_intermediate'):
        shutil.move(out_tsv_fpath, inp_tsv_fpath)
        return inp_tsv_fpath
    else:
        return out_tsv_fpath
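# The header rewrite at the core of _rename_fields, shown standalone (the
# mapping and field names below are illustrative, not from the original
# source): fields present in the map are renamed, the rest pass through.
field_map = {'Gene': 'gene_name', 'Chr': 'chrom'}
fields = 'Chr Start End Gene'.split()
new_header = '\t'.join(field_map.get(f) or f for f in fields)
# new_header == 'chrom\tStart\tEnd\tgene_name'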
Example #3
def _tracks(cnf, track_fpath, input_fpath):
    if not verify_file(track_fpath):
        return None

    field_name = splitext_plus(basename(track_fpath))[0]

    step_greetings('Intersecting with ' + field_name)

    output_fpath = intermediate_fname(cnf, input_fpath, field_name)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    toolpath = get_system_path(cnf, 'vcfannotate')
    if not toolpath:
        err('WARNING: Skipping annotation with tracks: vcfannotate '
            'executable not found; you probably need to specify the path in system_config, or '
            'run "load bcbio: . /group/ngs/bin/bcbio-prod.sh"')
        return None

    # self.all_fields.append(field_name)

    cmdline = '{toolpath} -b {track_fpath} -k {field_name} {input_fpath}'.format(
        **locals())

    assert input_fpath
    output_fpath = call_subprocess(cnf,
                                   cmdline,
                                   input_fpath,
                                   output_fpath,
                                   stdout_to_outputfile=True,
                                   overwrite=True)
    if not verify_vcf(output_fpath):
        err('Error: annotation with track ' + track_fpath +
            ' produced an invalid VCF: ' + str(output_fpath))
        return output_fpath

    # Set TRUE or FALSE for tracks
    def proc_line(line, i):
        if field_name in line:
            if not line.startswith('#'):
                fields = line.split('\t')
                info_line = fields[7]
                info_pairs = [attr.split('=') for attr in info_line.split(';')]
                info_pairs = [[pair[0], ('TRUE' if pair[1] else 'FALSE')] if
                              pair[0] == field_name and len(pair) > 1 else pair
                              for pair in info_pairs]
                info_line = ';'.join(
                    '='.join(pair) if len(pair) == 2 else pair[0]
                    for pair in info_pairs)
                fields = fields[:7] + [info_line] + fields[8:]
                return '\t'.join(fields)
        return line

    assert output_fpath
    output_fpath = iterate_file(cnf, output_fpath, proc_line, suffix='trk')
    return verify_vcf(output_fpath, is_critical=True)
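# A standalone sketch of the TRUE/FALSE rewrite that proc_line above performs,
# assuming a tab-separated VCF data line with the track flag in the INFO
# column (index 7). The function name and sample values are illustrative.
def _mark_track_flag(line, field_name):
    if line.startswith('#') or field_name not in line:
        return line
    fields = line.split('\t')
    info_pairs = [attr.split('=') for attr in fields[7].split(';')]
    # any non-empty value under field_name collapses to TRUE, empty to FALSE
    info_pairs = [[p[0], 'TRUE' if p[1] else 'FALSE']
                  if p[0] == field_name and len(p) > 1 else p
                  for p in info_pairs]
    fields[7] = ';'.join('='.join(p) if len(p) == 2 else p[0]
                         for p in info_pairs)
    return '\t'.join(fields)

# _mark_track_flag('chr1\t100\t.\tA\tG\t50\tPASS\tDP=10;cosmic=hit', 'cosmic')
# -> 'chr1\t100\t.\tA\tG\t50\tPASS\tDP=10;cosmic=TRUE'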
Example #4
def process_one(cnf):
    sample = VarSample(cnf.sample, cnf.output_dir, vcf=cnf.vcf, bam=cnf.bam, genome=cnf.genome)

    step_greetings('Fixing "SAMPLE" INFO annotation and SAMPLE header...')
    vcf_fpath = fix_vcf_sample_name(cnf, sample.name, cnf.vcf)

    # this method will also gunzip the vcf file
    # sample.vcf = fix_chromosome_names(cnf, sample.vcf)
    # if cnf.vcf.endswith('.gz'):
    #     vcf_fpath = intermediate_fname(cnf, splitext(sample.vcf)[0], None)
    #     info('Ungzipping ' + sample.vcf + ', writing to ' + vcf_fpath)
    #     gunzip = get_system_path(cnf, 'gunzip', is_critical=True)
    #     cmdl = '{gunzip} {sample.vcf} --to-stdout'.format(**locals())
    #     call(cnf, cmdl, output_fpath=vcf_fpath)
    #     verify_vcf(vcf_fpath)
    #     sample.vcf = vcf_fpath

    step_greetings('Removing rejected records...')
    pass_vcf_fpath = remove_rejected(cnf, vcf_fpath)
    info()

    # if sample.vcf is None:
    #     err('No variants left for ' + cnf.vcf + ': all rejected and removed.')
    #     return None, None, None

    # # In mutect, running paired analysis on a single sample could lead
    # # to a "none" sample column. Removing that column.
    # info('get_sample_column_index')
    # none_idx = get_sample_column_index(sample.vcf, 'none', suppress_warn=True)
    # if none_idx is not None:
    #     info('Removing the "none" column.')
    #     def fn(line, i):
    #         if line and not line.startswith('##'):
    #             ts = line.split('\t')
    #             del ts[9 + none_idx]
    #             return '\t'.join(ts) + '\n'
    #         return line
    #     sample.vcf = iterate_file(cnf, sample.vcf, fn, suffix='none_col')

    # Replacing so the main sample goes first (if it is not already)
    # main_idx = get_sample_column_index(sample.vcf, sample.name)
    # if main_idx:
    #     info('Moving the main sample column (' + sample.name + ') to the first place.')
    #     def fn(line, i):
    #         if line and not line.startswith('##'):
    #             ts = line.split('\t')
    #             main_sample_field = ts[9 + main_idx]
    #             del ts[9 + main_idx]
    #             ts = ts[:9] + [main_sample_field] + ts[9:]
    #             return '\t'.join(ts) + '\n'
    #         return line
    #     sample.vcf = iterate_file(cnf, sample.vcf, fn, suffix='main_col')

    anno_vcf_fpath = run_annotators(cnf, pass_vcf_fpath, sample.bam)

    return finialize_annotate_file(cnf, anno_vcf_fpath, sample, cnf.caller)
Example #5
def make_tsv(cnf, vcf_fpath, samplename, main_sample_index=None):
    step_greetings('Exporting to TSV...')

    vcf_fpath = vcf_one_per_line(cnf, vcf_fpath)

    if main_sample_index is None:
        main_sample_index = get_sample_column_index(vcf_fpath, samplename) or 0

    tsv_fpath = _extract_fields(cnf, vcf_fpath, samplename, main_sample_index)
    return tsv_fpath
Example #6
def run_seq2c(cnf, output_dirpath, samples, seq2c_bed, is_wgs):
    step_greetings('Running Seq2C')

    bams_by_sample = dict()
    for s in samples:
        if not s.bam:
            err('No BAM file for ' + s.name)
            continue
        bams_by_sample[s.name] = s.bam
        # cnf.work_dir = join(ori_work_dir, source.targqc_name + '_' + s.name)
        # safe_mkdir(cnf.work_dir)
        # s.dedup_bam = intermediate_fname(cnf, s.bam, source.dedup_bam)
        # dedupped_bam_by_sample[s.name] = s.dedup_bam
        # if verify_bam(s.dedup_bam, silent=True):
        #     info(s.dedup_bam + ' exists')
        # else:
        #     info('Deduplicating bam file ' + s.dedup_bam)
        #     dedup_jobs.append(remove_dups(cnf, s.bam, s.dedup_bam, use_grid=True))

    # cnf.work_dir = ori_work_dir
    # wait_for_jobs(cnf, dedup_jobs)
    #
    # ok = True
    # for s in samples:
    #     if not dedupped_bam_by_sample.get(s.name) or not verify_bam(dedupped_bam_by_sample[s.name]):
    #         err('No BAM file for ' + s.name)
    #         ok = False
    # if not ok:
    #     err('No BAM files found for any sample, cannot run Seq2C.')
    #     return None

    info('Getting reads and cov stats')
    mapped_read_fpath = join(output_dirpath, 'mapped_reads_by_sample.tsv')
    mapped_read_fpath, samples = __get_mapped_reads(cnf, samples, bams_by_sample, mapped_read_fpath)
    info()
    if not mapped_read_fpath:
        return None

    combined_gene_depths_fpath = join(output_dirpath, 'cov.tsv')
    combined_gene_depths_fpath = __seq2c_coverage(cnf, samples, bams_by_sample, seq2c_bed, is_wgs, combined_gene_depths_fpath)
    info()
    if not combined_gene_depths_fpath:
        return None

    seq2c_report_fpath = join(output_dirpath, source.seq2c_name + '.tsv')
    seq2c_report_fpath = __final_seq2c_scripts(cnf, mapped_read_fpath, combined_gene_depths_fpath, seq2c_report_fpath)
    if not seq2c_report_fpath:
        return None

    info('Done. The result is ' + seq2c_report_fpath)
    return seq2c_report_fpath
Example #7
def _filter_malformed_fields(cnf, input_fpath):
    step_greetings('Correcting malformed fields...')

    def proc_rec(rec):
        for k, v in rec.INFO.items():
            if isinstance(v, list):
                if v[-1] == '.':
                    rec.INFO[k] = rec.INFO[k][:-1]
                if v[0] == '.':
                    rec.INFO[k] = rec.INFO[k][1:]
        return rec

    def proc_line(line, i):
        if line.startswith('#'):
            return line.replace("\' \">", "\'\">")  # For vcf-merge
        return line

        # else:
        # if ',.' in line or '.,' in line:
        #     fields = line.split('\t')
        #     info_line = fields[7]
        #     info_pairs = [attr.split('=') for attr in info_line.split(';')]
        #     new_info_pairs = []
        #     for p in info_pairs:
        #         if len(p) == 2:
        #             if p[1].endswith(',.'):
        #                 p[1] = p[1][:-2]
        #             if p[1].startswith('.,'):
        #                 p[1] = p[1][2:]
        #             new_info_pairs.append('='.join(p))
        #     info_line = ';'.join(new_info_pairs)
        #     fields = fields[:7] + [info_line] + fields[8:]
        #     return '\t'.join(fields)

    info('Correcting INFO fields...')
    output_fpath = iterate_vcf(cnf, input_fpath, proc_rec, suffix='corr')
    info('')
    info('Correcting headers for vcf-merge...')
    output_fpath = iterate_file(cnf,
                                output_fpath,
                                proc_line,
                                suffix='corr_headr')

    return output_fpath
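# The list trimming done by proc_rec above, standalone: leading or trailing
# '.' placeholders are dropped from multi-valued INFO entries. A plain dict
# stands in here for rec.INFO.
def _trim_dot_placeholders(info):
    for k, v in info.items():
        if isinstance(v, list) and v:
            if v[-1] == '.':
                info[k] = info[k][:-1]
            if v[0] == '.':
                info[k] = info[k][1:]
    return info

# _trim_dot_placeholders({'AF': ['0.5', '.']}) -> {'AF': ['0.5']}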
Example #8
def main():
    info(' '.join(sys.argv))
    info()

    cnf, bcbio_structure = bcbio_summary_script_proc_params(
        BCBioStructure.fastqc_name, BCBioStructure.fastqc_dir)

    step_greetings('FastQC summary for all samples')

    final_summary_report_fpath = join(cnf.output_dir,
                                      source.fastqc_name + '.html')

    write_fastqc_combo_report(cnf, final_summary_report_fpath,
                              bcbio_structure.samples)

    info()
    info('*' * 70)
    info('Fastqc summary:')
    info('  ' + final_summary_report_fpath)
Example #9
def fix_chromosome_names(cnf, vcf_fpath):
    with open(vcf_fpath) as f:
        for l in f:
            if not l.startswith('#'):
                if l.startswith('chr'):
                    info('Chromosome names are hg19, no need to fix.')
                    return vcf_fpath

    step_greetings('Fixing chromosome names')

    def _proc_rec(rec):
        if not rec.CHROM.startswith('chr'):
            rec.CHROM = 'chr' + rec.CHROM
        return rec

    out_fpath = iterate_vcf(cnf, vcf_fpath, _proc_rec, 'chr')

    if not verify_file(out_fpath):
        err('Could not run fix_chromosome_names')

    return out_fpath
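# The record-level transform above in isolation: hg19-style chromosome names
# get a 'chr' prefix, already-prefixed names pass through unchanged.
def _ensure_chr_prefix(chrom):
    return chrom if chrom.startswith('chr') else 'chr' + chrom

# _ensure_chr_prefix('1')    -> 'chr1'
# _ensure_chr_prefix('chrX') -> 'chrX'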
Example #10
def add_annotation(cnf, input_fpath, key, value, number, type_, description):
    step_greetings('Adding annotation...')

    def proc_rec(rec):
        rec.INFO[key] = value
        return rec

    output_fpath = iterate_vcf(cnf, input_fpath, proc_rec)

    info('Adding header meta info...')

    def _add_format_header(l, i):
        if l.startswith('#CHROM'):
            ext_l = ''
            ext_l += '##INFO=<ID={key},Number={number},Type={type_},Description="{desc}">\n'.format(
                key=key, number=number, type_=type_, desc=description)
            return ext_l + l
        return l

    output_fpath = iterate_file(cnf, output_fpath, _add_format_header)
    return verify_vcf(output_fpath, is_critical=True)
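# For reference, the meta-information line that _add_format_header injects
# above #CHROM, rendered with illustrative values:
header = '##INFO=<ID={key},Number={number},Type={type_},Description="{desc}">\n'.format(
    key='MY_KEY', number=1, type_='String', desc='my annotation')
# header == '##INFO=<ID=MY_KEY,Number=1,Type=String,Description="my annotation">\n'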
Example #11
def draw_plots(cnf, vcf_fpath):
    step_greetings('Quality control plots')

    chr_lengths = get_chr_lengths(cnf)
    qc_cnf = cnf['quality_control']
    variants_per_kbp = qc_cnf.get('variant_distribution_scale')
    plot_scale = 1000 * variants_per_kbp

    info()
    info('Substitutions and indel stats...')
    variants_distribution, substitutions, indel_lengths = _get_subs_and_indel_stats(vcf_fpath, chr_lengths, plot_scale)
    substs_plot_fpath = _draw_substitutions(cnf, substitutions)
    indels_plot_fpath = _draw_indel_lengths(cnf, indel_lengths)
    if substs_plot_fpath:
        info('  Substitutions: ' + substs_plot_fpath)
    if indels_plot_fpath:
        info('  Indels:        ' + indels_plot_fpath)
    variants_distribution_plot_fpath = _draw_variants_distribution(cnf, variants_distribution, chr_lengths,
                                                                   variants_per_kbp)
    if variants_distribution_plot_fpath:
        info('  Variant distr: ' + variants_distribution_plot_fpath)
    return [x for x in [variants_distribution_plot_fpath, substs_plot_fpath, indels_plot_fpath] if x is not None]
Example #12
def run_seq2c_bcbio_structure(cnf, bcbio_structure):
    step_greetings('Coverage statistics for each gene for all samples')

    if cnf.prep_bed is not False:
        info('Preparing BED files')
        features_bed_fpath = cnf.features or cnf.genome.features  # only for annotation
        if cnf.bed or bcbio_structure.bed:
            _, _, _, seq2c_bed = \
                prepare_beds(cnf, features_bed=features_bed_fpath,
                    target_bed=bcbio_structure.bed, seq2c_bed=bcbio_structure.sv_bed)
        else:
            seq2c_bed = verify_bed(cnf.genome.cds)
    else:
        seq2c_bed = verify_bed(cnf.bed)

    info('Calculating normalized coverages for CNV...')
    cnv_report_fpath = run_seq2c(
        cnf, join(bcbio_structure.date_dirpath, BCBioStructure.cnv_dir),
        bcbio_structure.samples, seq2c_bed, is_wgs=cnf.is_wgs)

    # if not verify_module('matplotlib'):
    #     warn('No matplotlib, skipping plotting Seq2C')
    # else:
    #     Parallel(n_jobs=cnf.threads) \
    #         (delayed(draw_seq2c_plot)(CallCnf(cnf.__dict__), cnv_report_fpath, s.name,
    #                 cnf.output_dir, chr_lens=get_chr_lengths(cnf))
    #             for s in bcbio_structure.samples)
    #
    #     for s in bcbio_structure.samples:
    #         plot_fpath = draw_seq2c_plot(cnf, cnv_report_fpath, s.name, cnf.output_dir)
    info()
    info('*' * 70)
    if cnv_report_fpath:
        info('Seq2C:')
        info('   ' + cnv_report_fpath)

    return [cnv_report_fpath]
Example #13
def _snpsift_db_nsfp(cnf, input_fpath):
    if 'dbnsfp' not in cnf.annotation or 'dbnsfp' not in cnf.genome:
        return None

    step_greetings('DB NSFP')

    output_fpath = intermediate_fname(cnf, input_fpath, 'db_nsfp')
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    executable = get_java_tool_cmdline(cnf, 'snpsift')

    db_path = cnf['genome']['dbnsfp']
    if not verify_file(db_path, 'DB NSFP file'):
        err('DB NSFP file is incorrect. Skipping.')
        return None

    annotations = cnf.annotation['dbnsfp'].get('annotations') or []

    # all_fields.extend(['dbNSFP_' + ann for ann in annotations])

    ann_line = ('-f ' + ','.join(annotations)) if annotations else ''

    cmdline = '{executable} dbnsfp {ann_line} -v -db {db_path} ' \
              '{input_fpath}'.format(**locals())
    if call_subprocess(cnf,
                       cmdline,
                       input_fpath,
                       output_fpath,
                       stdout_to_outputfile=True,
                       exit_on_error=False,
                       overwrite=True):
        return verify_vcf(output_fpath, is_critical=True)
    else:
        return None
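# Shape of the SnpSift command assembled above, with illustrative paths (the
# actual executable prefix comes from get_java_tool_cmdline, typically a
# java -jar invocation, and the -f list comes from the annotation config):
#
#   <snpsift> dbnsfp -f SIFT_score,Polyphen2_HDIV_pred -v \
#       -db /path/to/dbNSFP.txt.gz input.vcf > output.vcf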
Example #14
def _mongo(cnf, input_fpath):
    step_greetings('Annotating from Mongo')

    if 'mongo' not in cnf.annotation:
        return None

    executable = get_java_tool_cmdline(
        cnf, join('ext_tools', 'mongo_loader', 'VCFStore.jar'))
    output_fpath = intermediate_fname(cnf, input_fpath, 'mongo')
    project_name = cnf.project_name

    cmdline = ('{executable} -module annotation -inputFile {input_fpath} '
               '-outputFile {output_fpath} -project {project_name} ').format(
                   **locals())
    if call_subprocess(cnf,
                       cmdline,
                       input_fpath,
                       output_fpath,
                       stdout_to_outputfile=False,
                       exit_on_error=False):
        return output_fpath
    else:
        return None
Example #15
def _report_normalize_coverage_for_variant_sites(cnf, summary_threads,
                                                 output_dir, samples, vcf_key,
                                                 bed_fpath):
    step_greetings('Combined normalized coverage for ' + vcf_key + ' hotspots')

    # vcf_fpath = cnf.genome.get(vcf_key)
    # if not vcf_fpath:
    #     err('Error: no ' + vcf_key + ' for ' + cnf.genome.name + ' VCF fpath specified in ' + cnf.sys_cnf)
    #     return None
    #
    # ave_coverages_per_sample = {
    #     s.name: get_ave_coverage(cnf, s.targetcov_json_fpath)
    #     for s in samples if verify_file(s.targetcov_json_fpath)}
    #
    # clipped_gz_vcf_fpath, regions_in_order, vars_by_region, var_by_site = \
    #     _read_vars_per_region_and_clip_vcf(cnf, vcf_fpath, bed_fpath)
    #
    # samtools = get_system_path(cnf, 'samtools')
    # bedtools = get_system_path(cnf, 'bedtools')
    #
    # vars_by_region_per_sample = OrderedDict(Parallel(n_jobs=summary_threads)
    #    (delayed(_get_depth_for_each_variant)(
    #         CallCnf(cnf.__dict__), samtools, bedtools, var_by_site, clipped_gz_vcf_fpath, bed_fpath,
    #         s.name, s.bam)
    #     for s in samples))

    ############################ Combined ############################
    # info()
    # info('*' * 70)
    # info('Saving for all samples: combined reports.')
    # # best_report = _prep_best_report(single_report_metric_storage, samples)
    # comb_report = _prep_comb_report(single_report_metric_storage, samples, shared_general_metrics, shared_metrics)
    #
    # total_variants = 0
    # nth_regions_from_each_sample = [s.report.get_regions() for s in samples]
    # while True:
    #     nth_region_from_each_sample = [rs[total_variants] for rs in nth_regions_from_each_sample if total_variants < len(rs)]
    #     total_variants += 1
    #     if len(nth_region_from_each_sample) == 0:
    #         break
    #     assert len(nth_region_from_each_sample) == len(nth_regions_from_each_sample), 'Region files for samples are not equal size'
    #
    #     # best_report_reg = best_report.add_region()
    #     comb_report_reg = comb_report.add_region()
    #     rand_line = nth_region_from_each_sample[0]
    #     for i in range(9):
    #         # best_report_reg.records.append(rand_line.records[i])
    #         comb_report_reg.records.append(rand_line.records[i])
    #
    #     # best_depth = select_best(r.records[10].value for r in nth_region_from_each_sample)
    #     # best_norm_depth = select_best(r.records[11].value for r in nth_region_from_each_sample)
    #     # best_report_reg.add_record('Depth', best_depth)
    #     # best_report_reg.add_record('Norm depth', best_norm_depth)
    #
    #     for s, r in zip(samples, nth_region_from_each_sample):
    #         comb_report_reg.add_record(s.name + ' hotspots depths/norm depths', r.records[9].value)
    #
    # best_report_basename = 'Best.' + source.targetseq_name  + '_' + vcf_key
    # comb_report_basename = 'Comb.' + source.targetseq_name  + '_' + vcf_key
    # # best_targetcov_norm_depth_vcf_txt = best_report.save_txt(output_dir, best_report_basename)
    # # best_targetcov_norm_depth_vcf_tsv = best_report.save_tsv(output_dir, best_report_basename)
    # comb_targetcov_norm_depth_vcf_txt = comb_report.save_txt(output_dir, comb_report_basename)
    # comb_targetcov_norm_depth_vcf_tsv = comb_report.save_tsv(output_dir, comb_report_basename)
    # info('')
    # info('Depths for Oncomine variants (total: {0:,} variants, {0:,} regions) saved into:'.format(total_variants))
    # # info('  Best:     ' + best_targetcov_norm_depth_vcf_txt)
    # info('  Combined: ' + comb_targetcov_norm_depth_vcf_txt)
    #
    return None, None  # comb_targetcov_norm_depth_vcf_txt
Example #16
def summarize_targqc(cnf,
                     summary_threads,
                     output_dir,
                     samples,
                     bed_fpath=None,
                     features_fpath=None,
                     tag_by_sample=None):
    step_greetings('TargQC coverage statistics for all samples')

    correct_samples = []

    for sample in samples:
        # if not sample.targetcov_done():
        #     err('Error: target coverage is not done (json, html, or detail tsv are not there)')
        # else:
        correct_samples.append(sample)
        # if not sample.ngscat_done():
        # sample.ngscat_html_fpath = None
        # if not sample.qualimap_done():
        # sample.qualimap_html_fpath = None
    samples = correct_samples

    # _make_targetcov_symlinks(samples)

    txt_fpath, tsv_fpath, html_fpath = _make_tarqc_html_report(
        cnf, output_dir, samples, bed_fpath, tag_by_sample=tag_by_sample)

    best_for_regions_fpath = None
    if any(
            verify_file(s.targetcov_detailed_tsv, silent=True)
            for s in samples):
        best_for_regions_fpath = _save_best_details_for_each_gene(
            cnf.coverage_reports.depth_thresholds, samples, output_dir)
    ''' 1. best_regions = get_best_regions()
        2. best_for_regions_fpath = save_per_region_report()
        3. calc median coverage across best regions
        4. flagged_regions_report_fpath = _generate_flagged_regions_report(
             output_dir, 'Best', average_coverage, genes, depth_threshs)
    '''

    if cnf.extended:
        if not features_fpath or not bed_fpath:
            err('For the extended analysis, capture and features BED files are required!')
        else:
            features_bed, features_no_genes_cut_bed, target_bed, _ = prepare_beds(
                cnf, features_fpath, bed_fpath)

            #norm_best_var_fpath, norm_comb_var_fpath = _report_normalize_coverage_for_variant_sites(
            #    cnf, summary_threads, output_dir, samples, 'oncomine', bed_fpath)

    info()
    info('*' * 70)
    if not html_fpath and not txt_fpath:
        info(
            'TargQC summary was not generated, because there were no reports generated for individual samples.'
        )
    else:
        info('TargQC summary saved in: ')
        for fpath in [txt_fpath, html_fpath]:
            if fpath: info('  ' + fpath)

    if best_for_regions_fpath:
        info()
        info('Best stats for regions saved in:')
        info('  ' + best_for_regions_fpath)

    # if cnf.extended:
    #     if norm_best_var_fpath:
    #         info()
    #         info('Normalized depths for oncomine saved in:')
    #         info('        ' + norm_comb_var_fpath)
    #         info('  Best: ' + norm_best_var_fpath)

    return html_fpath
Example #17
def make_report(cnf, vcf_fpath, sample):
    set_db_versions(cnf)
    step_greetings('Quality control reports')

    total_with_rejected = 0
    total = 0
    snps = 0
    inss = 0
    dels = 0
    dbsnps = 0
    cosmics = 0
    novels = 0
    hets = 0
    homs = 0
    transitions = 0
    transversions = 0

    with open_gzipsafe(vcf_fpath) as f:
        reader = vcf_parser.Reader(f)
        for rec in (vcf_processing.Record(rec, vcf_fpath, i)
                    for i, rec in enumerate(reader)):
            total_with_rejected += 1

            if not rec.FILTER or rec.FILTER == 'PASS':
                if rec.FILTER:
                    warn('Warn: ' + rec.get_variant() + ' FILTER=' +
                         str(rec.FILTER))

                total += 1

                if rec.is_snp:
                    snps += 1
                    if rec.is_transition:
                        transitions += 1
                    elif len(rec.ALT) == 1:
                        transversions += 1
                elif rec.is_indel:
                    if rec.is_deletion:
                        dels += 1
                    elif len(rec.ALT) == 1:
                        inss += 1

                if not rec.ID:
                    novels += 1
                else:
                    ids = rec.ID
                    if isinstance(ids, basestring):
                        ids = [ids]
                    if any(id.startswith('COS') for id in ids):
                        cosmics += 1
                    if any(id.startswith('rs') for id in ids):
                        dbsnps += 1

                call = rec.samples[0]
                if call.called:
                    if call.gt_type == 1:
                        hets += 1
                    elif call.gt_type == 2:
                        homs += 1

    report = SampleReport(sample, metric_storage=metric_storage)
    report.add_record('Total variants', total)
    report.add_record('SNPs', snps)
    report.add_record('Insertions', inss)
    report.add_record('Deletions', dels)
    report.add_record('Novel', novels)
    report.add_record('Novel, %', 1.0 * novels / total if total else None)
    report.add_record('In dbSNP', dbsnps)
    report.add_record('In dbSNP, %', 1.0 * dbsnps / total if total else None)
    report.add_record('In Cosmic', cosmics)
    report.add_record('In Cosmic, %', 1.0 * cosmics / total if total else None)
    report.add_record('Het/hom', float(hets) / homs if homs != 0 else None)
    report.add_record(
        'Ti/tv',
        float(transitions) / transversions if transversions != 0 else None)
    report.add_record('Total with rejected', total_with_rejected)

    return report
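# A minimal sketch of the transition/transversion split behind the Ti/tv
# metric above, assuming biallelic SNPs (single-base REF and ALT). Transitions
# stay within purines (A<->G) or pyrimidines (C<->T); every other single-base
# substitution is a transversion.
def _is_transition(ref, alt):
    return {ref.upper(), alt.upper()} in ({'A', 'G'}, {'C', 'T'})

# _is_transition('A', 'G') -> True   (counted in transitions)
# _is_transition('A', 'C') -> False  (counted in transversions)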
Example #18
def _snpsift_annotate(cnf, vcf_conf, dbname, input_fpath):
    if not vcf_conf:
        err('No database for ' + dbname + ', skipping.')
        return None

    step_greetings('Annotating with ' + dbname)

    output_fpath = intermediate_fname(cnf, input_fpath, dbname)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    executable = get_java_tool_cmdline(cnf, 'snpsift')
    java = get_system_path(cnf, 'java')
    info('Java version:')
    call(cnf, java + ' -version')
    info()

    db_path = cnf['genome'].get(dbname)
    if not db_path:
        db_path = vcf_conf.get('path')
        if not db_path:
            err('Please, provide a path to ' + dbname +
                ' in the "genomes" section in the system config. The config is: '
                + str(cnf['genome']))
            return
        verify_file(db_path, is_critical=True)

    annotations = vcf_conf.get('annotations')

    if not cnf.no_check:
        info('Removing previous annotations...')

        def delete_annos(rec):
            for anno in annotations:
                if anno in rec.INFO:
                    del rec.INFO[anno]
            return rec

        if annotations:
            input_fpath = iterate_vcf(cnf,
                                      input_fpath,
                                      delete_annos,
                                      suffix='d')

    anno_line = ''
    if annotations:
        anno_line = '-info ' + ','.join(annotations)

    cmdline = '{executable} annotate -v {anno_line} {db_path} {input_fpath}'.format(
        **locals())
    output_fpath = call_subprocess(cnf,
                                   cmdline,
                                   input_fpath,
                                   output_fpath,
                                   stdout_to_outputfile=True,
                                   exit_on_error=False,
                                   overwrite=True)
    if not output_fpath:
        err('Error: SnpSift annotation with ' + dbname + ' returned ' + str(output_fpath))
        return output_fpath
    verify_vcf(output_fpath, is_critical=True)
    # f = open(output_fpath)
    # l = f.readline()
    # if 'Cannot allocate memory' in l:
    #     f.close()
    #     f = open(output_fpath)
    #     contents = f.read()
    #     critical('SnpSift failed with memory issue:\n' + contents)
    #     f.close()
    #     return None

    if not cnf.no_check:
        info_pattern = re.compile(
            r'''\#\#INFO=<
            ID=(?P<id>[^,]+),\s*
            Number=(?P<number>-?\d+|\.|[AG]),\s*
            Type=(?P<type>Integer|Float|Flag|Character|String),\s*
            Description="(?P<desc>[^"]*)"
            >''', re.VERBOSE)

        def _fix_after_snpsift(line, i, ctx):
            if not line.startswith('#'):
                if not ctx['met_CHROM']:
                    return None
                line = line.replace(' ', '_')
                assert ' ' not in line

            # elif line.startswith('##INFO=<ID=om'):
            #     line = line.replace(' ', '')

            elif not ctx['met_CHROM'] and line.startswith('#CHROM'):
                ctx['met_CHROM'] = True

            elif line.startswith('##INFO'):
                m = info_pattern.match(line)
                if m:
                    line = '##INFO=<ID={0},Number={1},Type={2},Description="{3}">'.format(
                        m.group('id'), m.group('number'), m.group('type'),
                        m.group('desc'))
            return line

        output_fpath = iterate_file(cnf,
                                    output_fpath,
                                    _fix_after_snpsift,
                                    suffix='fx',
                                    ctx=dict(met_CHROM=False))

    return verify_vcf(output_fpath, is_critical=True)
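# What the ##INFO normalization in _fix_after_snpsift does, shown on an
# illustrative header line: SnpSift may leave spaces after the commas, and
# re-emitting the matched groups restores the canonical spacing.
#
#   in:  ##INFO=<ID=CAF, Number=., Type=String, Description="Allele freq">
#   out: ##INFO=<ID=CAF,Number=.,Type=String,Description="Allele freq">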
Example #19
def make_report_metadata(cnf,
                         bcbio_structure,
                         oncoprints_link=None,
                         circos_link=None):
    step_greetings('Making the %s project-level report' %
                   ('preproc' if bcbio_structure is None else 'postproc'))

    # if dataset_structure is None and bcbio_structure:
    #     analysis_dirpath = normpath(join(bcbio_structure.bcbio_project_dirpath, pardir))
    #     dataset_dirpath = realpath(join(analysis_dirpath, 'dataset'))
    #     dataset_structure = DatasetStructure.create(dataset_dirpath, bcbio_structure.project_name)

    general_records = _add_summary_reports(cnf, metric_storage.general_section,
                                           bcbio_structure)
    # sample_reports_records = _add_per_sample_reports(cnf, metric_storage.sections[0], bcbio_structure)

    # sample_reports = []
    # if dataset_project:
    #     samples = dataset_project.sample_by_name.values()
    # if bcbio_structure:
    samples = bcbio_structure.samples
    # for sample in samples:
    #     sample_reports.append(SampleReport(sample,
    #         records=sample_reports_records[sample.name],
    #         html_fpath=None,
    #         metric_storage=metric_storage))

    full_report = FullReport(cnf.project_name, [],
                             metric_storage=metric_storage,
                             general_records=general_records)

    project_report_html_fpath = bcbio_structure.multiqc_fpath
    project_name = bcbio_structure.project_name

    additional_data = dict()
    normal_samples = [
        s for s in bcbio_structure.samples if s.phenotype == 'normal'
    ]
    if normal_samples:
        sample_match_on_hover_js = '<script type="text/javascript">\n'
        for s in bcbio_structure.samples:
            if s.phenotype != 'normal' and s.normal_match:
                sample_match_on_hover_js += (
                    '' + '\tdocument.getElementById("' + s.name +
                    '_match").onmouseover = function() { document.getElementById("'
                    + s.normal_match.name +
                    '").style.backgroundColor = "#EEE"; };\n' +
                    '\tdocument.getElementById("' + s.name +
                    '_match").onmouseleave = function() { document.getElementById("'
                    + s.normal_match.name +
                    '").style.backgroundColor = "white"; };\n')
        sample_match_on_hover_js += '</script>\n'
        additional_data['sample_match_on_hover_js'] = sample_match_on_hover_js

    metadata = _report_to_multiqc_metadata(cnf,
                                           full_report,
                                           project_report_html_fpath,
                                           project_name,
                                           bcbio_structure,
                                           additional_data=additional_data,
                                           oncoprints_link=oncoprints_link,
                                           circos_link=circos_link)

    metadata_fpath = join(bcbio_structure.work_dir, 'az_multiqc_metadata.yaml')
    import yaml
    with open(metadata_fpath, 'w') as outfile:
        yaml.dump(metadata, outfile, default_flow_style=False)
    import json
    with open(metadata_fpath.replace('.yaml', '.json'), 'w') as outfile:
        json.dump(metadata, outfile)

    return metadata_fpath.replace('.yaml', '.json')
Example #20
def _snpeff(cnf, input_fpath):
    if 'snpeff' not in cnf.annotation or 'snpeff' not in cnf.genome:
        return None, None, None

    step_greetings('SnpEff')

    output_fpath = intermediate_fname(cnf, input_fpath, 'snpEff')
    stats_fpath = join(
        cnf.work_dir, cnf.sample + (('-' + cnf.caller) if cnf.caller else '') +
        '.snpEff_summary.csv')

    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath, stats_fpath, splitext(
            stats_fpath)[0] + '.genes.txt'

    snpeff = get_java_tool_cmdline(cnf, 'snpeff')

    ref_name = cnf.genome.snpeff.reference or cnf.genome.name
    if ref_name.startswith('hg19') or ref_name.startswith('GRCh37'):
        ref_name = 'GRCh37.75'
    if ref_name.startswith('hg38'): ref_name = 'GRCh38.82'

    opts = ''
    if cnf.annotation.snpeff.cancer: opts += ' -cancer'

    assert cnf.transcripts_fpath, 'Transcript for annotation must be specified!'
    verify_file(cnf.transcripts_fpath,
                'Transcripts for snpEff -onlyTr',
                is_critical=True)
    opts += ' -onlyTr ' + cnf.transcripts_fpath + ' '

    db_path = adjust_system_path(cnf.genome.snpeff.data)
    if db_path:
        opts += ' -dataDir ' + db_path
    elif cnf.resources.snpeff.config:
        conf = get_system_path(cnf, cnf.resources.snpeff.config)
        if conf:
            opts += ' -c ' + conf + ' '
        else:
            err('Cannot find snpEff config file ' +
                str(cnf.resources.snpeff.config))

    if cnf.annotation.snpeff.extra_options:
        opts += ' ' + cnf.annotation.snpeff.extra_options

    if not cnf.no_check:
        info('Removing previous snpEff annotations...')
        res = remove_prev_eff_annotation(cnf, input_fpath)
        if not res:
            err('Could not remove previous snpEff annotations')
            return None, None, None
        input_fpath = res

    snpeff_type = get_snpeff_type(snpeff)
    if snpeff_type == "old":
        opts += ' -stats ' + stats_fpath + ' -csvStats'
    else:
        opts += ' -csvStats ' + stats_fpath

    cmdline = '{snpeff} eff {opts} -noLog -i vcf -o vcf {ref_name} {input_fpath}'.format(
        **locals())

    res = None
    for i in range(1, 20):
        try:
            res = call_subprocess(cnf,
                                  cmdline,
                                  input_fpath,
                                  output_fpath,
                                  exit_on_error=False,
                                  stdout_to_outputfile=True,
                                  overwrite=True)
        except OSError:
            import traceback, time
            err(traceback.format_exc())
            warn()
            info('Waiting 1 minute')
            time.sleep(60)
            info('Rerunning ' + str(i))
        else:
            break

    output_fpath = verify_vcf(output_fpath, is_critical=True)

    snpeff_summary_html_fpath = 'snpEff_summary.html'
    if isfile(snpeff_summary_html_fpath):
        info('SnpEff created ' + snpeff_summary_html_fpath +
             ' in the cwd, removing it...')
        try:
            os.remove(snpeff_summary_html_fpath)
        except OSError:
            pass

    if res:
        return output_fpath, stats_fpath, splitext(
            stats_fpath)[0] + '.genes.txt'
    else:
        return None, None, None
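# The retry idiom used in _snpeff above, standalone: re-attempt a flaky call a
# bounded number of times on OSError, pausing between attempts. The names and
# the one-minute delay are illustrative.
import time

def retry_on_oserror(fn, attempts=19, delay_sec=60):
    res = None
    for i in range(1, attempts + 1):
        try:
            res = fn()
        except OSError:
            time.sleep(delay_sec)
        else:
            break
    return res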
Example #21
def create_oncoprints_link(cnf, bcbio_structure, project_name=None):
    if is_us(): loc = exposing.us
    # elif is_uk(): loc = exposing.uk
    else:
        loc = exposing.local
        return None

    if not bcbio_structure.variant_callers:
        info('No variant calling performed, not generating Oncoprints')
        return None
    clinical_report_caller = \
        bcbio_structure.variant_callers.get('vardict') or \
        bcbio_structure.variant_callers.get('vardict-java')
    if not clinical_report_caller:
        err('Warning: vardict is not in the variant callers list, not generating Oncoprints')
        return None

    step_greetings('Creating Oncoprints link')
    zhongwu_data_query_dirpath = '/home/kdld047/public_html/cgi-bin/TS'
    if not isdir(zhongwu_data_query_dirpath):
        warn('Data Query directory ' + zhongwu_data_query_dirpath + ' does not exist.')
        return None

    vardict_txt_fname = variant_filtering.mut_fname_template.format(caller_name=clinical_report_caller.name)
    vardict_txt_fpath = join(bcbio_structure.var_dirpath, vardict_txt_fname)
    cnf.mutations_fpath = add_suffix(vardict_txt_fpath, variant_filtering.mut_pass_suffix)

    cnf.seq2c_tsv_fpath = bcbio_structure.seq2c_fpath

    samples = sorted(bcbio_structure.samples)
    cnf.project_name = project_name or bcbio_structure.project_name or basename(cnf.output_dir)
    study_name = re.sub(r'[.\-:&]', '_', cnf.project_name)

    check_genome_resources(cnf)

    data_query_dirpath = join(loc.dirpath, 'DataQueryTool')

    data_fpath = join(zhongwu_data_query_dirpath, study_name + '.data.txt')
    info_fpath = join(zhongwu_data_query_dirpath, study_name + '.info.txt')
    altered_genes = print_data_txt(cnf, cnf.mutations_fpath, cnf.seq2c_tsv_fpath, samples, data_fpath)
    if not altered_genes:
        err('No altered genes in ' + cnf.mutations_fpath + ' or ' + cnf.seq2c_tsv_fpath + ', not generating Oncoprints.')
        return None

    print_info_txt(cnf, samples, info_fpath)

    data_ext_fpath = data_fpath.replace('/home/', '/users/')
    info_ext_fpath = info_fpath.replace('/home/', '/users/')

    # optional:
    data_symlink = join(data_query_dirpath, study_name + '.data.txt')
    info_symlink = join(data_query_dirpath, study_name + '.info.txt')
    (symlink_to_ngs if is_us() else local_symlink)(data_ext_fpath, data_symlink)
    (symlink_to_ngs if is_us() else local_symlink)(info_ext_fpath, info_symlink)

    properties_fpath = join(zhongwu_data_query_dirpath, 'DataQuery.properties')
    add_data_query_properties(cnf, study_name, properties_fpath, data_ext_fpath, info_ext_fpath)

    genes = '%0D%0A'.join(altered_genes)
    data_query_url = join(loc.website_url_base, 'DataQueryTool', 'DataQuery.pl?'
        'analysis=oncoprint&'
        'study={study_name}&'
        'gene={genes}&'
        'order=on&'
        'freq=50&'
        'nocheckgenes=true&'
        'submit=Submit'
        .format(**locals()))

    info()
    info('Information about study was added in Data Query Tool, URL is ' + data_query_url)
    return data_query_url
Example #22
def run_annotators(cnf, vcf_fpath, bam_fpath):
    original_vcf = cnf.vcf

    db_section_by_name = OrderedDict(
        (dbname, cnf.annotation[dbname])
        for dbname in ['dbsnp', 'clinvar', 'cosmic', 'oncomine']
        if dbname in cnf.annotation
        and not cnf.annotation[dbname].get('skip-annotation'))

    # if not cnf.no_check:
    #     to_delete_id_ref = []
    #     if 'dbsnp' in db_section_by_name.keys():
    #         info('Removing IDs from dbsnp as rs*')
    #         to_delete_id_ref.append('rs')
    #     if 'cosmic' in db_section_by_name.keys():
    #         info('Removing IDs from dbsnp as COS*')
    #         to_delete_id_ref.append('COS')
    #
    #     def delete_ids(rec):  # deleting existing dbsnp and cosmic ID annotations
    #         if rec.ID:
    #             if isinstance(rec.ID, basestring):
    #                 if any(rec.ID.startswith(pref) for pref in to_delete_id_ref):
    #                     rec.ID = None
    #             else:
    #                 rec.ID = [id_ for id_ in rec.ID if not any(id_.startswith(pref) for pref in to_delete_id_ref)]
    #
    #         if not rec.FILTER:
    #             rec.FILTER = 'PASS'
    #
    #         return rec
    #
    #     info('Removing previous rs* and COS* IDs')
    #     vcf_fpath = iterate_vcf(cnf, vcf_fpath, delete_ids, suffix='delID')

    bcftools = get_system_path(cnf, 'bcftools')

    if not vcf_fpath.endswith('.gz') or not file_exists(vcf_fpath + '.tbi'):
        vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    cmdl = '{bcftools} annotate --remove ID {vcf_fpath}'
    res = call(cnf,
               cmdl.format(**locals()),
               output_fpath=add_suffix(rm_gz_ext(vcf_fpath), 'rmid'))
    if res:
        vcf_fpath = res
        vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    for dbname, dbconf in db_section_by_name.items() + cnf.annotation.get(
            'custom_vcfs', dict()).items():
        step_greetings('Annotating using ' + dbname)
        annotations = ','.join('INFO/' + a for a in dbconf.get('annotations'))
        if dbname in ('cosmic', 'dbsnp'):
            annotations += ',=ID'
        db_fpath = get_db_path(cnf, dbconf, dbname)
        if db_fpath:
            cmdl = '{bcftools} annotate -a ' + db_fpath + ' -c ' + annotations + ' {vcf_fpath}'
            res = call(cnf,
                       cmdl.format(**locals()),
                       output_fpath=add_suffix(rm_gz_ext(vcf_fpath), dbname))
            if res:
                vcf_fpath = res
                vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    verify_vcf(vcf_fpath, is_critical=True)

    if 'dbnsfp' in cnf.annotation:
        res = _snpsift_db_nsfp(cnf, vcf_fpath)
        if res:
            vcf_fpath = res

    if 'snpeff' in cnf.annotation:
        res, summary_fpath, genes_fpath = _snpeff(cnf, vcf_fpath)
        if res:
            vcf_fpath = res
            verify_vcf(vcf_fpath, is_critical=True)
            final_summary_fpath = join(cnf.output_dir, basename(summary_fpath))
            final_genes_fpath = join(cnf.output_dir, basename(genes_fpath))
            if isfile(final_summary_fpath): os.remove(final_summary_fpath)
            if isfile(final_genes_fpath): os.remove(final_genes_fpath)
            if file_exists(summary_fpath):
                shutil.move(summary_fpath, final_summary_fpath)
            if file_exists(genes_fpath):
                shutil.move(genes_fpath, final_genes_fpath)

    if 'tracks' in cnf.annotation and cnf.annotation['tracks']:
        track_fpaths = []
        for track_name in cnf.annotation['tracks']:
            if isfile(track_name) and verify_file(track_name):
                track_fpaths.append(track_name)
            else:
                if 'tracks' in cnf['genome'] and cnf['genome'][
                        'tracks'] and track_name in cnf['genome']['tracks']:
                    track_fpath = cnf['genome']['tracks'][track_name]
                    if verify_file(track_fpath):
                        track_fpaths.append(track_fpath)
        for track_fpath in track_fpaths:
            res = _tracks(cnf, track_fpath, vcf_fpath)
            if res:
                vcf_fpath = res

    if 'intersect_with' in cnf.annotation:
        step_greetings('Intersection with database VCFs...')
        for key, db_fpath in cnf.annotation['intersect_with'].items():
            res = intersect_vcf(cnf,
                                input_fpath=vcf_fpath,
                                db_fpath=db_fpath,
                                key=key)
            if res:
                vcf_fpath = res

    if 'mongo' in cnf.annotation:
        res = _mongo(cnf, vcf_fpath)
        if res:
            vcf_fpath = res

    return vcf_fpath
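# How the bcftools -c column string above is assembled, standalone: each
# configured annotation is prefixed with INFO/, and for dbSNP and COSMIC the
# function also transfers the variant ID column (the ',=ID' suffix). The
# annotation names below are illustrative.
annotations = ','.join('INFO/' + a for a in ['CAF', 'COMMON'])
annotations += ',=ID'  # appended only for dbname in ('cosmic', 'dbsnp')
# annotations == 'INFO/CAF,INFO/COMMON,=ID'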