Example #1
def make_general_reports(view, samples, target, genome, depth_threshs, bed_padding,
                         num_pairs_by_sample=None, reuse=False, is_debug=False, reannotate=False, fai_fpath=None):
    if all(all(can_reuse(fp, [s.bam, target.qualimap_bed_fpath] if target.bed else s.bam)
               for fp in _qualimap_outputs(s))
           for s in samples):
        debug('All QualiMap files for all samples exist and newer than BAMs and BEDs, reusing')
    else:
        info('Running QualiMap...')
        view.run(runner.run_qualimap,
            [[s.work_dir, s.qualimap_dirpath, _qualimap_outputs(s), s.bam, genome, target.qualimap_bed_fpath, view.cores_per_job]
             for s in samples])

        for s in samples:
            for fp in _qualimap_outputs(s):
                verify_file(fp, is_critical=True)

    summary_reports = []

    for sample in samples:
        info('-'*70)
        info(sample.name)
        debug('-'*70)
        debug('Parsing QualiMap results...')
        depth_stats, reads_stats, indels_stats, target_stats = parse_qualimap_results(sample)

        _prep_report_data(sample, depth_stats, reads_stats, indels_stats, target_stats,
                          target, num_pairs_by_sample, genome, depth_threshs, fai_fpath=fai_fpath)

        r = _build_report(depth_stats, reads_stats, indels_stats, sample, target,
                          depth_threshs, bed_padding, sample_num=len(samples), is_debug=is_debug,
                          reannotate=reannotate)
        summary_reports.append(r)

    return summary_reports
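
Note: `can_reuse` above is assumed to treat an output as reusable when it exists, is non-empty, and is at least as new as every input. A minimal stdlib sketch of that assumption (a hypothetical stand-in, not the project's actual helper):

import os

def can_reuse_sketch(output_fpath, input_fpaths):
    # Hypothetical stand-in for can_reuse(): reuse the output if it exists,
    # is non-empty, and is at least as new as every existing input file.
    if isinstance(input_fpaths, str):
        input_fpaths = [input_fpaths]
    if not os.path.isfile(output_fpath) or os.path.getsize(output_fpath) == 0:
        return False
    out_mtime = os.path.getmtime(output_fpath)
    return all(os.path.getmtime(fp) <= out_mtime
               for fp in input_fpaths if os.path.isfile(fp))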
Example #2
def combined_regional_reports(work_dir, output_dir, samples):
    if not any(verify_file(s.targqc_region_tsv, silent=True) for s in samples):
        return None

    tsv_region_rep_fpath = join(output_dir,
                                basename(samples[0].targqc_region_tsv))
    debug('Combining regional reports, writing to ' + tsv_region_rep_fpath)
    with file_transaction(work_dir, tsv_region_rep_fpath) as tx_tsv:
        with open(tx_tsv, 'w') as tsv_out:
            sample_i = 0
            for s in samples:
                if s.targqc_region_tsv and verify_file(s.targqc_region_tsv):
                    with open(s.targqc_region_tsv) as tsv_in:
                        for i, l in enumerate(tsv_in):
                            if i == 0:
                                if sample_i == 0:
                                    tsv_out.write('sample\t' + l)
                            else:
                                tsv_out.write(s.name + '\t' + l)
                    sample_i += 1

    return tsv_region_rep_fpath
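
The combining loop writes the header row once, prefixed with a `sample` column, and prepends the sample name to every data row. A self-contained illustration of that pattern (the sample names and rows are made up):

def combine_tsv_lines(lines_by_sample):
    # lines_by_sample: list of (sample_name, tsv_lines) pairs, where the
    # first line of each per-sample file is the header.
    out = []
    for sample_i, (name, lines) in enumerate(lines_by_sample):
        for i, l in enumerate(lines):
            if i == 0:
                if sample_i == 0:
                    out.append('sample\t' + l)  # header is written only once
            else:
                out.append(name + '\t' + l)
    return out

print(combine_tsv_lines([('s1', ['chrom\tstart\n', 'chr1\t100\n']),
                         ('s2', ['chrom\tstart\n', 'chr2\t200\n'])]))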
Example #3
def _correct_qualimap_insert_size_histogram(samples):
    """ replacing Qualimap insert size histogram with Picard one.
    """
    for s in samples:
        qualimap1_dirname = dirname(s.qualimap_ins_size_hist_fpath).replace('raw_data_qualimapReport', 'raw_data')
        qualimap2_dirname = dirname(s.qualimap_ins_size_hist_fpath)
        if exists(qualimap1_dirname):
            if not exists(qualimap2_dirname):
                shutil.move(qualimap1_dirname, qualimap2_dirname)
            else:
                shutil.rmtree(qualimap1_dirname)
        elif not exists(qualimap2_dirname):
            continue  # no data from either Qualimap v.1 or Qualimap v.2

        # if the qualimap histogram exists and reuse_intermediate is set, skip
        if verify_file(s.qualimap_ins_size_hist_fpath, silent=True) and cfg.reuse_intermediate:
            pass
        else:
            if verify_file(s.picard_ins_size_hist_txt_fpath):
                with open(s.picard_ins_size_hist_txt_fpath, 'r') as picard_f:
                    one_line_to_stop = False
                    for line in picard_f:
                        if one_line_to_stop:
                            break
                        if line.startswith('## HISTOGRAM'):
                            one_line_to_stop = True

                    with file_transaction(None, s.qualimap_ins_size_hist_fpath) as tx:
                        with open(tx, 'w') as qualimap_f:
                            for line in picard_f:
                                qualimap_f.write(line)
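
The first loop over `picard_f` stops one line past the `## HISTOGRAM` marker, so both the marker line and the column-header line after it are consumed before the remaining histogram rows are copied. The same effect with `itertools.dropwhile`, as a sketch:

import itertools

def histogram_rows(lines, marker='## HISTOGRAM'):
    # Drop everything up to and including the marker line, skip one more
    # line (the column header), and return the remaining histogram rows.
    it = itertools.dropwhile(lambda l: not l.startswith(marker), lines)
    next(it, None)  # the '## HISTOGRAM ...' line itself
    next(it, None)  # the column-header line that follows it
    return list(it)

print(histogram_rows(['# comment\n', '## HISTOGRAM java.lang.Integer\n',
                      'insert_size\tcount\n', '100\t7\n', '101\t9\n']))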
Example #4
def read_biomart(genome_name):
    features_by_ens_id = dict()
    bm_fpath = ebl.biomart_fpath(genome_name)
    if not verify_file(bm_fpath):
        warn('Warning: biomart file for genome ' + genome_name +
             ' not found; skipping TSL values')
        return dict()

    with open(bm_fpath) as f:
        for r in csv.DictReader(f, delimiter='\t'):
            features_by_ens_id[r['Transcript ID']] = r

    # the hg38 biomart file has TSL values; use it to populate missing ones
    if not genome_name.startswith('hg38'):
        bm_fpath = ebl.biomart_fpath('hg38')
        if not verify_file(bm_fpath):
            critical(
                'Biomart file for hg38 not found; it is needed for TSL values')
        with open(bm_fpath) as f:
            for r in csv.DictReader(f, delimiter='\t'):
                if r['Transcript ID'] not in features_by_ens_id:
                    features_by_ens_id[r['Transcript ID']] = r
                else:
                    features_by_ens_id[r['Transcript ID']][
                        'Transcript Support Level (TSL)'] = r[
                            'Transcript Support Level (TSL)']
    return features_by_ens_id
Example #5
def sort_bed_gsort(input_bed_fpath,
                   output_bed_fpath=None,
                   work_dir=None,
                   fai_fpath=None,
                   genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        critical('Either fai_fpath or a genome build name must be specified')

    with file_transaction(work_dir, output_bed_fpath) as tx:
        run('gsort {input_bed_fpath} {fai_fpath}'.format(**locals()),
            output_fpath=tx)

    return output_bed_fpath
Example #6
def combined_regional_reports(work_dir, output_dir, samples):
    if not any(verify_file(s.targqc_region_tsv, silent=True) for s in samples):
        return None

    tsv_region_rep_fpath = join(output_dir, basename(samples[0].targqc_region_tsv))
    debug('Combining regional reports, writing to ' + tsv_region_rep_fpath)
    with file_transaction(work_dir, tsv_region_rep_fpath) as tx_tsv:
        with open(tx_tsv, 'w') as tsv_out:
            sample_i = 0
            for s in samples:
                if s.targqc_region_tsv and verify_file(s.targqc_region_tsv):
                    with open(s.targqc_region_tsv) as tsv_in:
                        for i, l in enumerate(tsv_in):
                            if i == 0:
                                if sample_i == 0:
                                    tsv_out.write('sample\t' + l)
                            else:
                                tsv_out.write(s.name + '\t' + l)
                    sample_i += 1

    return tsv_region_rep_fpath
Example #7
def make_tarqc_html_report(output_dir, work_dir, samples, bed_fpath=None, tag_by_sample=None):
    # header_storage = get_header_metric_storage(tc.depth_thresholds,
    #                                            is_wgs=bed_fpath is not None,
    #                                            padding=tc.padding)

    jsons_by_sample = {s.name: s.targqc_json_fpath for s in samples if verify_file(s.targqc_json_fpath)}
    # htmls_by_sample = {s.name: s.targqc_html_fpath for s in samples if verify_file(s.targqc_html_fpath)}
    htmls_by_sample = dict()

    if not jsons_by_sample:
        return None, None

    targqc_full_report = FullReport.construct_from_sample_report_jsons(samples, output_dir, jsons_by_sample, htmls_by_sample)

    for sample_report in targqc_full_report.sample_reports:
        if tag_by_sample:
            sample_report.set_project_tag(tag_by_sample[sample_report.sample.name])
        if verify_file(sample_report.sample.qualimap_html_fpath):
            url = relpath(sample_report.sample.qualimap_html_fpath, output_dir)
            r = sample_report.find_record(sample_report.records, 'Qualimap')
            if r:
                r.url = url
            else:
                sample_report.add_record(metric_name='Qualimap', value='Qualimap', url=url, silent=True)

    if len(samples) > 1:
        run_multisample_qualimap(output_dir, work_dir, samples, targqc_full_report)

    fn = splitext(basename(samples[0].targqc_txt_fpath))[0]
    tsv_fpath = targqc_full_report.save_tsv(join(output_dir, fn + '.tsv'))
    html_fpath = targqc_full_report.save_html(join(output_dir, fn + '.html'), 'TargQC')

    return tsv_fpath, html_fpath
Example #8
def _correct_qualimap_insert_size_histogram(work_dir, samples):
    """ replacing Qualimap insert size histogram with Picard one.
    """
    for s in samples:
        qualimap1_dirname = dirname(s.qualimap_ins_size_hist_fpath).replace(
            'raw_data_qualimapReport', 'raw_data')
        qualimap2_dirname = dirname(s.qualimap_ins_size_hist_fpath)
        if exists(qualimap1_dirname):
            if not exists(qualimap2_dirname):
                shutil.move(qualimap1_dirname, qualimap2_dirname)
            else:
                shutil.rmtree(qualimap1_dirname)
        elif not exists(qualimap2_dirname):
            continue  # no data from either Qualimap v.1 or Qualimap v.2

        # if the qualimap histogram exists and reuse_intermediate is set, skip
        if verify_file(s.qualimap_ins_size_hist_fpath,
                       silent=True) and tc.reuse_intermediate:
            pass
        else:
            if verify_file(s.picard_ins_size_hist_txt_fpath):
                with open(s.picard_ins_size_hist_txt_fpath, 'r') as picard_f:
                    one_line_to_stop = False
                    for line in picard_f:
                        if one_line_to_stop:
                            break
                        if line.startswith('## HISTOGRAM'):
                            one_line_to_stop = True

                    with file_transaction(
                            work_dir, s.qualimap_ins_size_hist_fpath) as tx:
                        with open(tx, 'w') as qualimap_f:
                            for line in picard_f:
                                qualimap_f.write(line)
Example #9
def load_yaml_config(fpath):
    verify_file(fpath, is_critical=True)
    try:
        dic = load_yaml(open(fpath))
    except Exception:
        err(format_exc())
        critical('Could not parse bcbio YAML ' + fpath)
    else:
        return dic
Example #10
def sort_bed(input_bed_fpath,
             output_bed_fpath=None,
             work_dir=None,
             fai_fpath=None,
             chr_order=None,
             genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    regions = []

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical(
                'One of chr_order, fai_fpath, or genome build name must be specified'
            )
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    with open(input_bed_fpath) as f:
        with file_transaction(work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for l in f:
                    if not l.strip():
                        continue
                    if l.strip().startswith('#'):
                        out.write(l)
                        continue

                    fs = l.strip().split('\t')
                    chrom = fs[0]
                    start = int(fs[1])
                    end = int(fs[2])
                    other_fields = fs[3:]
                    order = chr_order.get(chrom, -1)
                    regions.append(
                        Region(chrom, start, end, other_fields, order))

                for region in sorted(regions, key=lambda r: r.get_key()):
                    fs = [region.chrom, str(region.start), str(region.end)]
                    fs.extend(region.other_fields)
                    out.write('\t'.join(fs) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' +
          output_bed_fpath)
    return output_bed_fpath
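
`get_chrom_order` and `Region.get_key` are not shown here; the sketch below states the assumptions this sort relies on: chromosomes are ranked by their line order in the `.fai` file, unknown chromosomes get rank -1 (so they sort first), and regions compare by (rank, start, end):

def get_chrom_order_sketch(fai_fpath):
    # Assumed behavior of get_chrom_order(): chromosome name -> 0-based
    # index of its line in the .fai index.
    with open(fai_fpath) as f:
        return {l.split('\t')[0]: i for i, l in enumerate(f) if l.strip()}

def region_key_sketch(chrom_order, chrom, start, end):
    # Assumed equivalent of Region.get_key(): rank by chromosome order,
    # then by start and end coordinates.
    return (chrom_order.get(chrom, -1), start, end)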
Example #11
def run(cmd, output_fpath=None, input_fpath=None, checks=None, stdout_to_outputfile=True,
        stdout_tx=True, reuse=False, env_vars=None):
    """Run the provided command, logging details and checking for errors.
    """
    if output_fpath and reuse:
        if verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            return output_fpath
        if not output_fpath.endswith('.gz') and verify_file(output_fpath + '.gz', silent=True):
            info(output_fpath + '.gz exists, reusing')
            return output_fpath

    env = os.environ.copy()
    if env_vars:
        for k, v in env_vars.items():
            if v is None:
                if k in env:
                    del env[k]
            else:
                env[k] = v

    if checks is None:
        checks = [file_nonempty_check]

    def _try_run(_cmd, _output_fpath, _input_fpath):
        info(' '.join(str(x) for x in _cmd) if not isinstance(_cmd, six.string_types) else _cmd)
        _do_run(_cmd, checks, env, _output_fpath, _input_fpath)

    if output_fpath:
        if isfile(output_fpath):
            os.remove(output_fpath)
        if stdout_tx:
            with file_transaction(None, output_fpath) as tx_out_file:
                if stdout_to_outputfile:
                    cmd += ' > ' + tx_out_file
                else:
                    cmd += '\n'
                    cmd = cmd.replace(' ' + output_fpath + ' ', ' ' + tx_out_file + ' ') \
                             .replace(' "' + output_fpath + '" ', ' "' + tx_out_file + '" ') \
                             .replace(' \'' + output_fpath + '\' ', ' \'' + tx_out_file + '\' ') \
                             .replace(' ' + output_fpath + '\n', ' ' + tx_out_file) \
                             .replace(' "' + output_fpath + '"\n', ' "' + tx_out_file + '"') \
                             .replace(' \'' + output_fpath + '\'\n', ' \'' + tx_out_file + '\'') \
                             .replace('\n', '')
                _try_run(cmd, tx_out_file, input_fpath)
        else:
            _try_run(cmd, output_fpath, input_fpath)

    else:
        _try_run(cmd, None, input_fpath)
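
The `env_vars` convention above is worth calling out: a value of `None` deletes the variable instead of setting it, which is how callers elsewhere in these examples disable X11 with `env_vars=dict(DISPLAY=None)`. The handling in isolation:

import os

def build_env_sketch(env_vars=None):
    # Mirrors the env handling in run(): copy the current environment,
    # delete keys mapped to None, and set everything else.
    env = os.environ.copy()
    for k, v in (env_vars or {}).items():
        if v is None:
            env.pop(k, None)
        else:
            env[k] = v
    return env

assert 'DISPLAY' not in build_env_sketch({'DISPLAY': None})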
Example #12
    def _make_padded_bed(self, work_dir, fai_fpath, padding):
        if self.is_wgs:
            return None

        self.padded_bed_fpath = intermediate_fname(work_dir, self.capture_bed_fpath, 'padded')
        if can_reuse(self.padded_bed_fpath, self.capture_bed_fpath):
            return BedTool(self.padded_bed_fpath)

        padded_bed = self.bed.slop(b=padding, g=fai_fpath).sort().merge()
        with file_transaction(work_dir, self.padded_bed_fpath) as tx:
            padded_bed.saveas(tx)
        verify_file(self.padded_bed_fpath, is_critical=True)
        return BedTool(self.padded_bed_fpath)
Example #13
    def _make_padded_bed(self, work_dir, fai_fpath, padding):
        if self.is_wgs:
            return None

        self.padded_bed_fpath = intermediate_fname(work_dir,
                                                   self.capture_bed_fpath,
                                                   'padded')
        if can_reuse(self.padded_bed_fpath, self.capture_bed_fpath):
            return BedTool(self.padded_bed_fpath)

        padded_bed = self.bed.slop(b=padding, g=fai_fpath).sort().merge()
        with file_transaction(work_dir, self.padded_bed_fpath) as tx:
            padded_bed.saveas(tx)
        verify_file(self.padded_bed_fpath, is_critical=True)
        return BedTool(self.padded_bed_fpath)
Example #14
def _correct_qualimap_genome_results(samples):
    """ fixing java.lang.Double.parseDouble error on entries like "6,082.49"
    """
    for s in samples:
        if verify_file(s.qualimap_genome_results_fpath):
            correction_is_needed = False
            with open(s.qualimap_genome_results_fpath, 'r') as f:
                content = f.readlines()
                metrics_started = False
                for line in content:
                    if ">> Reference" in line:
                        metrics_started = True
                    if metrics_started:
                        if line.find(',') != -1:
                            correction_is_needed = True
                            break
            if correction_is_needed:
                with open(s.qualimap_genome_results_fpath, 'w') as f:
                    metrics_started = False
                    for line in content:
                        if ">> Reference" in line:
                            metrics_started = True
                        if metrics_started:
                            if line.find(',') != -1:
                                line = line.replace(',', '')
                        f.write(line)
Example #15
def get_chrom_lengths(genome=None, fai_fpath=None):
    assert genome or fai_fpath, 'One of genome or fai_fpath should be not None: ' \
                                'genome=' + str(genome) + ' fai_fpath=' + str(fai_fpath)

    if not fai_fpath:
        check_genome(genome)
        fai_fpath = get_fai(genome)
    else:
        fai_fpath = verify_file(fai_fpath, is_critical=True)
        if not fai_fpath.endswith('.fai') and not fai_fpath.endswith('.fa'):
            critical('Error: only .fai or .fa files are accepted.')

    chr_lengths = []

    if fai_fpath.endswith('.fa'):
        debug('Reading genome sequence (.fa) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            from Bio import SeqIO
            reference_records = SeqIO.parse(handle, 'fasta')
            for record in reference_records:
                chrom = record.id
                chr_lengths.append((chrom, len(record.seq)))

    else:
        debug('Reading genome index file (.fai) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            for line in handle:
                line = line.strip()
                if line:
                    chrom, length = line.split()[0], int(line.split()[1])
                    chr_lengths.append((chrom, length))

    return chr_lengths
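
The `.fai` branch only needs the first two whitespace-separated fields of each line (sequence name and length); any further index columns are ignored. Reduced to its core:

def parse_fai_sketch(fai_fpath):
    # Minimal version of the .fai branch above: (chrom, length) pairs from
    # the first two fields of each non-empty line.
    with open(fai_fpath) as f:
        return [(l.split()[0], int(l.split()[1])) for l in f if l.strip()]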
Example #16
def read_samples(args):
    bam_by_sample = find_bams(args)
    if bam_by_sample:
        info('Found ' + str(len(bam_by_sample)) + ' BAM file' +
             ('s' if len(bam_by_sample) > 1 else ''))

    input_not_bam = [
        verify_file(fpath) for fpath in args
        if adjust_path(fpath) not in bam_by_sample
    ]
    input_not_bam = [fpath for fpath in input_not_bam if fpath]
    fastqs_by_sample = dict()
    if not input_not_bam and not bam_by_sample:
        critical('No correct input files')
    if input_not_bam:
        info('Found ' + str(len(input_not_bam)) +
             ' correct non-BAM input files')
        fastqs_by_sample = find_fastq_pairs(input_not_bam)
        if fastqs_by_sample:
            info('Found ' + str(len(fastqs_by_sample)) + ' FastQ pairs')
        intersection = set(fastqs_by_sample.keys()) & set(bam_by_sample.keys())
        if intersection:
            critical('The following samples both had input BAMs and FastQ: ' +
                     ', '.join(list(intersection)))

    return fastqs_by_sample, bam_by_sample
Example #17
def make_tarqc_html_report(output_dir,
                           work_dir,
                           samples,
                           bed_fpath=None,
                           tag_by_sample=None):
    # header_storage = get_header_metric_storage(tc.depth_thresholds,
    #                                            is_wgs=bed_fpath is not None,
    #                                            padding=tc.padding)

    jsons_by_sample = {
        s.name: s.targqc_json_fpath
        for s in samples if verify_file(s.targqc_json_fpath)
    }
    # htmls_by_sample = {s.name: s.targqc_html_fpath for s in samples if verify_file(s.targqc_html_fpath)}
    htmls_by_sample = dict()

    if not jsons_by_sample:
        return None, None

    targqc_full_report = FullReport.construct_from_sample_report_jsons(
        samples, output_dir, jsons_by_sample, htmls_by_sample)

    for sample_report in targqc_full_report.sample_reports:
        if tag_by_sample:
            sample_report.set_project_tag(
                tag_by_sample[sample_report.sample.name])
        if verify_file(sample_report.sample.qualimap_html_fpath):
            url = relpath(sample_report.sample.qualimap_html_fpath, output_dir)
            r = sample_report.find_record(sample_report.records, 'Qualimap')
            if r:
                r.url = url
            else:
                sample_report.add_record(metric_name='Qualimap',
                                         value='Qualimap',
                                         url=url,
                                         silent=True)

    if len(samples) > 1:
        run_multisample_qualimap(output_dir, work_dir, samples,
                                 targqc_full_report)

    fn = splitext(basename(samples[0].targqc_txt_fpath))[0]
    tsv_fpath = targqc_full_report.save_tsv(join(output_dir, fn + '.tsv'))
    html_fpath = targqc_full_report.save_html(join(output_dir, fn + '.html'),
                                              'TargQC')

    return tsv_fpath, html_fpath
Example #18
def _check_dir_not_empty(dirpath, description=None):
    assert verify_dir(dirpath, description=description), dirpath
    contents = [join(dirpath, fname) for fname in os.listdir(dirpath)
                if not fname.startswith('.')]
    assert len(contents) >= 1, dirpath + ': ' + str(contents)
    assert all(verify_file(realpath(fpath), is_critical=True)
               for fpath in contents
               if isfile(realpath(fpath))), dirpath + ': ' + str(contents)
Example #19
    def _make_qualimap_bed(self, work_dir):
        if self.is_wgs:
            return None

        self.qualimap_bed_fpath = intermediate_fname(work_dir, self.capture_bed_fpath, 'qualimap_ready')
        if can_reuse(self.qualimap_bed_fpath, self.capture_bed_fpath):
            return self.qualimap_bed_fpath

        debug('Merging and saving BED into required bed6 format for Qualimap')
        bed = self.bed.sort().merge()
        with file_transaction(work_dir, self.qualimap_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for i, region in enumerate(bed):
                    region = [x for x in list(region) if x]
                    fillers = [str(i), "1.0", "+"]
                    full = region + fillers[:6 - len(region)]
                    out.write("\t".join(full) + "\n")
        verify_file(self.qualimap_bed_fpath, is_critical=True)
        return self.qualimap_bed_fpath
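
Qualimap's `-gff` option is fed a 6-column BED (chrom, start, end, name, score, strand) here, so shorter records are padded from the filler list [row index, "1.0", "+"], taking only as many entries as are missing. The filler logic standalone:

def to_bed6_sketch(region_fields, i):
    # Pad a 3-5 column BED record out to the bed6 format expected by
    # Qualimap, appending only as many fillers as are missing.
    region = [x for x in region_fields if x]
    fillers = [str(i), "1.0", "+"]
    return region + fillers[:6 - len(region)]

print(to_bed6_sketch(['chr1', '100', '200'], 0))           # -> index as name, "1.0", "+"
print(to_bed6_sketch(['chr1', '100', '200', 'exon1'], 1))  # -> "1" and "1.0" appended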
Example #20
def run_qualimap(work_dir,
                 output_dir,
                 output_fpaths,
                 bam_fpath,
                 genome,
                 bed_fpath=None,
                 threads=1):
    info('Analysing ' + bam_fpath)

    safe_mkdir(dirname(output_dir))
    safe_mkdir(output_dir)

    mem_m = get_qualimap_max_mem(bam_fpath)
    mem = str(int(mem_m)) + 'M'
    mem_cmdl = '--java-mem-size=' + mem

    cmdline = (find_executable() +
               ' bamqc --skip-duplicated -nt {threads} {mem_cmdl} -nr 5000 '
               '-bam {bam_fpath} -outdir {output_dir}')

    if genome.startswith('hg') or genome.startswith('GRCh'):
        cmdline += ' -gd HUMAN'
    if genome.startswith('mm'):
        cmdline += ' -gd MOUSE'

    if bed_fpath:
        cmdline += ' -gff {bed_fpath}'
        debug('Using amplicons/capture panel ' + bed_fpath)

    cmdline = cmdline.format(**locals())
    if not all(
            can_reuse(fp, [bam_fpath, bed_fpath] if bed_fpath else [bam_fpath])
            for fp in output_fpaths):
        for fp in output_fpaths:
            if isfile(fp):
                os.remove(fp)
        try:
            run(cmdline, env_vars=dict(DISPLAY=None))
        except subprocess.CalledProcessError as e:
            if e.output and 'The alignment file is unsorted.' in e.output:
                info()
                info('BAM file is unsorted; trying to sort and rerun QualiMap')
                sorted_bam_fpath = sort_bam(bam_fpath)
                cmdline = cmdline.replace(bam_fpath, sorted_bam_fpath)
                run(cmdline, env_vars=dict(DISPLAY=None))

    if not all(
            verify_file(
                fp, cmp_f=[bam_fpath, bed_fpath] if bed_fpath else [bam_fpath])
            for fp in output_fpaths):
        critical('Some of the QualiMap results were not generated')

    return output_dir
Example #21
    def _make_qualimap_bed(self, work_dir):
        if self.is_wgs:
            return None

        self.qualimap_bed_fpath = intermediate_fname(work_dir,
                                                     self.capture_bed_fpath,
                                                     'qualimap_ready')
        if can_reuse(self.qualimap_bed_fpath, self.capture_bed_fpath):
            return self.qualimap_bed_fpath

        debug('Merging and saving BED into required bed6 format for Qualimap')
        bed = self.bed.sort().merge()
        with file_transaction(work_dir, self.qualimap_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for i, region in enumerate(bed):
                    region = [x for x in list(region) if x]
                    fillers = [str(i), "1.0", "+"]
                    full = region + fillers[:6 - len(region)]
                    out.write("\t".join(full) + "\n")
        verify_file(self.qualimap_bed_fpath, is_critical=True)
        return self.qualimap_bed_fpath
Example #22
def merge_overlaps(work_dir, bed_fpath, distance=None):
    """Merge bed file intervals to avoid overlapping regions.
    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like FreeBayes
    that don't collapse BEDs prior to using them.
    """
    output_fpath = intermediate_fname(work_dir, bed_fpath, 'merged')
    if isfile(output_fpath) and verify_file(output_fpath, cmp_f=bed_fpath):
        return output_fpath

    with file_transaction(work_dir, output_fpath) as tx:
        kwargs = dict(d=distance) if distance else dict()
        BedTool(bed_fpath).merge(**kwargs).saveas(tx)
    return output_fpath
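
`file_transaction`, used throughout these examples, is assumed to yield a temporary path and move it into place only when the `with` block succeeds, so partially written outputs never land at the final path. A minimal stdlib sketch under that assumption (not the project's actual implementation):

import os
import shutil
import tempfile
from contextlib import contextmanager

@contextmanager
def file_transaction_sketch(work_dir, final_fpath):
    # Hypothetical stand-in: write to a temp file, move it to final_fpath
    # on success, and clean it up on failure.
    fd, tx_fpath = tempfile.mkstemp(dir=work_dir or None)
    os.close(fd)
    try:
        yield tx_fpath
        shutil.move(tx_fpath, final_fpath)
    except Exception:
        if os.path.exists(tx_fpath):
            os.remove(tx_fpath)
        raise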
Example #23
def read_biomart(genome_name):
    features_by_ens_id = dict()
    bm_fpath = ebl.biomart_fpath(genome_name)
    if not verify_file(bm_fpath):
        warn('Warning: biomart file for genome ' + genome_name + ' not found; skipping TSL values')
        return dict()
    
    with open(bm_fpath) as f:
        for r in csv.DictReader(f, delimiter='\t'):
            features_by_ens_id[r['Transcript ID']] = r
    
    # the hg38 biomart file has TSL values; use it to populate missing ones
    if not genome_name.startswith('hg38'):
        bm_fpath = ebl.biomart_fpath('hg38')
        if not verify_file(bm_fpath):
            critical('Biomart file for hg38 not found; it is needed for TSL values')
        with open(bm_fpath) as f:
            for r in csv.DictReader(f, delimiter='\t'):
                if r['Transcript ID'] not in features_by_ens_id:
                    features_by_ens_id[r['Transcript ID']] = r
                else:
                    features_by_ens_id[r['Transcript ID']]['Transcript Support Level (TSL)'] = r[
                        'Transcript Support Level (TSL)']
    return features_by_ens_id
Example #24
def bam_to_bed_nocnf(bam_fpath, bedtools='bedtools', gzip='gzip'):
    # from: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    info('Converting the BAM to BED to save some memory.')
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + '.bed.gz'
    cmdline = '{bedtools} bamtobed -i {bam_fpath} | {gzip} > {bam_bed_fpath}'.format(
        **locals())
    info(cmdline)
    os.system(cmdline)
    bam_bed_fpath = verify_file(bam_bed_fpath)
    if bam_bed_fpath:
        info('Done, saved to ' + bam_bed_fpath)
    else:
        err('Error, result is non-existent or empty')
    return bam_bed_fpath
Example #25
def verify_bam(fpath, description='', is_critical=False, silent=False):
    if not verify_file(fpath, description, is_critical=is_critical, silent=silent):
        return None

    fpath = adjust_path(fpath)

    logfn = critical if is_critical else err
    if not fpath.endswith('.bam'):
        logfn('The file ' + fpath + ' is supposed to be BAM but does not have the .bam '
            'extension. Please make sure you pass a proper file.')
        return None

    # TODO: check if binary

    return fpath
Example #26
def get_canonical_transcripts_ids(genome):
    short_genome = genome.split('-')[0]
    if short_genome.startswith('GRCh37'):
        short_genome = 'hg19'
    if short_genome.startswith('GRCh38'):
        short_genome = 'hg38'
    check_genome(short_genome)

    canon_fpath = _get(join('{genome}', 'canon_transcripts_{genome}_ensembl.txt'), genome)
    replacement_fpath = _get('canon_cancer_replacement.txt')

    canon_fpath = verify_file(canon_fpath, description='Canonical transcripts path')
    replacement_fpath = verify_file(replacement_fpath, description='Canonical cancer transcripts replacement path')

    if not canon_fpath:
        return None
    with open(canon_fpath) as f:
        canon_tx_by_gname = dict(l.strip('\n').split('\t') for l in f)
    if replacement_fpath:
        with open(replacement_fpath) as f:
            for gname, tx_id in (l.strip('\n').split('\t') for l in f):
                canon_tx_by_gname[gname] = tx_id

    return canon_tx_by_gname
Example #27
def run_multisample_qualimap(output_dir, work_dir, samples, targqc_full_report):
    """ 1. Generates Qualimap2 plots and put into plots_dirpath
        2. Adds records to targqc_full_report.plots
    """
    plots_dirpath = join(output_dir, 'plots')
    individual_report_fpaths = [s.qualimap_html_fpath for s in samples]
    if isdir(plots_dirpath) and all(
            can_reuse(join(plots_dirpath, f), individual_report_fpaths)
            for f in listdir(plots_dirpath) if not f.startswith('.')):
        debug('Qualimap multisample plots exist - ' + plots_dirpath + ', reusing...')
    else:
        # Qualimap2 run for multi-sample plots
        if len([s.qualimap_html_fpath for s in samples if s.qualimap_html_fpath]) > 0:
            if find_executable() is not None:  # and get_qualimap_type(find_executable()) == 'full':
                qualimap_output_dir = join(work_dir, 'qualimap_multi_bamqc')

                _correct_qualimap_genome_results(samples)
                _correct_qualimap_insert_size_histogram(samples)

                safe_mkdir(qualimap_output_dir)
                rows = []
                for sample in samples:
                    if sample.qualimap_html_fpath:
                        rows += [[sample.name, sample.qualimap_html_fpath]]

                data_fpath = write_tsv_rows(([], rows), join(qualimap_output_dir, 'qualimap_results_by_sample.tsv'))
                qualimap_plots_dirpath = join(qualimap_output_dir, 'images_multisampleBamQcReport')
                cmdline = find_executable() + ' multi-bamqc --data {data_fpath} -outdir {qualimap_output_dir}'.format(**locals())
                run(cmdline, env_vars=dict(DISPLAY=None),
                    checks=[lambda _1, _2: verify_dir(qualimap_output_dir)], reuse=cfg.reuse_intermediate)

                if not verify_dir(qualimap_plots_dirpath):
                    warn('Warning: Qualimap for multi-sample analysis failed to finish. TargQC will not contain plots.')
                    return None
                else:
                    if exists(plots_dirpath):
                        shutil.rmtree(plots_dirpath)
                    shutil.move(qualimap_plots_dirpath, plots_dirpath)
            else:
                warn('Warning: Qualimap for multi-sample analysis was not found. TargQC will not contain plots.')
                return None

    targqc_full_report.plots = []
    for plot_fpath in listdir(plots_dirpath):
        plot_fpath = join(plots_dirpath, plot_fpath)
        if verify_file(plot_fpath) and plot_fpath.endswith('.png'):
            targqc_full_report.plots.append(relpath(plot_fpath, output_dir))
Example #28
def verify_bed(bed, description='', is_critical=False, silent=False):
    if isinstance(bed, BedTool):
        return bed

    fpath = adjust_path(bed)
    if not verify_file(
            fpath, description, is_critical=is_critical, silent=silent):
        return None

    error = BedFile(fpath).checkformat()
    if error:
        fn = critical if is_critical else err
        fn('Error: incorrect bed file format (' + fpath + '): ' + str(error) +
           '\n')
        return None

    return fpath
Example #29
def verify_bam(fpath, description='', is_critical=False, silent=False):
    if not verify_file(
            fpath, description, is_critical=is_critical, silent=silent):
        return None

    fpath = adjust_path(fpath)

    logfn = critical if is_critical else err
    if not fpath.endswith('.bam'):
        logfn('The file ' + fpath +
              ' is supposed to be BAM but does not have the .bam '
              'extension. Please make sure you pass a proper file.')
        return None

    # TODO: check if binary

    return fpath
Example #30
def run_qualimap(work_dir, output_dir, output_fpaths, bam_fpath, genome, bed_fpath=None, threads=1):
    info('Analysing ' + bam_fpath)

    safe_mkdir(dirname(output_dir))
    safe_mkdir(output_dir)

    mem_m = get_qualimap_max_mem(bam_fpath)
    mem = str(int(mem_m)) + 'M'
    mem_cmdl = '--java-mem-size=' + mem

    cmdline = (find_executable() + ' bamqc --skip-duplicated -nt {threads} {mem_cmdl} -nr 5000 '
        '-bam {bam_fpath} -outdir {output_dir}')

    if genome.startswith('hg') or genome.startswith('GRCh'):
        cmdline += ' -gd HUMAN'
    if genome.startswith('mm'):
        cmdline += ' -gd MOUSE'

    if bed_fpath:
        cmdline += ' -gff {bed_fpath}'
        debug('Using amplicons/capture panel ' + bed_fpath)

    cmdline = cmdline.format(**locals())
    if not all(can_reuse(fp, [bam_fpath, bed_fpath] if bed_fpath else [bam_fpath]) for fp in output_fpaths):
        for fp in output_fpaths:
            if isfile(fp):
                os.remove(fp)
        try:
            run(cmdline, env_vars=dict(DISPLAY=None))
        except subprocess.CalledProcessError as e:
            if e.output and 'The alignment file is unsorted.' in e.output:
                info()
                info('BAM file is unsorted; trying to sort and rerun QualiMap')
                sorted_bam_fpath = sort_bam(bam_fpath)
                cmdline = cmdline.replace(bam_fpath, sorted_bam_fpath)
                run(cmdline, env_vars=dict(DISPLAY=None))

    if not all(verify_file(fp, cmp_f=[bam_fpath, bed_fpath] if bed_fpath else [bam_fpath]) for fp in output_fpaths):
        critical('Some of the QualiMap results were not generated')

    return output_dir
Example #31
    def _make_target_bed(self, bed_fpath, work_dir, output_dir, is_debug,
                         padding=None, fai_fpath=None, genome=None, reannotate=False):
        clean_target_bed_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
        if not can_reuse(clean_target_bed_fpath, bed_fpath):
            debug()
            debug('Cleaning target BED file...')
            bed = BedTool(bed_fpath)
            if bed.field_count() > 4:
                bed = bed.cut(range(4))
            bed = bed\
                .filter(lambda x: x.chrom and not any(x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))\
                .remove_invalid()
            with file_transaction(work_dir, clean_target_bed_fpath) as tx:
                bed.saveas(tx)
            debug('Saved to ' + clean_target_bed_fpath)
            verify_file(clean_target_bed_fpath, is_critical=True)

        sort_target_bed_fpath = intermediate_fname(work_dir, clean_target_bed_fpath, 'sorted')
        if not can_reuse(sort_target_bed_fpath, clean_target_bed_fpath):
            debug()
            debug('Sorting target BED file...')
            sort_target_bed_fpath = sort_bed(clean_target_bed_fpath, output_bed_fpath=sort_target_bed_fpath, fai_fpath=fai_fpath)
            debug('Saved to ' + sort_target_bed_fpath)
            verify_file(sort_target_bed_fpath, is_critical=True)

        if genome in ebl.SUPPORTED_GENOMES:
            ann_target_bed_fpath = intermediate_fname(work_dir, sort_target_bed_fpath, 'ann_plus_features')
            if not can_reuse(ann_target_bed_fpath, sort_target_bed_fpath):
                debug()
                if BedTool(sort_target_bed_fpath).field_count() == 3 or reannotate:
                    debug('Annotating target BED file and collecting overlapping genome features')
                    overlap_with_features(sort_target_bed_fpath, ann_target_bed_fpath, work_dir=work_dir,
                         genome=genome, extended=True, reannotate=reannotate, only_canonical=True)
                else:
                    debug('Overlapping with genomic features:')
                    overlap_with_features(sort_target_bed_fpath, ann_target_bed_fpath, work_dir=work_dir,
                         genome=genome, extended=True, only_canonical=True)
                debug('Saved to ' + ann_target_bed_fpath)
                verify_file(ann_target_bed_fpath, is_critical=True)
        else:
            ann_target_bed_fpath = sort_target_bed_fpath

        final_clean_target_bed_fpath = intermediate_fname(work_dir, ann_target_bed_fpath, 'clean')
        if not can_reuse(final_clean_target_bed_fpath, ann_target_bed_fpath):
            bed = BedTool(ann_target_bed_fpath).remove_invalid()
            with file_transaction(work_dir, final_clean_target_bed_fpath) as tx:
                bed.saveas(tx)
            verify_file(final_clean_target_bed_fpath, is_critical=True)

        self.bed_fpath = final_clean_target_bed_fpath
        self.bed = BedTool(self.bed_fpath)
        
        self.capture_bed_fpath = add_suffix(join(output_dir, basename(bed_fpath)), 'clean_sorted_ann')
        if not can_reuse(self.capture_bed_fpath, self.bed_fpath):
            with file_transaction(work_dir, self.capture_bed_fpath) as tx:
                self.get_capture_bed().saveas(tx)

        gene_key_set, gene_key_list = get_genes_from_bed(bed_fpath)
        self.gene_keys_set = gene_key_set
        self.gene_keys_list = gene_key_list
        self.regions_num = self.get_capture_bed().count()

        self._make_qualimap_bed(work_dir)
        if padding:
            self._make_padded_bed(work_dir, fai_fpath, padding)
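
The cleaning step keeps at most the first four BED columns and drops comment, track and browser lines. The filter predicate in isolation:

def is_region_line_sketch(chrom):
    # Same test as the .filter() above: keep lines with a real chromosome,
    # drop comments and track/browser headers.
    return bool(chrom) and not any(
        chrom.startswith(e) for e in ['#', ' ', 'track', 'browser'])

print(is_region_line_sketch('chr1'))          # True
print(is_region_line_sketch('track name=x'))  # False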
Example #32
def _save_best_details_for_each_gene(depth_threshs, samples, output_dir):
    metric_storage = get_detailed_metric_storage(depth_threshs)

    report = PerRegionSampleReport(sample='Best', metric_storage=metric_storage)
    report.add_record('Sample', 'contains best values from all samples: ' + ', '.join([s.name for s in samples]))

    total_regions = 0
    fpaths = [s.targqc_region_tsv for s in samples if verify_file(s.targqc_region_tsv)]
    if not fpaths:
        err('No targetcov detailed per-gene report was generated; skipping.')
        return None

    open_tsv_files = [open(fpath) for fpath in fpaths]

    first_col = 0
    while True:
        lines_for_each_sample = [next(f, None) for f in open_tsv_files]
        if not all(lines_for_each_sample):
            break
        l = lines_for_each_sample[0]
        if l.startswith('##'):
            continue
        elif l.startswith('#'):
            if l.startswith('#Sample'):
                first_col = 1
            break

    while True:
        lines_for_each_sample = [next(f, None) for f in open_tsv_files]
        if not all(lines_for_each_sample):
            break

        if all([not l.startswith('#') and ('Whole-Gene' in l or 'Gene-Exon' in l) for l in lines_for_each_sample]):
            shared_fields = lines_for_each_sample[0].split('\t')[first_col:first_col+9]
            reg = report.add_row()
            reg.add_record('Chr', get_val(shared_fields[0]))
            reg.add_record('Start', get_int_val(shared_fields[1]))
            reg.add_record('End', get_int_val(shared_fields[2]))
            reg.add_record('Size', get_int_val(shared_fields[3]))
            reg.add_record('Gene', get_val(shared_fields[4]))
            reg.add_record('Strand', get_val(shared_fields[5]))
            reg.add_record('Feature', get_val(shared_fields[6]))
            reg.add_record('Biotype', get_val(shared_fields[7]))
            reg.add_record('Transcript', get_val(shared_fields[8]))

            min_depths, ave_depths, stddevs, withins = ([], [], [], [])
            percents_by_threshs = {t: [] for t in depth_threshs}

            for l in lines_for_each_sample:
                fs = l.split('\t')

                min_depths.append(get_int_val(fs[first_col+9]))
                ave_depths.append(get_float_val(fs[first_col+10]))
                stddevs.append(get_float_val(fs[first_col+11]))
                withins.append(get_float_val(fs[first_col+12]))
                for t, f in zip(depth_threshs, fs[first_col+13:]):
                    percents_by_threshs[t].append(get_float_val(f))

            # counting bests
            reg.add_record('Min depth', select_best(min_depths))
            reg.add_record('Ave depth', select_best(ave_depths))
            reg.add_record('Std dev', select_best(stddevs, max))
            reg.add_record('W/n 20% of median depth', select_best(withins))
            for t in depth_threshs:
                reg.add_record('{}x'.format(t), select_best(percents_by_threshs[t]))

            total_regions += 1

    for f in open_tsv_files:
        f.close()

    gene_report_basename = add_suffix(basename(samples[0].targqc_region_tsv), 'best')
    txt_rep_fpath = report.save_txt(join(output_dir, gene_report_basename + '.txt'))
    tsv_rep_fpath = report.save_tsv(join(output_dir, gene_report_basename + '.tsv'))
    info('')
    info('Best values for the regions (total ' + str(total_regions) + ') saved into:')
    info('  ' + txt_rep_fpath)

    return txt_rep_fpath
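
`select_best` is not shown here; from the call sites it is assumed to pick the best non-None value with an optional aggregator (defaulting to `max`, and also passed `max` explicitly for the standard deviation row). A sketch of that assumption:

def select_best_sketch(values, f=max):
    # Assumed behavior: best non-None value according to f, or None if
    # every sample was missing a value.
    vs = [v for v in values if v is not None]
    return f(vs) if vs else None

print(select_best_sketch([12, None, 30]))  # 30
print(select_best_sketch([None, None]))    # None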
Example #33
def main():
    description = '''
Usage:
    ''' + __file__ + ''' hg19 [db.gtf]
'''

    options = [
        (['--debug'], dict(dest='debug', action='store_true', default=False)),
    ]
    parser = OptionParser(description=description)
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()
    if len(args) == 0:
        parser.exit(1, 'Please provide genome name as the first argument')
    logger.is_debug = opts.debug

    genome_name = args[0]

    if len(args) > 1:
        gtf_fpath = args[1]
    else:
        gtf_fpath = ebl.ensembl_gtf_fpath(genome_name)
    if not isfile(gtf_fpath):
        if not gtf_fpath.endswith('.gz'):
            gtf_fpath += '.gz'
    gtf_fpath = verify_file(gtf_fpath)
    debug('Reading the GTF database')
    db = gtf.get_gtf_db(gtf_fpath)

    debug('Reading biomart data')
    features_by_ens_id = read_biomart(genome_name)

    chroms = [c for c, l in ref.get_chrom_lengths(genome_name)]

    output_fpath = join(dirname(__file__), genome_name, 'ensembl.bed')
    unsorted_output_fpath = add_suffix(output_fpath, 'unsorted')
    debug('Processing features, writing to ' + unsorted_output_fpath)

    def _get(_rec, _key):
        val = _rec.attributes.get(_key)
        if val is None:
            return None
        assert len(val) == 1, (_key, str(val))
        return val[0]

    num_tx_not_in_biomart = 0
    num_tx_diff_gene_in_biomart = 0
    with open(unsorted_output_fpath, 'w') as out:
        out.write('\t'.join(ebl.BedCols.names[i]
                            for i in ebl.BedCols.cols[:-4]) + '\n')

        for rec in db.all_features(order_by=('seqid', 'start', 'end')):
            if rec.featuretype == 'gene': continue
            if rec.chrom not in chroms: continue
            if rec.end - rec.start < 0: continue

            tx_id = _get(rec, 'transcript_id')
            gname = _get(rec, 'gene_name')
            tx_biotype = _get(rec, 'transcript_biotype')
            if not tx_biotype: tx_biotype = _get(rec, 'gene_biotype')
            tsl = _get(rec, 'transcript_support_level')
            hugo_gene = None

            biomart_rec = features_by_ens_id.get(tx_id)
            if not biomart_rec:
                if rec.featuretype == 'transcript':
                    num_tx_not_in_biomart += 1
            else:
                bm_gname = biomart_rec['Associated Gene Name']
                bm_tx_biotype = biomart_rec['Transcript type']
                bm_tsl = biomart_rec.get('Transcript Support Level (TSL)')
                hugo_gene = biomart_rec['HGNC symbol']
                if bm_gname != gname:
                    if rec.featuretype == 'transcript':
                        num_tx_diff_gene_in_biomart += 1
                    continue
                tx_biotype = bm_tx_biotype
                tsl = bm_tsl.split()[0].replace('tsl', '') if bm_tsl else None

            fs = [None] * len(ebl.BedCols.cols[:-3])
            if not rec.chrom.startswith('chr'):
                rec.chrom = 'chr' + rec.chrom.replace('MT', 'M')
            fs[:6] = [
                rec.chrom,
                str(rec.start - 1),
                str(rec.end), gname,
                rec.attributes.get('exon_number', ['.'])[0], rec.strand
            ]
            fs[ebl.BedCols.FEATURE] = rec.featuretype or '.'
            fs[ebl.BedCols.BIOTYPE] = tx_biotype or '.'
            fs[ebl.BedCols.ENSEMBL_ID] = tx_id or '.'
            # fs[ebl.BedCols.REFSEQ_ID] = refseq_id or '.'
            # fs[ebl.BedCols.IS_CANONICAL] = 'canonical' if refseq_id in canonical_transcripts_ids else ''
            fs[ebl.BedCols.TSL] = tsl or '.'
            fs[ebl.BedCols.HUGO] = hugo_gene or '.'
            # fs[ebl.BedCols.names[ensembl.BedCols.GC]] = gc
            out.write('\t'.join(fs) + '\n')

    if num_tx_not_in_biomart:
        warn(str(num_tx_not_in_biomart) + ' transcripts not found in biomart')
    if num_tx_diff_gene_in_biomart:
        warn(
            str(num_tx_diff_gene_in_biomart) +
            ' transcripts have a different gene name in biomart')

    debug('Sorting results')
    sort_bed(unsorted_output_fpath,
             output_fpath,
             fai_fpath=ref.get_fai(genome_name),
             genome=genome_name)
    os.remove(unsorted_output_fpath)
    bgzip_and_tabix(output_fpath)
Example #34
def _save_best_details_for_each_gene(depth_threshs, samples, output_dir):
    metric_storage = get_detailed_metric_storage(depth_threshs)

    report = PerRegionSampleReport(sample='Best',
                                   metric_storage=metric_storage)
    report.add_record(
        'Sample', 'contains best values from all samples: ' +
        ', '.join([s.name for s in samples]))

    total_regions = 0
    fpaths = [
        s.targqc_region_tsv for s in samples
        if verify_file(s.targqc_region_tsv)
    ]
    if not fpaths:
        err('No targetcov detailed per-gene report was generated; skipping.')
        return None

    open_tsv_files = [open(fpath) for fpath in fpaths]

    first_col = 0
    while True:
        lines_for_each_sample = [next(f, None) for f in open_tsv_files]
        if not all(lines_for_each_sample):
            break
        l = lines_for_each_sample[0]
        if l.startswith('##'):
            continue
        elif l.startswith('#'):
            if l.startswith('#Sample'):
                first_col = 1
            break

    while True:
        lines_for_each_sample = [next(f, None) for f in open_tsv_files]
        if not all(lines_for_each_sample):
            break

        if all([
                not l.startswith('#')
                and ('Whole-Gene' in l or 'Gene-Exon' in l)
                for l in lines_for_each_sample
        ]):
            shared_fields = lines_for_each_sample[0].split(
                '\t')[first_col:first_col + 9]
            reg = report.add_row()
            reg.add_record('Chr', get_val(shared_fields[0]))
            reg.add_record('Start', get_int_val(shared_fields[1]))
            reg.add_record('End', get_int_val(shared_fields[2]))
            reg.add_record('Size', get_int_val(shared_fields[3]))
            reg.add_record('Gene', get_val(shared_fields[4]))
            reg.add_record('Strand', get_val(shared_fields[5]))
            reg.add_record('Feature', get_val(shared_fields[6]))
            reg.add_record('Biotype', get_val(shared_fields[7]))
            reg.add_record('Transcript', get_val(shared_fields[8]))

            min_depths, ave_depths, stddevs, withins = ([], [], [], [])
            percents_by_threshs = {t: [] for t in depth_threshs}

            for l in lines_for_each_sample:
                fs = l.split('\t')

                min_depths.append(get_int_val(fs[first_col + 9]))
                ave_depths.append(get_float_val(fs[first_col + 10]))
                stddevs.append(get_float_val(fs[first_col + 11]))
                withins.append(get_float_val(fs[first_col + 12]))
                for t, f in zip(depth_threshs, fs[first_col + 13:]):
                    percents_by_threshs[t].append(get_float_val(f))

            # counting bests
            reg.add_record('Min depth', select_best(min_depths))
            reg.add_record('Ave depth', select_best(ave_depths))
            reg.add_record('Std dev', select_best(stddevs, max))
            reg.add_record('W/n 20% of median depth', select_best(withins))
            for t in depth_threshs:
                reg.add_record('{}x'.format(t),
                               select_best(percents_by_threshs[t]))

            total_regions += 1

    for f in open_tsv_files:
        f.close()

    gene_report_basename = add_suffix(
        basename(samples[0].targqc_region_tsv), 'best')
    txt_rep_fpath = report.save_txt(
        join(output_dir, gene_report_basename + '.txt'))
    tsv_rep_fpath = report.save_tsv(
        join(output_dir, gene_report_basename + '.tsv'))
    info('')
    info('Best values for the regions (total ' + str(total_regions) +
         ') saved into:')
    info('  ' + txt_rep_fpath)

    return txt_rep_fpath
Example #35
def run_multisample_qualimap(output_dir, work_dir, samples,
                             targqc_full_report):
    """ 1. Generates Qualimap2 plots and put into plots_dirpath
        2. Adds records to targqc_full_report.plots
    """
    plots_dirpath = join(output_dir, 'plots')
    individual_report_fpaths = [s.qualimap_html_fpath for s in samples]
    if isdir(plots_dirpath) and all(
            can_reuse(join(plots_dirpath, f), individual_report_fpaths)
            for f in listdir(plots_dirpath) if not f.startswith('.')):
        debug('Qualimap multisample plots exist - ' + plots_dirpath +
              ', reusing...')
    else:
        # Qualimap2 run for multi-sample plots
        if len([s.qualimap_html_fpath
                for s in samples if s.qualimap_html_fpath]) > 0:
            if find_executable() is not None:  # and get_qualimap_type(find_executable()) == 'full':
                qualimap_output_dir = join(work_dir, 'qualimap_multi_bamqc')

                _correct_qualimap_genome_results(samples)
                _correct_qualimap_insert_size_histogram(samples)

                safe_mkdir(qualimap_output_dir)
                rows = []
                for sample in samples:
                    if sample.qualimap_html_fpath:
                        rows += [[sample.name, sample.qualimap_html_fpath]]

                data_fpath = write_tsv_rows(
                    ([], rows),
                    join(qualimap_output_dir,
                         'qualimap_results_by_sample.tsv'))
                qualimap_plots_dirpath = join(qualimap_output_dir,
                                              'images_multisampleBamQcReport')
                cmdline = find_executable() + \
                    ' multi-bamqc --data {data_fpath} -outdir {qualimap_output_dir}'.format(**locals())
                run(cmdline,
                    env_vars=dict(DISPLAY=None),
                    checks=[lambda _1, _2: verify_dir(qualimap_output_dir)],
                    reuse=cfg.reuse_intermediate)

                if not verify_dir(qualimap_plots_dirpath):
                    warn(
                        'Warning: Qualimap for multi-sample analysis failed to finish. TargQC will not contain plots.'
                    )
                    return None
                else:
                    if exists(plots_dirpath):
                        shutil.rmtree(plots_dirpath)
                    shutil.move(qualimap_plots_dirpath, plots_dirpath)
            else:
                warn(
                    'Warning: Qualimap for multi-sample analysis was not found. TargQC will not contain plots.'
                )
                return None

    targqc_full_report.plots = []
    if isdir(plots_dirpath):  # the plots dir is absent if Qualimap was never run for any sample
        for plot_fpath in listdir(plots_dirpath):
            plot_fpath = join(plots_dirpath, plot_fpath)
            if verify_file(plot_fpath) and plot_fpath.endswith('.png'):
                targqc_full_report.plots.append(relpath(plot_fpath, output_dir))
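
find_executable() above is assumed to locate the qualimap binary on PATH; a minimal sketch under that assumption (hypothetical stand-in, Python 3):

import shutil

def find_executable():
    # full path to the qualimap binary, or None if it is not on PATH
    return shutil.which('qualimap')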
Example #38
def main():
    description = '''
Usage:
    ''' + __file__ + ''' hg19 [db.gtf]
'''

    options = [
        (['--debug'], dict(dest='debug', action='store_true', default=False)),
    ]
    parser = OptionParser(description=description)
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()
    if len(args) == 0:
        parser.exit(1, 'Please provide genome name as the first argument')
    logger.is_debug = opts.debug

    genome_name = args[0]

    if len(args) > 1:
        gtf_fpath = args[1]
    else:
        gtf_fpath = ebl.ensembl_gtf_fpath(genome_name)
    if not isfile(gtf_fpath):
        if not gtf_fpath.endswith('.gz'):
            gtf_fpath += '.gz'
    gtf_fpath = verify_file(gtf_fpath)
    debug('Reading the GTF database')
    db = gtf.get_gtf_db(gtf_fpath)

    debug('Reading biomart data')
    features_by_ens_id = read_biomart(genome_name)

    chroms = [c for c, l in ref.get_chrom_lengths(genome_name)]
    
    output_fpath = join(dirname(__file__), genome_name, 'ensembl.bed')
    unsorted_output_fpath = add_suffix(output_fpath, 'unsorted')
    debug('Processing features, writing to ' + unsorted_output_fpath)

    def _get(_rec, _key):
        val = _rec.attributes.get(_key)
        if val is None:
            return None
        assert len(val) == 1, (_key, str(val))
        return val[0]

    num_tx_not_in_biomart = 0
    num_tx_diff_gene_in_biomart = 0
    with open(unsorted_output_fpath, 'w') as out:
        # keep the header width in sync with the rows built below (cols[:-3])
        out.write('\t'.join(ebl.BedCols.names[i] for i in ebl.BedCols.cols[:-3]) + '\n')

        for rec in db.all_features(order_by=('seqid', 'start', 'end')):
            if rec.featuretype == 'gene': continue
            if rec.chrom not in chroms: continue
            if rec.end - rec.start < 0: continue
            
            tx_id = _get(rec, 'transcript_id')
            gname = _get(rec, 'gene_name')
            tx_biotype = _get(rec, 'transcript_biotype')
            if not tx_biotype: tx_biotype = _get(rec, 'gene_biotype')
            tsl = _get(rec, 'transcript_support_level')
            hugo_gene = None

            biomart_rec = features_by_ens_id.get(tx_id)
            if not biomart_rec:
                if rec.featuretype == 'transcript':
                    num_tx_not_in_biomart += 1
            else:
                bm_gname = biomart_rec['Associated Gene Name']
                bm_tx_biotype = biomart_rec['Transcript type']
                bm_tsl = biomart_rec.get('Transcript Support Level (TSL)')
                hugo_gene = biomart_rec['HGNC symbol']
                if bm_gname != gname:
                    if rec.featuretype == 'transcript':
                        num_tx_diff_gene_in_biomart += 1
                    continue
                tx_biotype = bm_tx_biotype
                tsl = bm_tsl.split()[0].replace('tsl', '') if bm_tsl else None

            fs = [None] * len(ebl.BedCols.cols[:-3])
            if not rec.chrom.startswith('chr'):
                rec.chrom = 'chr' + rec.chrom.replace('MT', 'M')
            fs[:6] = [rec.chrom,
                      str(rec.start - 1),
                      str(rec.end),
                      gname,
                      rec.attributes.get('exon_number', ['.'])[0],
                      rec.strand]
            fs[ebl.BedCols.FEATURE] = rec.featuretype or '.'
            fs[ebl.BedCols.BIOTYPE] = tx_biotype or '.'
            fs[ebl.BedCols.ENSEMBL_ID] = tx_id or '.'
            # fs[ebl.BedCols.REFSEQ_ID] = refseq_id or '.'
            # fs[ebl.BedCols.IS_CANONICAL] = 'canonical' if refseq_id in canonical_transcripts_ids else ''
            fs[ebl.BedCols.TSL] = tsl or '.'
            fs[ebl.BedCols.HUGO] = hugo_gene or '.'
            # fs[ebl.BedCols.names[ensembl.BedCols.GC]] = gc
            # unset columns (e.g. the commented-out ones) and missing attributes default to '.'
            out.write('\t'.join(f if f is not None else '.' for f in fs) + '\n')

    if num_tx_not_in_biomart:
        warn(str(num_tx_not_in_biomart) + ' transcripts not found in biomart')
    if num_tx_diff_gene_in_biomart:
        warn(str(num_tx_diff_gene_in_biomart) + ' transcripts have a different gene name in biomart')

    debug('Sorting results')
    sort_bed(unsorted_output_fpath, output_fpath, fai_fpath=ref.get_fai(genome_name), genome=genome_name)
    os.remove(unsorted_output_fpath)
    bgzip_and_tabix(output_fpath)
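
sort_bed() above is assumed to order regions by the chromosome order from the .fai index, then by start and end. A minimal self-contained sketch under that assumption (hypothetical stand-in; a real implementation also handles headers and karyotype ordering more carefully):

def sort_bed_simple(in_fpath, out_fpath, fai_fpath):
    # chromosome -> rank, taken from the first column of the .fai index
    with open(fai_fpath) as f:
        chrom_order = {l.split('\t')[0]: i for i, l in enumerate(f)}
    with open(in_fpath) as f:
        lines = f.readlines()
    header = [l for l in lines if l.startswith('#')]
    body = [l for l in lines if not l.startswith('#')]
    # unknown chromosomes sort to the end
    body.sort(key=lambda l: (chrom_order.get(l.split('\t')[0], len(chrom_order)),
                             int(l.split('\t')[1]), int(l.split('\t')[2])))
    with open(out_fpath, 'w') as out:
        out.writelines(header + body)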
Example #39
    def _make_target_bed(self,
                         bed_fpath,
                         work_dir,
                         output_dir,
                         is_debug,
                         padding=None,
                         fai_fpath=None,
                         genome=None,
                         reannotate=False):
        clean_target_bed_fpath = intermediate_fname(work_dir, bed_fpath,
                                                    'clean')
        if not can_reuse(clean_target_bed_fpath, bed_fpath):
            debug()
            debug('Cleaning target BED file...')
            bed = BedTool(bed_fpath)
            if bed.field_count() > 4:
                bed = bed.cut(range(4))
            bed = bed\
                .filter(lambda x: x.chrom and not any(x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))\
                .remove_invalid()
            with file_transaction(work_dir, clean_target_bed_fpath) as tx:
                bed.saveas(tx)
            debug('Saved to ' + clean_target_bed_fpath)
            verify_file(clean_target_bed_fpath, is_critical=True)

        sort_target_bed_fpath = intermediate_fname(work_dir,
                                                   clean_target_bed_fpath,
                                                   'sorted')
        if not can_reuse(sort_target_bed_fpath, clean_target_bed_fpath):
            debug()
            debug('Sorting target BED file...')
            sort_target_bed_fpath = sort_bed(
                clean_target_bed_fpath,
                output_bed_fpath=sort_target_bed_fpath,
                fai_fpath=fai_fpath)
            debug('Saved to ' + sort_target_bed_fpath)
            verify_file(sort_target_bed_fpath, is_critical=True)

        if genome in ebl.SUPPORTED_GENOMES:
            ann_target_bed_fpath = intermediate_fname(work_dir,
                                                      sort_target_bed_fpath,
                                                      'ann_plus_features')
            if not can_reuse(ann_target_bed_fpath, sort_target_bed_fpath):
                debug()
                if BedTool(sort_target_bed_fpath).field_count() == 3 or reannotate:
                    debug('Annotating target BED file and collecting overlapping genome features')
                    overlap_with_features(sort_target_bed_fpath,
                                          ann_target_bed_fpath,
                                          work_dir=work_dir,
                                          genome=genome,
                                          extended=True,
                                          reannotate=reannotate,
                                          only_canonical=True)
                else:
                    debug('Overlapping with genomic features:')
                    overlap_with_features(sort_target_bed_fpath,
                                          ann_target_bed_fpath,
                                          work_dir=work_dir,
                                          genome=genome,
                                          extended=True,
                                          only_canonical=True)
                debug('Saved to ' + ann_target_bed_fpath)
                verify_file(ann_target_bed_fpath, is_critical=True)
        else:
            ann_target_bed_fpath = sort_target_bed_fpath

        final_clean_target_bed_fpath = intermediate_fname(
            work_dir, ann_target_bed_fpath, 'clean')
        if not can_reuse(final_clean_target_bed_fpath, ann_target_bed_fpath):
            bed = BedTool(ann_target_bed_fpath).remove_invalid()
            with file_transaction(work_dir, final_clean_target_bed_fpath) as tx:
                bed.saveas(tx)
            verify_file(final_clean_target_bed_fpath, is_critical=True)

        self.bed_fpath = final_clean_target_bed_fpath
        self.bed = BedTool(self.bed_fpath)

        self.capture_bed_fpath = add_suffix(
            join(output_dir, basename(bed_fpath)), 'clean_sorted_ann')
        if not can_reuse(self.capture_bed_fpath, self.bed_fpath):
            with file_transaction(work_dir, self.capture_bed_fpath) as tx:
                self.get_capture_bed().saveas(tx)

        gene_key_set, gene_key_list = get_genes_from_bed(bed_fpath)
        self.gene_keys_set = gene_key_set
        self.gene_keys_list = gene_key_list
        self.regions_num = self.get_capture_bed().count()

        self._make_qualimap_bed(work_dir)
        if padding:
            self._make_padded_bed(work_dir, fai_fpath, padding)
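
can_reuse() drives every reuse check in this class; it is assumed to mean "the output exists, is non-empty, and is at least as new as every input". A minimal sketch under that assumption (hypothetical re-implementation):

import os

def can_reuse(fpath, cmp_fpaths):
    if isinstance(cmp_fpaths, str):
        cmp_fpaths = [cmp_fpaths]
    # not reusable if the output is missing or empty
    if not fpath or not os.path.isfile(fpath) or os.path.getsize(fpath) == 0:
        return False
    # reusable only if it is not older than any of its inputs
    return all(os.path.getmtime(fpath) >= os.path.getmtime(c) for c in cmp_fpaths)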
Example #40
def parse_qualimap_results(sample):
    if not verify_file(sample.qualimap_html_fpath):
        critical('QualiMap report was not found')

    qualimap_value_by_metric = report_parser.parse_qualimap_sample_report(sample.qualimap_html_fpath)
    bases_by_depth, median_depth = parse_qualimap_coverage_hist(sample.qualimap_cov_hist_fpath)
    median_gc, median_human_gc = parse_qualimap_gc_content(sample.qualimap_gc_hist_fpath)
    median_ins_size = parse_qualimap_insert_size(sample.qualimap_ins_size_hist_fpath)

    def find_rec(name, percent=False, on_target=True):
        if on_target:
            name_on_target = name + ' (on target)'
            if percent:
                name_on_target += ' %'
            res = qualimap_value_by_metric.get(name_on_target)
            if res:
                return res
        if percent:
            name += ' %'
        return qualimap_value_by_metric.get(name)

    depth_stats = dict(
        ave_depth       = find_rec('Coverage Mean'),
        stddev_depth    = find_rec('Coverage Standard Deviation'),
        median_depth    = median_depth,
        bases_by_depth  = bases_by_depth
    )
    target_stats = dict(
        reference_size  = find_rec('Reference size'),
        target_size     = find_rec('Regions size/percentage of reference'),
        target_fraction = find_rec('Regions size/percentage of reference', percent=True),
    )
    reads_stats = dict(
        total                    = find_rec('Number of reads'),
        mapped                   = find_rec('Mapped reads', on_target=False),
        mapped_rate              = find_rec('Mapped reads', percent=True, on_target=False),
        unmapped                 = find_rec('Unmapped reads'),
        unmapped_rate            = find_rec('Unmapped reads', percent=True),
        mapped_on_target         = find_rec('Mapped reads (on target)'),
        mapped_rate_on_target    = find_rec('Mapped reads (on target)', percent=True),
        mapped_paired            = find_rec('Mapped paired reads', on_target=False),
        mapped_paired_rate       = find_rec('Mapped paired reads', percent=True, on_target=False),
        paired                   = find_rec('Paired reads'),
        paired_rate              = find_rec('Paired reads', percent=True),
        dup                      = find_rec('Duplicated reads (flagged)'),
        dup_rate                 = find_rec('Duplicated reads (flagged)', percent=True),
        min_len                  = find_rec('Read min length'),
        max_len                  = find_rec('Read max length'),
        ave_len                  = find_rec('Read mean length'),
        median_gc                = median_gc,
        median_human_gc          = median_human_gc,
        median_ins_size          = median_ins_size,
    )
    indels_stats = dict(
        mean_mq     = find_rec('Mean Mapping Quality'),
        mismatches  = find_rec('Mismatches'),
        insertions  = find_rec('Insertions'),
        deletions   = find_rec('Deletions'),
        homo_indels = find_rec('Homopolymer indels'),
    )
    return depth_stats, reads_stats, indels_stats, target_stats
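
A hypothetical usage sketch, given a sample object carrying the qualimap_*_fpath attributes used above; the keys follow the dicts built by this function:

depth_stats, reads_stats, indels_stats, target_stats = parse_qualimap_results(sample)
print('Mean depth:', depth_stats['ave_depth'])
print('Median insert size:', reads_stats['median_ins_size'])
print('Fraction of reference covered by target:', target_stats['target_fraction'])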