Example 1
def open_gzipsafe(f, mode='r'):
    # Open .gz/.gzip files transparently in text mode, falling back to plain
    # open() if the file turns out not to be gzip-compressed.
    if f.endswith('.gz') or f.endswith('.gzip') or f.endswith(
            '.gz.tx') or f.endswith('.gzip.tx'):
        try:
            h = gzip.open(f, mode=mode + 't')
        except IOError as e:
            err('Error opening gzip ' + f + ': ' + str(e) +
                ', opening as plain text')
            return open(f, mode=mode)
        else:
            if 'w' in mode:
                return h
            else:
                try:
                    h.read(1)  # test read to verify the stream really is gzip
                except IOError as e:
                    err('Error opening gzip ' + f + ': ' + str(e) +
                        ', opening as plain text')
                    h.close()
                    return open(f, mode=mode)
                else:
                    h.close()
                    h = gzip.open(f, mode=mode + 't')
                    return h
    else:
        return open(f, mode=mode)
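
A minimal usage sketch (the file name and handler are hypothetical; `gzip` is assumed to be imported at module level and `err` to be the project's logging helper):

# Works the same whether the file is really gzipped or plain text:
with open_gzipsafe('variants.vcf.gz') as fh:
    for line in fh:
        print(line.rstrip('\n'))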
Example 2
def bgzip_and_tabix(fpath, reuse=False, tabix_parameters='', **kwargs):
    gzipped_fpath = join(fpath + '.gz')
    tbi_fpath = gzipped_fpath + '.tbi'

    if reuse and \
           file_exists(gzipped_fpath) and (getctime(gzipped_fpath) >= getctime(fpath) if file_exists(fpath) else True) and \
           file_exists(tbi_fpath) and getctime(tbi_fpath) >= getctime(gzipped_fpath):
        info('Actual compressed file and index exist, reusing')
        return gzipped_fpath

    info('Compressing and tabixing file, writing ' + gzipped_fpath + '(.tbi)')
    bgzip = which('bgzip')
    tabix = which('tabix')
    if not bgzip:
        err('Cannot index file because bgzip is not found')
    if not tabix:
        err('Cannot index file because tabix is not found')
    if not bgzip or not tabix:
        return fpath

    if isfile(gzipped_fpath):
        os.remove(gzipped_fpath)
    if isfile(tbi_fpath):
        os.remove(tbi_fpath)

    info('BGzipping ' + fpath)
    cmdline = '{bgzip} {fpath}'.format(**locals())
    call_process.run(cmdline)

    info('Tabixing ' + gzipped_fpath)
    cmdline = '{tabix} {tabix_parameters} {gzipped_fpath}'.format(**locals())
    call_process.run(cmdline)

    return gzipped_fpath
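
A usage sketch, assuming a coordinate-sorted BED file (path hypothetical) and bgzip/tabix on PATH; '-p bed' is a standard tabix preset:

# The first call compresses and indexes; a later call with reuse=True is a
# no-op as long as regions.bed.gz and regions.bed.gz.tbi are still up to date:
gz_fpath = bgzip_and_tabix('regions.bed', tabix_parameters='-p bed')
gz_fpath = bgzip_and_tabix('regions.bed', reuse=True, tabix_parameters='-p bed')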
Example 3
def file_exists_check(output_fpath=None, input_fpath=None):
    if output_fpath is None:
        return True
    ok = os.path.exists(output_fpath)
    if not ok:
        err("Did not find output file {0}".format(output_fpath))
    return ok
Example 4
def file_nonempty_check(output_fpath=None, input_fpath=None):
    if output_fpath is None:
        return True
    ok = file_exists_check(output_fpath) and os.path.getsize(output_fpath) > 0
    if not ok:
        err("Did not find non-empty output file {0}".format(output_fpath))
    return ok
Example 5
def count_bed_cols(bed_fpath):
    with open(bed_fpath) as f:
        for l in f:
            if l and l.strip() and not l.startswith('#'):
                return len(l.split('\t'))
    err('Empty bed file: ' + bed_fpath)
    return None
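
For example, for a hypothetical 4-column BED file the function returns the column count of the first data line:

# regions.bed contains lines like: chr1<TAB>100<TAB>200<TAB>GeneA
n_cols = count_bed_cols('regions.bed')  # -> 4, or None for an empty file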
Example 6
def load_yaml_config(fpath):
    verify_file(fpath, is_critical=True)
    try:
        with open(fpath) as f:  # close the handle instead of leaking it
            dic = load_yaml(f)
    except Exception:
        err(format_exc())
        critical('Could not parse bcbio YAML ' + fpath)
    else:
        return dic
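
A usage sketch (path hypothetical). Because `verify_file(..., is_critical=True)` and `critical()` both abort the run, callers can rely on getting a parsed dict back:

cfg = load_yaml_config('bcbio_system.yaml')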
Example 7
def bam_to_bed_nocnf(bam_fpath, bedtools='bedtools', gzip='gzip'):
    info(
        'Converting the BAM to BED to save some memory.'
    )  # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + '.bed.gz'
    cmdline = '{bedtools} bamtobed -i {bam_fpath} | {gzip} > {bam_bed_fpath}'.format(
        **locals())
    info(cmdline)
    os.system(cmdline)  # a failed run surfaces below via verify_file
    bam_bed_fpath = verify_file(bam_bed_fpath)
    if bam_bed_fpath:
        info('Done, saved to ' + bam_bed_fpath)
    else:
        err('Error, result is non-existent or empty')
    return bam_bed_fpath
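
A usage sketch (path hypothetical; `bedtools` and `gzip` must be on PATH unless explicit executables are passed):

bed_gz = bam_to_bed_nocnf('sample1.bam')  # writes sample1.bed.gz next to the BAM
if bed_gz:
    info('BED ready: ' + bed_gz)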
Example 8
def find_fastq_pairs(fpaths):
    info('Finding FastQ pairs...')
    fastqs_by_sample_name = dict()
    for fpath in fpaths:
        fn, ext = splitext_plus(basename(fpath))
        if ext in ['.fq', '.fq.gz', '.fastq', '.fastq.gz']:
            sname, l_fpath, r_fpath = None, None, None
            if fn.endswith('_1'):
                sname = fn[:-2]
                l_fpath = fpath
            if fn.endswith('_R1'):
                sname = fn[:-3]
                l_fpath = fpath
            if fn.endswith('_2'):
                sname = fn[:-2]
                r_fpath = fpath
            if fn.endswith('_R2'):
                sname = fn[:-3]
                r_fpath = fpath

            if sname:
                m = re.match(r'(.*)_S\d+', sname)
                if m:
                    sname = m.group(1)
                sname = sname.replace('-', '_')
            else:
                sname = fn
                info('Cannot detect read pair suffix for ' + sname)

            l, r = fastqs_by_sample_name.get(sname, (None, None))
            if l and l_fpath:
                critical('Duplicated left FastQ files for ' + sname + ': ' +
                         l + ' and ' + l_fpath)
            if r and r_fpath:
                critical('Duplicated right FastQ files for ' + sname + ': ' +
                         r + ' and ' + r_fpath)
            fastqs_by_sample_name[sname] = l or l_fpath, r or r_fpath

    fixed_fastqs_by_sample_name = dict()
    for sname, (l, r) in fastqs_by_sample_name.items():
        if not l:
            err('ERROR: for sample ' + sname + ', left reads not found')
        if not r:
            err('ERROR: for sample ' + sname + ', right reads not found')
        if l and r:
            fixed_fastqs_by_sample_name[sname] = l, r

    return fixed_fastqs_by_sample_name
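
For example, two hypothetical Illumina-style files pair up under one sample name; the _S<n> sample index and the _R1/_R2 suffixes are stripped from the key, and dashes become underscores:

pairs = find_fastq_pairs(['Sample-A_S1_R1.fastq.gz', 'Sample-A_S1_R2.fastq.gz'])
# -> {'Sample_A': ('Sample-A_S1_R1.fastq.gz', 'Sample-A_S1_R2.fastq.gz')}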
Example 9
def run(self, fn, param_lists):
    if self.n_samples == 0:
        return []
    assert self.n_samples == len(param_lists)
    n_params = len(param_lists[0])
    for sample_i, params in enumerate(param_lists):
        if params is None:
            err('Parameter list for sample ' + str(sample_i) + ' is None')
        elif len(params) != n_params:
            err('Parameter list for sample ' + str(sample_i) + ' (' +
                str(len(params)) +
                ') does not match the one for the first sample (' +
                str(n_params) + ')')
    res = self._view.view.map(
        fn,
        *([params[param_i] for params in param_lists]
          for param_i in range(n_params)))
    return res
Example 10
def file_reasonable_size(output_fpath, input_fpath):
    ok = file_exists_check(output_fpath)
    if not ok:
        return ok
    # named pipes -- we can't calculate size
    if input_fpath.strip().startswith("<("):
        return True
    if input_fpath.endswith((".bam", ".gz")):
        scale = 7.0
    else:
        scale = 10.0
    orig_size = os.path.getsize(input_fpath) / pow(1024.0, 3)
    out_size = os.path.getsize(output_fpath) / pow(1024.0, 3)
    if out_size < (orig_size / scale):
        err("Output file unexpectedly small. %.1fGb for output versus "
            "%.1fGb for the input file. This often indicates a truncated "
            "BAM file or memory errors during the run." % (out_size, orig_size))
        return False
    else:
        return True
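
A worked example of the heuristic (paths hypothetical): compressed inputs use scale 7.0, so for a 7.0 Gb input BAM any output below 1.0 Gb is flagged; uncompressed inputs use the more permissive divisor 10.0.

# A deduplicated BAM may legitimately shrink, but not below input_size / 7.0:
ok = file_reasonable_size('dedup.bam', 'raw.bam')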
Example 11
def proc_fastq(samples,
               parall_view,
               work_dir,
               bwa_prefix,
               downsample_to,
               num_pairs_by_sample=None,
               dedup=True):
    num_pairs_by_sample = num_pairs_by_sample or dict()
    if downsample_to:
        # Read pairs counts
        debug()
        if all(s.name in num_pairs_by_sample for s in samples):
            debug('Using read pairs counts extracted from FastQC reports')
        elif all(
                can_reuse(make_pair_counts_fpath(join(work_dir, s.name)),
                          s.l_fpath) for s in samples):
            debug('Reusing pairs counts, reading from files')
            num_pairs_by_sample = {
                s.name: int(
                    open(make_pair_counts_fpath(join(work_dir,
                                                     s.name))).read().strip())
                for s in samples
            }
        else:
            info('Counting read pairs')
            num_pairs = parall_view.run(
                count_read_pairs,
                [[s.name,
                  safe_mkdir(join(work_dir, s.name)), s.l_fpath]
                 for s in samples])
            num_pairs_by_sample = {
                s.name: pairs_count
                for s, pairs_count in zip(samples, num_pairs)
            }

        # Downsampling
        debug()
        if all(
                can_reuse(
                    make_downsampled_fpath(join(work_dir, s.name), s.l_fpath),
                    s.l_fpath) and can_reuse(
                        make_downsampled_fpath(join(work_dir, s.name),
                                               s.r_fpath), s.r_fpath)
                for s in samples):
            debug('Reusing downsampled FastQ')
            for s in samples:
                s.l_fpath = make_downsampled_fpath(join(work_dir, s.name),
                                                   s.l_fpath)
                s.r_fpath = make_downsampled_fpath(join(work_dir, s.name),
                                                   s.r_fpath)
        else:
            if isinstance(downsample_to, float):
                info('Downsampling FastQ to ' + str(float(downsample_to)) +
                     ' fraction of reads')
            else:
                info('Downsampling FastQ to ' + str(int(downsample_to)) +
                     ' read pairs')
            fastq_pairs = parall_view.run(downsample, [[
                join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath,
                downsample_to,
                num_pairs_by_sample.get(s.name)
            ] for s in samples])
            for s, (l_r, r_r) in zip(samples, fastq_pairs):
                s.l_fpath = l_r
                s.r_fpath = r_r
    else:
        info('Skipping downsampling')

    debug()
    if all(
            can_reuse(make_bam_fpath(join(work_dir, s.name)),
                      [s.l_fpath, s.r_fpath]) for s in samples):
        debug('All downsampled BAM exists, reusing')
        for s in samples:
            s.bam = make_bam_fpath(join(work_dir, s.name))
    else:
        bwa = which('bwa')
        if bwa and not isfile(bwa):  # which() may return None; checked below
            critical('BWA not found under ' + bwa)
        smb = sambamba.get_executable()
        if not (bwa and smb):
            if not bwa:
                err('Error: bwa is required for the alignment pipeline')
            if not smb:
                err('Error: sambamba is required for the alignment pipeline')
            critical('Tools required for alignment not found')
        info('Aligning reads to the reference')
        bam_fpaths = parall_view.run(align, [[
            join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, bwa, smb,
            bwa_prefix, dedup, parall_view.cores_per_job
        ] for s in samples])

        bam_fpaths = [verify_bam(b) for b in bam_fpaths]
        if len(bam_fpaths) < len(samples):
            critical('Some samples were not aligned successfully.')
        for bam, s in zip(bam_fpaths, samples):
            s.bam = bam

    return num_pairs_by_sample
Example 12
def _save_best_details_for_each_gene(depth_threshs, samples, output_dir):
    metric_storage = get_detailed_metric_storage(depth_threshs)

    report = PerRegionSampleReport(sample='Best',
                                   metric_storage=metric_storage)
    report.add_record(
        'Sample', 'contains best values from all samples: ' +
        ', '.join([s.name for s in samples]))

    total_regions = 0
    fpaths = [
        s.targqc_region_tsv for s in samples
        if verify_file(s.targqc_region_tsv)
    ]
    if not fpaths:
        err('No targetcov detailed per-gene report was generated; skipping.')
        return None

    open_tsv_files = [open(fpath) for fpath in fpaths]

    first_col = 0
    # Skip '##' comment lines and read the header; a '#Sample' header means an
    # extra leading sample-name column, shifting all fields right by one.
    while True:
        lines_for_each_sample = [next(f, None) for f in open_tsv_files]
        if not all(lines_for_each_sample):
            break
        l = lines_for_each_sample[0]
        if l.startswith('##'):
            continue
        elif l.startswith('#'):
            if l.startswith('#Sample'):
                first_col = 1
            break

    while True:
        lines_for_each_sample = [next(f, None) for f in open_tsv_files]
        if not all(lines_for_each_sample):
            break

        if all([
                not l.startswith('#')
                and ('Whole-Gene' in l or 'Gene-Exon' in l)
                for l in lines_for_each_sample
        ]):
            shared_fields = lines_for_each_sample[0].split(
                '\t')[first_col:first_col + 9]
            reg = report.add_row()
            reg.add_record('Chr', get_val(shared_fields[0]))
            reg.add_record('Start', get_int_val(shared_fields[1]))
            reg.add_record('End', get_int_val(shared_fields[2]))
            reg.add_record('Size', get_int_val(shared_fields[3]))
            reg.add_record('Gene', get_val(shared_fields[4]))
            reg.add_record('Strand', get_val(shared_fields[5]))
            reg.add_record('Feature', get_val(shared_fields[6]))
            reg.add_record('Biotype', get_val(shared_fields[7]))
            reg.add_record('Transcript', get_val(shared_fields[8]))

            min_depths, ave_depths, stddevs, withins = ([], [], [], [])
            percents_by_threshs = {t: [] for t in depth_threshs}

            for l in lines_for_each_sample:
                fs = l.split('\t')

                min_depths.append(get_int_val(fs[first_col + 9]))
                ave_depths.append(get_float_val(fs[first_col + 10]))
                stddevs.append(get_float_val(fs[first_col + 11]))
                withins.append(get_float_val(fs[first_col + 12]))
                for t, f in zip(depth_threshs, fs[first_col + 13:]):
                    percents_by_threshs[t].append(get_float_val(f))

            # counting bests
            reg.add_record('Min depth', select_best(min_depths))
            reg.add_record('Ave depth', select_best(ave_depths))
            reg.add_record('Std dev', select_best(stddevs, max))
            reg.add_record('W/n 20% of median depth', select_best(withins))
            for t in depth_threshs:
                reg.add_record('{}x'.format(t),
                               select_best(percents_by_threshs[t]))

            total_regions += 1

    for f in open_tsv_files:
        f.close()

    gene_report_basename = add_suffix(samples[0].targqc_region_tsv, 'best')
    txt_rep_fpath = report.save_txt(
        join(output_dir, gene_report_basename + '.txt'))
    tsv_rep_fpath = report.save_tsv(
        join(output_dir, gene_report_basename + '.tsv'))
    info('')
    info('Best values for the regions (total ' + str(total_regions) +
         ') saved into:')
    info('  ' + txt_rep_fpath)
    info('  ' + tsv_rep_fpath)

    return txt_rep_fpath