Example #1
def _split_reference_by_priority(cnf, features_bed_fpath):
    features = ['CDS', 'Exon', 'Transcript', 'Gene']
    info('Splitting the reference file into ' + ', '.join(features))
    features_and_beds = []
    for f in features:
        features_and_beds.append((f, BedTool(features_bed_fpath).filter(lambda x: x[6] == f)))
    return features_and_beds
Example #2
def make_region_reports(view, work_dir, samples, target, genome, depth_thresholds):
    bed_fpath = target.bed_fpath or target.wgs_bed_fpath

    if all(can_reuse(s.targqc_region_tsv, [s.bam, bed_fpath]) for s in samples):
        debug('All region reports exist, reusing')
        return [s.targqc_region_tsv for s in samples]

    info('Calculating coverage statistics for CDS and exon regions from RefSeq...')

    depth_thresholds_by_sample = dict()
    for s in samples:
        depth_thresholds_by_sample[s.name] = depth_thresholds

    debug()
    debug('Running sambamba...')
    sambamba_depth_output_fpaths = view.run(sambamba_depth,
        [[s.work_dir, bed_fpath, s.bam, depth_thresholds_by_sample[s.name], None, s.name]
         for s in samples])
    assert len(sambamba_depth_output_fpaths) == len(samples), \
        'Number of sambamba results (' + str(len(sambamba_depth_output_fpaths)) + \
        ') does not match the number of samples (' + str(len(samples)) + ')'

    debug()
    debug('Parsing sambamba results and writing results...')
    view.run(_proc_sambamba_depth,
        [[sambamba_output_fpath, s.targqc_region_tsv, s.name, depth_thresholds_by_sample[s.name]]
         for sambamba_output_fpath, s in zip(sambamba_depth_output_fpaths, samples)])

    info('Done.')
    return [s.targqc_region_tsv for s in samples]
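A note on the `view.run(fn, [args, ...])` interface used above: it applies `fn` to one argument list per sample and returns the results in submission order. A minimal serial stand-in, assuming nothing beyond the interface visible in these examples (`run` plus `cores_per_job`), could look like:

class SerialView:
    """Hypothetical drop-in for debugging: runs jobs sequentially."""
    cores_per_job = 1

    def run(self, fn, args_lists):
        # Mirror view.run(fn, [[...], ...]): one call per argument list,
        # results returned in order.
        return [fn(*args) for args in args_lists]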
Example #3
def _do_run(cmd, checks, env=None, output_fpath=None, input_fpath=None):
    """Perform running and check results, raising errors for issues.
    """
    cmd, shell_arg, executable_arg = _normalize_cmd_args(cmd)
    s = subprocess.Popen(cmd, shell=shell_arg, executable=executable_arg,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT, close_fds=True, env=env)
    debug_stdout = collections.deque(maxlen=100)
    while True:
        line = s.stdout.readline()
        if line:
            if six.PY3: line = line.decode(errors='replace')
            debug_stdout.append(line)
            info('  ' + line.rstrip())
        exitcode = s.poll()
        if exitcode is not None:
            for line in s.stdout:
                if six.PY3: line = line.decode(errors='replace')
                debug_stdout.append(line)
            if exitcode != 0:
                error_msg = " ".join(cmd) if not isinstance(cmd, six.string_types) else cmd
                error_msg += "\n"
                error_msg += "".join(debug_stdout)
                s.communicate()
                s.stdout.close()
                raise subprocess.CalledProcessError(exitcode, cmd=cmd, output=error_msg)
            else:
                break
    s.communicate()
    s.stdout.close()
    # Check for problems not identified by shell return codes
    if checks:
        for check in checks:
            if not check(output_fpath, input_fpath):
                raise IOError("External command failed")
Example #4
def align(work_dir,
          sample_name,
          l_fpath,
          r_fpath,
          bwa,
          smb,
          bwa_prefix,
          dedup=True,
          threads=1):
    info('Running bwa to align reads...')
    bam_fpath = make_bam_fpath(work_dir)
    if can_reuse(bam_fpath, [l_fpath, r_fpath]):
        return bam_fpath

    tmp_dirpath = join(work_dir, 'sambamba_tmp_dir')
    safe_mkdir(tmp_dirpath)

    bwa_cmdline = (
        '{bwa} mem -t {threads} -v 2 {bwa_prefix} {l_fpath} {r_fpath} | ' +
        '{smb} view /dev/stdin -t {threads} -f bam -S -o - | ' +
        '{smb} sort /dev/stdin -t {threads} --tmpdir {tmp_dirpath} -o {bam_fpath}'
    ).format(**locals())
    run(bwa_cmdline, output_fpath=bam_fpath, stdout_to_outputfile=False)

    if dedup:
        dedup_bam_fpath = add_suffix(bam_fpath, 'dedup')
        dedup_cmdl = '{smb} markdup -t {threads} {bam_fpath} {dedup_bam_fpath}'.format(
            **locals())
        run(dedup_cmdl,
            output_fpath=dedup_bam_fpath,
            stdout_to_outputfile=False)
        verify_bam(dedup_bam_fpath)
        os.rename(dedup_bam_fpath, bam_fpath)

    sambamba.index_bam(bam_fpath)

    # samtools view -b -S -u - |
    # sambamba sort -N -t 8 -m 682M --tmpdir /Molly/saveliev/cancer-dream-syn3/work/align/syn3-normal/split/tx/tmpwdXndE/syn3-normal-sort-1_20000000-sorttmp-full
    # -o /Molly/saveliev/cancer-dream-syn3/work/align/syn3-normal/split/tx/tmpwdXndE/syn3-normal-sort-1_20000000.bam
    # /dev/stdin

    # if dedup:
    #     info()
    #     info('Calling SamBlaster to mark duplicates')
    #     markdup_sam_fpath = markdup_sam(sam_fpath, samblaster)
    #     if markdup_sam_fpath:
    #         sam_fpath = markdup_sam_fpath
    # info()

    # info('Converting to BAM')
    # cmdline = sambamba.get_executable() + ' view -t {threads} -S -f bam {sam_fpath}'.format(**locals())
    # run(cmdline, output_fpath=bam_fpath, reuse=cfg.reuse_intermediate)
    #
    # info()
    # info('Sorting BAM')
    # prefix = splitext(sorted_bam_fpath)[0]
    # cmdline = sambamba.get_executable() + ' sort -t {threads} {bam_fpath} -o {sorted_bam_fpath}'.format(**locals())
    # run(cmdline, output_fpath=sorted_bam_fpath, stdout_to_outputfile=False, reuse=cfg.reuse_intermediate)

    return bam_fpath
Example #5
def _split_reference_by_priority(cnf, features_bed_fpath):
    features = ['CDS', 'Exon', 'Transcript', 'Gene']
    info('Splitting the reference file into ' + ', '.join(features))
    features_and_beds = []
    for f in features:
        features_and_beds.append(
            (f, BedTool(features_bed_fpath).filter(lambda x: x[6] == f)))
    return features_and_beds
Example #6
def _make_targetcov_symlinks(samples):
    for sample in samples:
        new_link = join(dirname(dirname(sample.targetcov_detailed_txt)),
                        basename(sample.targetcov_detailed_txt))
        if exists(new_link):
            os.unlink(new_link)
        symlink_plus(sample.targetcov_detailed_txt, new_link)
        info('TargetCov TXT symlink saved to ' + new_link)
Example #7
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
             ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        debug('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + ' times higher than Y depth - it\'s male')
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
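The decision rule in `determine_sex`, distilled into a pure function for clarity. This is a sketch: the two threshold parameters stand in for the module-level constants `AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX` and `FEMALE_Y_COVERAGE_FACTOR`, whose actual values are not shown here.

def _sex_from_depths(avg_depth, chry_mean_coverage, min_avg_depth, female_factor):
    if avg_depth < min_avg_depth:
        return None  # sample too shallow to call sex
    if chry_mean_coverage == 0:
        return 'F'   # no coverage at all on chrY key regions
    if avg_depth / chry_mean_coverage > female_factor:
        return 'F'   # chrY covered far below the sample average
    return 'M'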
Example #8
def get_padded_bed_file(work_dir, bed, padding, fai_fpath):
    genome_fpath = fai_fpath
    info('Making bed file for padded regions...')
    bedtools = which('bedtools')
    cmdline = '{bedtools} slop -i {bed} -g {genome_fpath} -b {padding}'.format(
        **locals())
    output_fpath = intermediate_fname(work_dir, bed, 'padded')
    call_process.run(cmdline, output_fpath=output_fpath)
    return output_fpath
Example #9
def _make_targetcov_symlinks(samples):
    for sample in samples:
        new_link = join(
            dirname(dirname(sample.targetcov_detailed_txt)),
            basename(sample.targetcov_detailed_txt))
        if exists(new_link):
            os.unlink(new_link)
        symlink_plus(sample.targetcov_detailed_txt, new_link)
        info('TargetCov TXT symlink saved to ' + new_link)
Example #10
def run(cmd, output_fpath=None, input_fpath=None, checks=None, stdout_to_outputfile=True,
        stdout_tx=True, reuse=False, env_vars=None):
    """Run the provided command, logging details and checking for errors.
    """
    if output_fpath and reuse:
        if verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            return output_fpath
        if not output_fpath.endswith('.gz') and verify_file(output_fpath + '.gz', silent=True):
            info(output_fpath + '.gz exists, reusing')
            return output_fpath

    env = os.environ.copy()
    if env_vars:
        for k, v in env_vars.items():
            if v is None:
                if k in env:
                    del env[k]
            else:
                env[k] = v

    if checks is None:
        checks = [file_nonempty_check]

    def _try_run(_cmd, _output_fpath, _input_fpath):
        info(' '.join(str(x) for x in _cmd) if not isinstance(_cmd, six.string_types) else _cmd)
        _do_run(_cmd, checks, env, _output_fpath, _input_fpath)

    if output_fpath:
        if isfile(output_fpath):
            os.remove(output_fpath)
        if stdout_tx:
            with file_transaction(None, output_fpath) as tx_out_file:
                if stdout_to_outputfile:
                    cmd += ' > ' + tx_out_file
                else:
                    cmd += '\n'
                    cmd = cmd.replace(' ' + output_fpath + ' ', ' ' + tx_out_file + ' ') \
                             .replace(' "' + output_fpath + '" ', ' "' + tx_out_file + '" ') \
                             .replace(' \'' + output_fpath + '\' ', ' \'' + tx_out_file + '\' ') \
                             .replace(' ' + output_fpath + '\n', ' ' + tx_out_file) \
                             .replace(' "' + output_fpath + '"\n', ' "' + tx_out_file + '"') \
                             .replace(' \'' + output_fpath + '\'\n', ' \'' + tx_out_file + '\'') \
                             .replace('\n', '')
                _try_run(cmd, tx_out_file, input_fpath)
        else:
            _try_run(cmd, output_fpath, input_fpath)

    else:
        _try_run(cmd, None, input_fpath)
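Usage sketch for `run` (hypothetical command lines): with the default `stdout_to_outputfile=True`, stdout is redirected into a transactional temp file; with `stdout_to_outputfile=False`, occurrences of the output path inside the command itself are swapped for the temp path.

# stdout captured into the output file:
# run('sort -k1,1 -k2,2n regions.bed', output_fpath='sorted.bed')
#
# the tool writes the output file itself, so the path in the command is rewritten:
# run('sambamba sort in.bam -o sorted.bam', output_fpath='sorted.bam',
#     stdout_to_outputfile=False)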
Example #11
def count_read_pairs(s_name, work_dir, fastq_fpath):
    from targqc.utilz.logger import info

    pairs_counts_fpath = make_pair_counts_fpath(work_dir)
    if can_reuse(pairs_counts_fpath, fastq_fpath):
        with open(pairs_counts_fpath) as f:
            return int(f.read().strip())
    else:
        info('Counting read pairs in ' + s_name + ', writing to ' + pairs_counts_fpath)
        pairs_number = _count_records_in_fastq(fastq_fpath)
        with open(pairs_counts_fpath, 'w') as out:
            out.write(str(pairs_number))
        return pairs_number
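`_count_records_in_fastq` is not shown in this listing; a plausible implementation, assuming a FASTQ record is exactly four lines and that files may be gzipped:

import gzip

def _count_records_in_fastq(fastq_fpath):
    open_fn = gzip.open if fastq_fpath.endswith('.gz') else open
    with open_fn(fastq_fpath, 'rt') as f:
        return sum(1 for _ in f) // 4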
Example #12
def bgzip_and_tabix(fpath, reuse=False, tabix_parameters='', **kwargs):
    gzipped_fpath = fpath + '.gz'
    tbi_fpath = gzipped_fpath + '.tbi'

    if reuse and \
           file_exists(gzipped_fpath) and (getctime(gzipped_fpath) >= getctime(fpath) if file_exists(fpath) else True) and \
           file_exists(tbi_fpath) and getctime(tbi_fpath) >= getctime(gzipped_fpath):
        info('Actual compressed file and index exist, reusing')
        return gzipped_fpath

    info('Compressing and tabixing file, writing ' + gzipped_fpath + '(.tbi)')
    bgzip = which('bgzip')
    tabix = which('tabix')
    if not bgzip:
        err('Cannot index file because bgzip is not found')
    if not tabix:
        err('Cannot index file because tabix is not found')
    if not bgzip or not tabix:
        return fpath

    if isfile(gzipped_fpath):
        os.remove(gzipped_fpath)
    if isfile(tbi_fpath):
        os.remove(tbi_fpath)

    info('BGzipping ' + fpath)
    cmdline = '{bgzip} {fpath}'.format(**locals())
    call_process.run(cmdline)

    info('Tabixing ' + gzipped_fpath)
    cmdline = '{tabix} {tabix_parameters} {gzipped_fpath}'.format(**locals())
    call_process.run(cmdline)

    return gzipped_fpath
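Usage sketch (assumes `bgzip` and `tabix` are on PATH and the input is coordinate-sorted, as tabix requires):

# gz_fpath = bgzip_and_tabix('calls.vcf', tabix_parameters='-p vcf')
# -> 'calls.vcf.gz', with 'calls.vcf.gz.tbi' written next to it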
Example #13
def count_read_pairs(s_name, work_dir, fastq_fpath):
    from targqc.utilz.logger import info

    pairs_counts_fpath = make_pair_counts_fpath(work_dir)
    if can_reuse(pairs_counts_fpath, fastq_fpath):
        with open(pairs_counts_fpath) as f:
            return int(f.read().strip())
    else:
        info('Counting read pairs in ' + s_name + ', writing to ' +
             pairs_counts_fpath)
        pairs_number = _count_records_in_fastq(fastq_fpath)
        with open(pairs_counts_fpath, 'w') as out:
            out.write(str(pairs_number))
        return pairs_number
Example #14
def set_up_dirs(proc_name, output_dir=None, work_dir=None, log_dir=None):
    """ Creates output_dir, work_dir, and sets up log
    """
    output_dir = safe_mkdir(
        adjust_path(output_dir or join(os.getcwd(), proc_name)), 'output_dir')
    debug('Saving results into ' + output_dir)

    work_dir = safe_mkdir(work_dir or join(output_dir, 'work'),
                          'working directory')
    info('Using work directory ' + work_dir)

    log_fpath = set_up_log(log_dir or safe_mkdir(join(work_dir, 'log')),
                           proc_name + '.log')

    return output_dir, work_dir, log_fpath
Example #15
def find_fastq_pairs(fpaths):
    info('Finding FastQ pairs...')
    fastqs_by_sample_name = dict()
    for fpath in fpaths:
        fn, ext = splitext_plus(basename(fpath))
        if ext in ['.fq', '.fq.gz', '.fastq', '.fastq.gz']:
            sname, l_fpath, r_fpath = None, None, None
            if fn.endswith('_1'):
                sname = fn[:-2]
                l_fpath = fpath
            if fn.endswith('_R1'):
                sname = fn[:-3]
                l_fpath = fpath
            if fn.endswith('_2'):
                sname = fn[:-2]
                r_fpath = fpath
            if fn.endswith('_R2'):
                sname = fn[:-3]
                r_fpath = fpath

            if sname:
                m = re.match(r'(.*)_S\d+', sname)
                if m:
                    sname = m.group(1)
                sname = sname.replace('-', '_')
            else:
                sname = fn
                info('Cannot detect read pair suffix (_1/_R1/_2/_R2) for ' + fn + ', using file name as sample name')

            l, r = fastqs_by_sample_name.get(sname, (None, None))
            if l and l_fpath:
                critical('Duplicated left FastQ files for ' + sname + ': ' +
                         l + ' and ' + l_fpath)
            if r and r_fpath:
                critical('Duplicated right FastQ files for ' + sname + ': ' +
                         r + ' and ' + r_fpath)
            fastqs_by_sample_name[sname] = l or l_fpath, r or r_fpath

    fixed_fastqs_by_sample_name = dict()
    for sname, (l, r) in fastqs_by_sample_name.items():
        if not l:
            err('ERROR: for sample ' + sname + ', left reads not found')
        if not r:
            err('ERROR: for sample ' + sname + ', right reads not found')
        if l and r:
            fixed_fastqs_by_sample_name[sname] = l, r

    return fixed_fastqs_by_sample_name
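How the suffix rules above pair files, shown with hypothetical paths; note that the Illumina `_S<digits>` suffix is stripped and dashes become underscores in sample names:

# find_fastq_pairs([
#     'SampleA_R1.fastq.gz', 'SampleA_R2.fastq.gz',
#     'Sample-B_S3_1.fq.gz', 'Sample-B_S3_2.fq.gz',
# ])
# -> {'SampleA':  ('SampleA_R1.fastq.gz', 'SampleA_R2.fastq.gz'),
#     'Sample_B': ('Sample-B_S3_1.fq.gz', 'Sample-B_S3_2.fq.gz')}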
Example #16
def annotate_target(work_dir, target_bed, genome_build):
    output_fpath = intermediate_fname(work_dir, target_bed, 'ann')
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    annotate_bed_py = which('annotate_bed.py')
    if not annotate_bed_py:
        critical(
            'Error: annotate_bed.py not found in PATH, please install TargQC.')

    cmdline = '{annotate_bed_py} {target_bed} -g {genome_build} -o {output_fpath}'.format(
        **locals())
    run(cmdline, output_fpath, stdout_to_outputfile=False)
    output_fpath = clean_bed(output_fpath, work_dir)
    return output_fpath
Example #17
def align(work_dir, sample_name, l_fpath, r_fpath, bwa, smb, bwa_prefix, dedup=True, threads=1):
    info('Running bwa to align reads...')
    bam_fpath = make_bam_fpath(work_dir)
    if can_reuse(bam_fpath, [l_fpath, r_fpath]):
        return bam_fpath

    tmp_dirpath = join(work_dir, 'sambamba_tmp_dir')
    safe_mkdir(tmp_dirpath)

    bwa_cmdline = ('{bwa} mem -t {threads} -v 2 {bwa_prefix} {l_fpath} {r_fpath} | ' +
                   '{smb} view /dev/stdin -t {threads} -f bam -S -o - | ' +
                   '{smb} sort /dev/stdin -t {threads} --tmpdir {tmp_dirpath} -o {bam_fpath}').format(**locals())
    run(bwa_cmdline, output_fpath=bam_fpath, stdout_to_outputfile=False)

    if dedup:
        dedup_bam_fpath = add_suffix(bam_fpath, 'dedup')
        dedup_cmdl = '{smb} markdup -t {threads} {bam_fpath} {dedup_bam_fpath}'.format(**locals())
        run(dedup_cmdl, output_fpath=dedup_bam_fpath, stdout_to_outputfile=False)
        verify_bam(dedup_bam_fpath)
        os.rename(dedup_bam_fpath, bam_fpath)

    sambamba.index_bam(bam_fpath)

    # samtools view -b -S -u - |
    # sambamba sort -N -t 8 -m 682M --tmpdir /Molly/saveliev/cancer-dream-syn3/work/align/syn3-normal/split/tx/tmpwdXndE/syn3-normal-sort-1_20000000-sorttmp-full
    # -o /Molly/saveliev/cancer-dream-syn3/work/align/syn3-normal/split/tx/tmpwdXndE/syn3-normal-sort-1_20000000.bam
    # /dev/stdin

    # if dedup:
    #     info()
    #     info('Calling SamBlaster to mark duplicates')
    #     markdup_sam_fpath = markdup_sam(sam_fpath, samblaster)
    #     if markdup_sam_fpath:
    #         sam_fpath = markdup_sam_fpath
    # info()

    # info('Converting to BAM')
    # cmdline = sambamba.get_executable() + ' view -t {threads} -S -f bam {sam_fpath}'.format(**locals())
    # run(cmdline, output_fpath=bam_fpath, reuse=cfg.reuse_intermediate)
    #
    # info()
    # info('Sorting BAM')
    # prefix = splitext(sorted_bam_fpath)[0]
    # cmdline = sambamba.get_executable() + ' sort -t {threads} {bam_fpath} -o {sorted_bam_fpath}'.format(**locals())
    # run(cmdline, output_fpath=sorted_bam_fpath, stdout_to_outputfile=False, reuse=cfg.reuse_intermediate)

    return bam_fpath
Example #18
def _prep_report_data(sample, depth_stats, reads_stats, indels_stats, target_stats,
                      target, num_pairs_by_sample, genome, depth_threshs, fai_fpath=None):
    sample.avg_depth = depth_stats['ave_depth']

    if num_pairs_by_sample and sample.name in num_pairs_by_sample:
        reads_stats['original_num_reads'] = num_pairs_by_sample[sample.name] * 2

    chrom_lengths = reference_data.get_chrom_lengths(genome=genome, fai_fpath=fai_fpath)
    if 'Y' in chrom_lengths or 'chrY' in chrom_lengths:
        reads_stats['gender'] = determine_sex(sample.work_dir, sample.bam, depth_stats['ave_depth'],
                                              genome, target.get_capture_bed())
        info()

    if 'bases_by_depth' in depth_stats:
        depth_stats['bases_within_threshs'], depth_stats['rates_within_threshs'] = calc_bases_within_threshs(
            depth_stats['bases_by_depth'],
            target_stats['target_size'] if not target.is_wgs else target_stats['reference_size'],
            depth_threshs)

        if depth_stats['median_depth'] > 0:
            depth_stats['wn_20_percent'] = calc_rate_within_normal(
                depth_stats['bases_by_depth'],
                depth_stats['median_depth'],
                target_stats['target_size'] if not target.is_wgs else target_stats['reference_size'])

    if target_stats['target_size']:
        target.bases_num = target_stats['target_size']
        target.fraction  = target_stats['target_fraction']
    else:
        target.bases_num = target_stats['reference_size']

    reads_stats['mapped_dedup'] = number_of_mapped_reads(sample.work_dir, sample.bam, dedup=True)

    if not target.is_wgs:
        reads_stats['mapped_dedup_on_target'] = number_mapped_reads_on_target(
            sample.work_dir, target.get_capture_bed().cut(range(3)).saveas().fn, sample.bam, dedup=True, target_name='target') or 0

        reads_stats['mapped_dedup_on_padded_target'] = number_mapped_reads_on_target(
            sample.work_dir, target.padded_bed_fpath, sample.bam, dedup=True, target_name='padded_target') or 0

    else:
        cds_bed = get_merged_cds(genome)
        info('Using the CDS reference BED to calc "reads on CDS"')
        reads_stats['mapped_dedup_on_exome'] = number_mapped_reads_on_target(
            sample.work_dir, cds_bed, sample.bam, dedup=True, target_name='exome') or 0

    return depth_stats, reads_stats, indels_stats
Example #19
def _get_qualimap_version(tool_cmdline):
    cmdline = tool_cmdline + ' -version'  # actually, Qualimap doesn't have -version option

    version = None
    with subprocess.Popen(cmdline,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          shell=True).stdout as stdout:
        out = stdout.read().strip()
        if isinstance(out, bytes):  # Python 3: pipe output is bytes
            out = out.decode(errors='replace')
        flag = "QualiMap v."
        if out.startswith(flag):
            version = out.split(flag)[-1].strip()
    if not version:
        info('WARNING: could not determine Qualimap version, using 1.0')
        return '1.0'
    if len(version.split('.')) > 2:  # keep only major.minor
        version = '.'.join(version.split('.')[:2])
    return version
Example #20
def _get_qualimap_version(tool_cmdline):
    cmdline = tool_cmdline + ' -version'  # actually, Qualimap doesn't have -version option

    version = None
    with subprocess.Popen(cmdline,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          shell=True).stdout as stdout:
        out = stdout.read().strip()
        if isinstance(out, bytes):  # Python 3: pipe output is bytes
            out = out.decode(errors='replace')
        flag = "QualiMap v."
        if out.startswith(flag):
            version = out.split(flag)[-1].strip()
    if not version:
        info('WARNING: could not determine Qualimap version, using 1.0')
        return '1.0'
    if len(version.split('.')) > 2:  # keep only major.minor
        version = '.'.join(version.split('.')[:2])
    return version
Example #21
def make_general_reports(view, samples, target, genome, depth_threshs, bed_padding,
                         num_pairs_by_sample=None, reuse=False, is_debug=False, reannotate=False, fai_fpath=None):
    if all(all(can_reuse(fp, [s.bam, target.qualimap_bed_fpath] if target.bed else s.bam)
               for fp in _qualimap_outputs(s))
           for s in samples):
        debug('All QualiMap files for all samples exist and newer than BAMs and BEDs, reusing')
    else:
        info('Running QualiMap...')
        view.run(runner.run_qualimap,
            [[s.work_dir, s.qualimap_dirpath, _qualimap_outputs(s), s.bam, genome, target.qualimap_bed_fpath, view.cores_per_job]
             for s in samples])

        for s in samples:
            for fp in _qualimap_outputs(s):
                verify_file(fp, is_critical=True)

    summary_reports = []

    for sample in samples:
        info('-'*70)
        info(sample.name)
        debug('-'*70)
        debug('Parsing QualiMap results...')
        depth_stats, reads_stats, indels_stats, target_stats = parse_qualimap_results(sample)

        _prep_report_data(sample, depth_stats, reads_stats, indels_stats, target_stats,
                          target, num_pairs_by_sample, genome, depth_threshs, fai_fpath=fai_fpath)

        r = _build_report(depth_stats, reads_stats, indels_stats, sample, target,
                          depth_threshs, bed_padding, sample_num=len(samples), is_debug=is_debug,
                          reannotate=reannotate)
        summary_reports.append(r)

    return summary_reports
Example #22
def read_samples(args):
    bam_by_sample = find_bams(args)
    if bam_by_sample:
        info('Found ' + str(len(bam_by_sample)) + ' BAM file' +
             ('s' if len(bam_by_sample) > 1 else ''))

    input_not_bam = [
        verify_file(fpath) for fpath in args
        if adjust_path(fpath) not in bam_by_sample
    ]
    input_not_bam = [fpath for fpath in input_not_bam if fpath]
    fastqs_by_sample = dict()
    if not input_not_bam and not bam_by_sample:
        critical('No correct input files')
    if input_not_bam:
        info('Found ' + str(len(input_not_bam)) + ' non-BAM input files')
        fastqs_by_sample = find_fastq_pairs(input_not_bam)
        if fastqs_by_sample:
            info('Found ' + str(len(fastqs_by_sample)) + ' FastQ pairs')
        intersection = set(fastqs_by_sample.keys()) & set(bam_by_sample.keys())
        if intersection:
            critical('The following samples both had input BAMs and FastQ: ' +
                     ', '.join(list(intersection)))

    return fastqs_by_sample, bam_by_sample
Example #23
def main():
    options = [
        (['-g', '--genome'], dict(
            dest='genome',
            help='Genome build. Accepted values: ' + ', '.join(ebl.SUPPORTED_GENOMES),
        )),
    ]
    parser = OptionParser()
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()

    if not opts.genome:
        critical('Error: please specify the genome build with -g (e.g. `-g hg19`)')
    genome = opts.genome

    debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ebl.SUPPORTED_GENOMES))

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x: x[ebl.BedCols.FEATURE] == 'CDS')
    features_bed = features_bed.filter(ebl.get_only_canonical_filter(genome))

    info('Saving CDS regions...')
    output_fpath = adjust_path(join(dirname(__file__), pardir, genome, 'bed', 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        features_bed.cut(range(6)).saveas(tx)
    info('Done, saved to ' + output_fpath)
Example #24
def run_qualimap(work_dir,
                 output_dir,
                 output_fpaths,
                 bam_fpath,
                 genome,
                 bed_fpath=None,
                 threads=1):
    info('Analysing ' + bam_fpath)

    safe_mkdir(dirname(output_dir))
    safe_mkdir(output_dir)

    mem_m = get_qualimap_max_mem(bam_fpath)
    mem = str(int(mem_m)) + 'M'
    mem_cmdl = '--java-mem-size=' + mem

    cmdline = (find_executable() +
               ' bamqc --skip-duplicated -nt {threads} {mem_cmdl} -nr 5000 '
               '-bam {bam_fpath} -outdir {output_dir}')

    if genome.startswith('hg') or genome.startswith('GRCh'):
        cmdline += ' -gd HUMAN'
    if genome.startswith('mm'):
        cmdline += ' -gd MOUSE'

    if bed_fpath:
        cmdline += ' -gff {bed_fpath}'
        debug('Using amplicons/capture panel ' + bed_fpath)

    cmdline = cmdline.format(**locals())
    if not all(
            can_reuse(fp, [bam_fpath, bed_fpath] if bed_fpath else [bam_fpath])
            for fp in output_fpaths):
        for fp in output_fpaths:
            if isfile(fp):
                os.remove(fp)
        try:
            run(cmdline, env_vars=dict(DISPLAY=None))
        except subprocess.CalledProcessError as e:
            if 'The alignment file is unsorted.' in e.output:
                info()
                info('BAM file is unsorted; trying to sort and rerun QualiMap')
                sorted_bam_fpath = sort_bam(bam_fpath)
                cmdline = cmdline.replace(bam_fpath, sorted_bam_fpath)
                run(cmdline, env_vars=dict(DISPLAY=None))

    if not all(
            verify_file(
                fp, cmp_f=[bam_fpath, bed_fpath] if bed_fpath else [bam_fpath])
            for fp in output_fpaths):
        critical('Some of the QualiMap results were not generated')

    return output_dir
Example #25
def bam_to_bed_nocnf(bam_fpath, bedtools='bedtools', gzip='gzip'):
    # From: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    info('Converting the BAM to BED to save some memory.')
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + '.bed.gz'
    cmdline = '{bedtools} bamtobed -i {bam_fpath} | {gzip} > {bam_bed_fpath}'.format(
        **locals())
    info(cmdline)
    os.system(cmdline)
    bam_bed_fpath = verify_file(bam_bed_fpath)
    if bam_bed_fpath:
        info('Done, saved to ' + bam_bed_fpath)
    else:
        err('Error, result is non-existent or empty')
    return bam_bed_fpath
Example #26
def run_qualimap(work_dir, output_dir, output_fpaths, bam_fpath, genome, bed_fpath=None, threads=1):
    info('Analysing ' + bam_fpath)

    safe_mkdir(dirname(output_dir))
    safe_mkdir(output_dir)

    mem_m = get_qualimap_max_mem(bam_fpath)
    mem = str(int(mem_m)) + 'M'
    mem_cmdl = '--java-mem-size=' + mem

    cmdline = (find_executable() + ' bamqc --skip-duplicated -nt {threads} {mem_cmdl} -nr 5000 '
        '-bam {bam_fpath} -outdir {output_dir}')

    if genome.startswith('hg') or genome.startswith('GRCh'):
        cmdline += ' -gd HUMAN'
    if genome.startswith('mm'):
        cmdline += ' -gd MOUSE'

    if bed_fpath:
        cmdline += ' -gff {bed_fpath}'
        debug('Using amplicons/capture panel ' + bed_fpath)

    cmdline = cmdline.format(**locals())
    if not all(can_reuse(fp, [bam_fpath, bed_fpath] if bed_fpath else [bam_fpath]) for fp in output_fpaths):
        for fp in output_fpaths:
            if isfile(fp):
                os.remove(fp)
        try:
            run(cmdline, env_vars=dict(DISPLAY=None))
        except subprocess.CalledProcessError as e:
            if 'The alignment file is unsorted.' in e.output:
                info()
                info('BAM file is unsorted; trying to sort and rerun QualiMap')
                sorted_bam_fpath = sort_bam(bam_fpath)
                cmdline = cmdline.replace(bam_fpath, sorted_bam_fpath)
                run(cmdline, env_vars=dict(DISPLAY=None))

    if not all(verify_file(fp, cmp_f=[bam_fpath, bed_fpath] if bed_fpath else [bam_fpath]) for fp in output_fpaths):
        critical('Some of the QualiMap results were not generated')

    return output_dir
Example #27
def _annotate(bed,
              ref_bed,
              chr_order,
              fai_fpath,
              work_dir,
              ori_col_num,
              high_confidence=False,
              reannotate=False,
              is_debug=False,
              **kwargs):
    # if genome:
    # genome_fpath = cut(fai_fpath, 2, output_fpath=intermediate_fname(work_dir, fai_fpath, 'cut2'))
    # intersection = bed.intersect(ref_bed, sorted=True, wao=True, g='<(cut -f1,2 ' + fai_fpath + ')')
    # intersection = bed.intersect(ref_bed, sorted=True, wao=True, genome=genome.split('-')[0])
    # else:

    intersection_bed = None
    intersection_fpath = None

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    if is_debug:
        intersection_fpath = join(work_dir, 'intersection.bed')
        if isfile(intersection_fpath):
            info('Loading from ' + intersection_fpath)
            intersection_bed = BedTool(intersection_fpath)
    if not intersection_bed:
        if count_bed_cols(fai_fpath) == 2:
            debug('Fai file has 2 fields: ' + fai_fpath)
            intersection_bed = bed.intersect(ref_bed, wao=True, g=fai_fpath)
        else:
            debug('Fai file has ' + str(count_bed_cols(fai_fpath)) + ' fields, not 2')
            intersection_bed = bed.intersect(ref_bed, wao=True)
    if is_debug and not isfile(intersection_fpath):
        intersection_bed.saveas(intersection_fpath)
        debug('Saved intersection to ' + intersection_fpath)

    total_annotated = 0
    total_uniq_annotated = 0
    total_off_target = 0

    met = set()

    overlaps_by_tx_by_gene_by_loc = OrderedDefaultDict(
        lambda: OrderedDefaultDict(lambda: defaultdict(list)))
    # off_targets = list()

    expected_fields_num = ori_col_num + len(ebl.BedCols.cols[:-4]) + 1
    for i, intersection_fields in enumerate(intersection_bed):
        inters_fields_list = list(intersection_fields)
        if len(inters_fields_list) < expected_fields_num:
            critical(f'Cannot parse the reference BED file - unexpected number of fields: '
                     f'{len(inters_fields_list)} in {inters_fields_list} '
                     f'(less than {expected_fields_num})')

        a_chr, a_start, a_end = intersection_fields[:3]
        a_extra_columns = intersection_fields[3:ori_col_num]

        overlap_fields = [None for _ in ebl.BedCols.cols]

        overlap_fields[:len(intersection_fields[ori_col_num:-1])] = \
            intersection_fields[ori_col_num:-1]
        keep_gene_column = not reannotate
        a_gene = None
        if keep_gene_column:
            a_gene = a_extra_columns[0]

        e_chr = overlap_fields[0]
        overlap_size = int(intersection_fields[-1])
        assert e_chr == '.' or a_chr == e_chr, f'Error on line {i}: chromosomes don\'t match ({a_chr} vs {e_chr}). Line: {intersection_fields}'

        # fs = [None for _ in ebl.BedCols.cols]
        # fs[:3] = [a_chr, a_start, a_end]
        reg = (a_chr, int(a_start), int(a_end), tuple(a_extra_columns))

        if e_chr == '.':
            total_off_target += 1
            # off_targets.append(fs)
            overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)

        else:
            # fs[3:-1] = db_feature_fields[3:-1]
            total_annotated += 1
            if (a_chr, a_start, a_end) not in met:
                total_uniq_annotated += 1
                met.add((a_chr, a_start, a_end))

            e_gene = overlap_fields[ebl.BedCols.GENE]
            if keep_gene_column and e_gene != a_gene:
                overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
            else:
                transcript_id = overlap_fields[ebl.BedCols.ENSEMBL_ID]
                overlaps_by_tx_by_gene_by_loc[reg][e_gene][transcript_id].append(
                    (overlap_fields, overlap_size))

    info('  Total annotated regions: ' + str(total_annotated))
    info('  Total unique annotated regions: ' + str(total_uniq_annotated))
    info('  Total off target regions: ' + str(total_off_target))
    info('Resolving ambiguities...')
    annotated = _resolve_ambiguities(overlaps_by_tx_by_gene_by_loc, chr_order,
                                     **kwargs)

    return annotated
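`OrderedDefaultDict` is not in the standard library; a minimal recipe consistent with how it is used above (an insertion-ordered mapping with a default factory) might be:

from collections import OrderedDict

class OrderedDefaultDict(OrderedDict):
    def __init__(self, default_factory=None, *args, **kwargs):
        super(OrderedDefaultDict, self).__init__(*args, **kwargs)
        self.default_factory = default_factory

    def __missing__(self, key):
        # Like defaultdict: create, store, and return the default value.
        if self.default_factory is None:
            raise KeyError(key)
        self[key] = value = self.default_factory()
        return value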
Example #28
def _save_best_details_for_each_gene(depth_threshs, samples, output_dir):
    metric_storage = get_detailed_metric_storage(depth_threshs)

    report = PerRegionSampleReport(sample='Best',
                                   metric_storage=metric_storage)
    report.add_record(
        'Sample', 'contains best values from all samples: ' +
        ', '.join([s.name for s in samples]))

    total_regions = 0
    fpaths = [
        s.targqc_region_tsv for s in samples
        if verify_file(s.targqc_region_tsv)
    ]
    if not fpaths:
        err('No targetcov detailed per-gene report was generated; skipping.')
        return None

    open_tsv_files = [open(fpath) for fpath in fpaths]

    first_col = 0
    while True:
        lines_for_each_sample = [next(f, None) for f in open_tsv_files]
        if not all(lines_for_each_sample):
            break
        l = lines_for_each_sample[0]
        if l.startswith('##'):
            continue
        elif l.startswith('#'):
            if l.startswith('#Sample'):
                first_col = 1
            break

    while True:
        lines_for_each_sample = [next(f, None) for f in open_tsv_files]
        if not all(lines_for_each_sample):
            break

        if all([
                not l.startswith('#')
                and ('Whole-Gene' in l or 'Gene-Exon' in l)
                for l in lines_for_each_sample
        ]):
            shared_fields = lines_for_each_sample[0].split(
                '\t')[first_col:first_col + 9]
            reg = report.add_row()
            reg.add_record('Chr', get_val(shared_fields[0]))
            reg.add_record('Start', get_int_val(shared_fields[1]))
            reg.add_record('End', get_int_val(shared_fields[2]))
            reg.add_record('Size', get_int_val(shared_fields[3]))
            reg.add_record('Gene', get_val(shared_fields[4]))
            reg.add_record('Strand', get_val(shared_fields[5]))
            reg.add_record('Feature', get_val(shared_fields[6]))
            reg.add_record('Biotype', get_val(shared_fields[7]))
            reg.add_record('Transcript', get_val(shared_fields[8]))

            min_depths, ave_depths, stddevs, withins = ([], [], [], [])
            percents_by_threshs = {t: [] for t in depth_threshs}

            for l in lines_for_each_sample:
                fs = l.split('\t')

                min_depths.append(get_int_val(fs[first_col + 9]))
                ave_depths.append(get_float_val(fs[first_col + 10]))
                stddevs.append(get_float_val(fs[first_col + 11]))
                withins.append(get_float_val(fs[first_col + 12]))
                for t, f in zip(depth_threshs, fs[first_col + 13:]):
                    percents_by_threshs[t].append(get_float_val(f))

            # counting bests
            reg.add_record('Min depth', select_best(min_depths))
            reg.add_record('Ave depth', select_best(ave_depths))
            reg.add_record('Std dev', select_best(stddevs, max))
            reg.add_record('W/n 20% of median depth', select_best(withins))
            for t in depth_threshs:
                reg.add_record('{}x'.format(t),
                               select_best(percents_by_threshs[t]))

            total_regions += 1

    for f in open_tsv_files:
        f.close()

    gene_report_basename = add_suffix(samples[0].targqc_region_tsv, 'best')
    txt_rep_fpath = report.save_txt(
        join(output_dir, gene_report_basename + '.txt'))
    tsv_rep_fpath = report.save_tsv(
        join(output_dir, gene_report_basename + '.tsv'))
    info('')
    info('Best values for the regions (total ' + str(total_regions) +
         ') saved into:')
    info('  ' + txt_rep_fpath)

    return txt_rep_fpath
Example #29
def _build_report(depth_stats, reads_stats, mm_indels_stats, sample, target,
                  depth_threshs, bed_padding, sample_num, is_debug=False, reannotate=False):
    report = SampleReport(sample, metric_storage=get_header_metric_storage(depth_threshs, is_wgs=target.bed_fpath is None, padding=bed_padding))

    def _add(_metric_name, _val, url=None):
        return report.add_record(_metric_name, _val, silent=(sample_num > 1 and not is_debug), url=url)

    _add('Qualimap', 'Qualimap', url=relpath(sample.qualimap_html_fpath, sample.dirpath))
    if reads_stats.get('gender') is not None:
        _add('Sex', reads_stats['gender'])

    debug('* General coverage statistics *')
    _add('Reads', reads_stats['total'])
    _add('Mapped reads', reads_stats['mapped'])
    # _add('Unmapped reads', reads_stats['totaAvgl'] - reads_stats['mapped'])
    percent_mapped = 1.0 * (reads_stats['mapped'] or 0) / reads_stats['total'] if reads_stats['total'] else None
    if percent_mapped is not None and percent_mapped > 1.0: percent_mapped = 1.0
    _add('Percentage of mapped reads', percent_mapped)
    # percent_unmapped = 1.0 * (reads_stats['total'] - reads_stats['mapped']) / reads_stats['total'] if reads_stats['total'] else None
    # assert percent_unmapped <= 1.0 or percent_unmapped is None, str(percent_unmapped)
    # _add('Percentage of unmapped reads', percent_unmapped)
    if reads_stats.get('mapped_paired') is not None:
        total_paired_reads_percent = 1.0 * (reads_stats['mapped_paired'] or 0) / reads_stats['total'] if reads_stats['total'] else None
        if total_paired_reads_percent is not None and total_paired_reads_percent > 1.0: total_paired_reads_percent = 1.0
        _add('Properly paired mapped reads percent', total_paired_reads_percent)
    # if reads_stats.get('paired') is not None:
    #     total_paired_reads_pecent = 1.0 * (reads_stats['paired'] or 0) / reads_stats['total'] if reads_stats['total'] else None
    #     assert total_paired_reads_pecent <= 1.0 or total_paired_reads_pecent is None, str(total_paired_reads_pecent)
    #     _add('Properly paired reads percent', total_paired_reads_pecent)
    # if dedup_bam_stats:
    # dup_rate = 1 - (1.0 * dedup_bam_stats['mapped'] / bam_stats['mapped']) if bam_stats['mapped'] else None
    _add('Duplication rate', reads_stats['dup_rate'])
    # _add('Dedupped mapped reads', reads_stats['mapped'] - reads_stats[''])
    _add('Median GC', reads_stats['median_gc'])
    _add('Median insert size', reads_stats['median_ins_size'])

    debug()

    if not target.is_wgs:
        debug('* Target coverage statistics *')
        if target.original_bed_fpath:
            _add('Target', target.original_bed_fpath)
            if count_bed_cols(target.original_bed_fpath) == 3 or reannotate:
                _add('Ready target (clean, sorted and annotated)', target.capture_bed_fpath)
        else:
            _add('Target', target.capture_bed_fpath)
        _add('Bases in target', target.bases_num)
        _add('Percentage of reference', target.fraction)
        _add('Regions in target', target.regions_num)
        _add('Scope', 'targeted')
        _add('Genes in target', len(target.gene_keys_list))
    else:
        debug('* Genome coverage statistics *')
        _add('Target', 'whole genome')
        _add('Reference size', target.bases_num)
        _add('Scope', 'WGS')

    trg_type = 'target' if not target.is_wgs else 'genome'

    if 'bases_within_threshs' in depth_stats:
        bases_within_threshs = depth_stats['bases_within_threshs']
        v_covered_bases_in_targ = list(bases_within_threshs.items())[0][1]
        v_percent_covered_bases_in_targ = 1.0 * (v_covered_bases_in_targ or 0) / target.bases_num if target.bases_num else None
        if v_percent_covered_bases_in_targ is not None and v_percent_covered_bases_in_targ > 1.0: v_percent_covered_bases_in_targ = 1.0
        _add('Covered bases in ' + trg_type, v_covered_bases_in_targ)
        _add('Percentage of ' + trg_type + ' covered by at least 1 read', v_percent_covered_bases_in_targ)

    if not target.is_wgs:
        debug('Getting number of mapped reads on target...')
        # mapped_reads_on_target = number_mapped_reads_on_target(cnf, target_info.bed, bam_fpath)
        if 'mapped_dedup_on_target' in reads_stats:
            # _add('Reads mapped on target', reads_stats['mapped_on_target'])
            debug('Unique mapped reads on target: ' + str(reads_stats['mapped_dedup_on_target']))
            percent_mapped_dedup_on_target = 1.0 * reads_stats['mapped_dedup_on_target'] / reads_stats['mapped_dedup'] if reads_stats['mapped_dedup'] != 0 else None
            if percent_mapped_dedup_on_target is not None and percent_mapped_dedup_on_target > 1.0: percent_mapped_dedup_on_target = 1.0
            _add('Percentage of reads mapped on target', percent_mapped_dedup_on_target)

            percent_mapped_dedup_off_target = 1.0 * (reads_stats['mapped_dedup'] - reads_stats['mapped_dedup_on_target']) / reads_stats['mapped_dedup'] if reads_stats['mapped_dedup'] != 0 else None
            if percent_mapped_dedup_off_target is not None and percent_mapped_dedup_off_target > 1.0: percent_mapped_dedup_off_target = 1.0
            _add('Percentage of reads mapped off target', percent_mapped_dedup_off_target)

            percent_usable = 1.0 * reads_stats['mapped_dedup_on_target'] / reads_stats['total'] if reads_stats['total'] != 0 else None
            if percent_usable is not None and percent_usable > 1.0: percent_usable = 1.0  # edge case: multimappers can make alignment counts exceed read counts
            _add('Percentage of usable reads', percent_usable)

        read_bases_on_targ = int(target.bases_num * depth_stats['ave_depth'])  # sum of all coverages
        _add('Read bases mapped on target', read_bases_on_targ)

        if 'mapped_dedup_on_padded_target' in reads_stats:
            # _add('Reads mapped on padded target', reads_stats['mapped_reads_on_padded_target'])
            percent_mapped_on_padded_target = 1.0 * reads_stats['mapped_dedup_on_padded_target'] / reads_stats['mapped_dedup'] if reads_stats['mapped_dedup'] else None
            if percent_mapped_on_padded_target is not None and percent_mapped_on_padded_target > 1.0: percent_mapped_on_padded_target = 1.0
            _add('Percentage of reads mapped on padded target', percent_mapped_on_padded_target)

    elif 'mapped_dedup_on_exome' in reads_stats:
        # _add('Reads mapped on target', reads_stats['mapped_on_target'])
        percent_mapped_on_exome = 1.0 * reads_stats['mapped_dedup_on_exome'] / reads_stats['mapped_dedup'] if reads_stats['mapped_dedup'] != 0 else None
        if percent_mapped_on_exome:
            if percent_mapped_on_exome > 1.0: percent_mapped_on_exome = 1.0
            _add('Percentage of reads mapped on exome', percent_mapped_on_exome)
            percent_mapped_off_exome = 1.0 - percent_mapped_on_exome
            _add('Percentage of reads mapped off exome', percent_mapped_off_exome)

        percent_usable = 1.0 * reads_stats['mapped_dedup'] / reads_stats['total'] if reads_stats['total'] != 0 else None
        if percent_usable is not None and percent_usable > 1.0: percent_usable = 1.0
        _add('Percentage of usable reads', percent_usable)

    debug()
    _add('Mean ' + trg_type + ' coverage depth', depth_stats['ave_depth'])
    if 'original_num_reads' in reads_stats:
        _add('Original reads', reads_stats['original_num_reads'])
        times_downsampled = 1.0 * reads_stats['original_num_reads'] / reads_stats['total']
        est_full_cov = times_downsampled * depth_stats['ave_depth']
        _add('Estimated ' + trg_type + ' full coverage depth', est_full_cov)
    _add('Median ' + trg_type + ' coverage depth', depth_stats['median_depth'])
    if depth_stats['median_depth'] > 0:
        _add('Std. dev. of ' + trg_type + ' coverage depth', depth_stats['stddev_depth'])
    # _add('Minimal ' + trg_type + ' coverage depth', depth_stats['min_depth'])
    # _add('Maximum ' + trg_type + ' coverage depth', depth_stats['max_depth'])
    if 'wn_20_percent' in depth_stats:
        if depth_stats['wn_20_percent'] > 1.0: depth_stats['wn_20_percent'] = 1.0
        _add('Percentage of ' + trg_type + ' within 20% of med depth', depth_stats['wn_20_percent'])

    if 'bases_within_threshs' in depth_stats:
        for depth, bases in depth_stats['bases_within_threshs'].items():
            fraction_val = 1.0 * (bases or 0) / target.bases_num if target.bases_num else 0
            if fraction_val > 1.0: fraction_val = 1.0
            if fraction_val > 0:
                _add('Part of ' + trg_type + ' covered at least by ' + str(depth) + 'x', fraction_val)
    debug()

    _add('Read mean length', reads_stats['ave_len'])
    _add('Read min length', reads_stats['min_len'])
    _add('Read max length', reads_stats['max_len'])
    _add('Mean Mapping Quality', mm_indels_stats['mean_mq'])
    _add('Mismatches', mm_indels_stats['mismatches'])
    _add('Insertions', mm_indels_stats['insertions'])
    _add('Deletions', mm_indels_stats['deletions'])
    _add('Homopolymer indels', mm_indels_stats['homo_indels'])

    debug()
    info('Saving reports...')
    report.save_json(sample.targqc_json_fpath)
    report.save_txt(sample.targqc_txt_fpath)
    report.save_html(sample.targqc_html_fpath, caption='Target coverage statistics for ' + sample.name)
    debug()
    debug('Saved to ' + dirname(report.txt_fpath))
    return report
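`_build_report` repeats one pattern throughout: a ratio that may be `None` (missing or zero denominator) and must be capped at 1.0. A helper like the following (not in the original; shown only to make the invariant explicit) would factor it out:

def _rate(numerator, denominator):
    # None when the denominator is missing/zero; otherwise clamp to [0, 1].
    if not denominator:
        return None
    return min(1.0, float(numerator or 0) / denominator)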
Example #30
def _save_best_details_for_each_gene(depth_threshs, samples, output_dir):
    metric_storage = get_detailed_metric_storage(depth_threshs)

    report = PerRegionSampleReport(sample='Best', metric_storage=metric_storage)
    report.add_record('Sample', 'contains best values from all samples: ' + ', '.join([s.name for s in samples]))

    total_regions = 0
    fpaths = [s.targqc_region_tsv for s in samples if verify_file(s.targqc_region_tsv)]
    if not fpaths:
        err('No targetcov detailed per-gene report was generated; skipping.')
        return None

    open_tsv_files = [open(fpath) for fpath in fpaths]

    first_col = 0
    while True:
        lines_for_each_sample = [next(f, None) for f in open_tsv_files]
        if not all(lines_for_each_sample):
            break
        l = lines_for_each_sample[0]
        if l.startswith('##'):
            continue
        elif l.startswith('#'):
            if l.startswith('#Sample'):
                first_col = 1
            break

    while True:
        lines_for_each_sample = [next(f, None) for f in open_tsv_files]
        if not all(lines_for_each_sample):
            break

        if all([not l.startswith('#') and ('Whole-Gene' in l or 'Gene-Exon' in l) for l in lines_for_each_sample]):
            shared_fields = lines_for_each_sample[0].split('\t')[first_col:first_col+9]
            reg = report.add_row()
            reg.add_record('Chr', get_val(shared_fields[0]))
            reg.add_record('Start', get_int_val(shared_fields[1]))
            reg.add_record('End', get_int_val(shared_fields[2]))
            reg.add_record('Size', get_int_val(shared_fields[3]))
            reg.add_record('Gene', get_val(shared_fields[4]))
            reg.add_record('Strand', get_val(shared_fields[5]))
            reg.add_record('Feature', get_val(shared_fields[6]))
            reg.add_record('Biotype', get_val(shared_fields[7]))
            reg.add_record('Transcript', get_val(shared_fields[8]))

            min_depths, ave_depths, stddevs, withins = ([], [], [], [])
            percents_by_threshs = {t: [] for t in depth_threshs}

            for l in lines_for_each_sample:
                fs = l.split('\t')

                min_depths.append(get_int_val(fs[first_col+9]))
                ave_depths.append(get_float_val(fs[first_col+10]))
                stddevs.append(get_float_val(fs[first_col+11]))
                withins.append(get_float_val(fs[first_col+12]))
                for t, f in zip(depth_threshs, fs[first_col+13:]):
                    percents_by_threshs[t].append(get_float_val(f))

            # counting bests
            reg.add_record('Min depth', select_best(min_depths))
            reg.add_record('Ave depth', select_best(ave_depths))
            reg.add_record('Std dev', select_best(stddevs, max))
            reg.add_record('W/n 20% of median depth', select_best(withins))
            for t in depth_threshs:
                reg.add_record('{}x'.format(t), select_best(percents_by_threshs[t]))

            total_regions += 1

    for f in open_tsv_files:
        f.close()

    gene_report_basename = add_suffix(samples[0].targqc_region_tsv, 'best')
    txt_rep_fpath = report.save_txt(join(output_dir, gene_report_basename + '.txt'))
    tsv_rep_fpath = report.save_tsv(join(output_dir, gene_report_basename + '.tsv'))
    info('')
    info('Best values for the regions (total ' + str(total_regions) + ') saved into:')
    info('  ' + txt_rep_fpath)

    return txt_rep_fpath
Example #31
def proc_fastq(samples, parall_view, work_dir, bwa_prefix, downsample_to, num_pairs_by_sample=None, dedup=True):
    num_pairs_by_sample = num_pairs_by_sample or dict()
    if downsample_to:
        # Read pairs counts
        debug()
        if all(s.name in num_pairs_by_sample for s in samples):
            debug('Using read pairs counts extracted from FastQC reports')
        elif all(can_reuse(make_pair_counts_fpath(join(work_dir, s.name)), s.l_fpath) for s in samples):
            debug('Reusing pairs counts, reading from files')
            num_pairs_by_sample = {s.name: int(open(make_pair_counts_fpath(join(work_dir, s.name))).read().strip()) for s in samples}
        else:
            info('Counting read pairs')
            num_pairs = parall_view.run(count_read_pairs, [[s.name, safe_mkdir(join(work_dir, s.name)), s.l_fpath] for s in samples])
            num_pairs_by_sample = {s.name: pairs_count for s, pairs_count in zip(samples, num_pairs)}

        # Downsampling
        debug()
        if all(can_reuse(make_downsampled_fpath(join(work_dir, s.name), s.l_fpath), s.l_fpath) and
               can_reuse(make_downsampled_fpath(join(work_dir, s.name), s.r_fpath), s.r_fpath) for s in samples):
            debug('Reusing downsampled FastQ')
            for s in samples:
                s.l_fpath = make_downsampled_fpath(join(work_dir, s.name), s.l_fpath)
                s.r_fpath = make_downsampled_fpath(join(work_dir, s.name), s.r_fpath)
        else:
            if isinstance(downsample_to, float):
                info('Downsampling FastQ to ' + str(float(downsample_to)) + ' fraction of reads')
            else:
                info('Downsampling FastQ to ' + str(int(downsample_to)) + ' read pairs')
            fastq_pairs = parall_view.run(downsample,
                [[join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, downsample_to, num_pairs_by_sample.get(s.name)]
                 for s in samples])
            for s, (l_r, r_r) in zip(samples, fastq_pairs):
                s.l_fpath = l_r
                s.r_fpath = r_r
    else:
        info('Skipping downsampling')

    debug()
    if all(can_reuse(make_bam_fpath(join(work_dir, s.name)), [s.l_fpath, s.r_fpath]) for s in samples):
        debug('All downsampled BAM exists, reusing')
        for s in samples:
            s.bam = make_bam_fpath(join(work_dir, s.name))
    else:
        bwa = which('bwa')
        smb = sambamba.get_executable()
        if not (bwa and smb):
            if not bwa:
                err('Error: bwa is required for the alignment pipeline')
            if not smb:
                err('Error: sambamba is required for the alignment pipeline')
            critical('Tools required for alignment not found')
        info('Aligning reads to the reference')
        bam_fpaths = parall_view.run(align,
            [[join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, bwa, smb, bwa_prefix, dedup, parall_view.cores_per_job]
             for s in samples])

        bam_fpaths = [verify_bam(b) for b in bam_fpaths]
        if not all(bam_fpaths):
            critical('Some samples were not aligned successfully.')
        for bam, s in zip(bam_fpaths, samples):
            s.bam = bam

    return num_pairs_by_sample
Example #32
def start_targqc(
    work_dir,
    output_dir,
    samples,
    target_bed_fpath,
    parallel_cfg,
    bwa_prefix,
    fai_fpath=None,
    genome=config.genome,
    depth_threshs=config.depth_thresholds,
    downsample_to=config.downsample_fraction,
    padding=config.padding,
    dedup=config.dedup,
    num_pairs_by_sample=None,
    reannotate=config.reannotate,
):
    d = get_description()
    info('*' * len(d))
    info(d)
    info('*' * len(d))
    info()

    fai_fpath = fai_fpath or ref.get_fai(genome)
    target = Target(work_dir,
                    output_dir,
                    fai_fpath,
                    padding=padding,
                    bed_fpath=target_bed_fpath,
                    reannotate=reannotate,
                    genome=genome,
                    is_debug=logger.is_debug)

    fastq_samples = [
        s for s in samples if not s.bam and s.l_fpath and s.r_fpath
    ]
    from targqc.utilz.parallel import parallel_view
    if fastq_samples:
        if not bwa_prefix:
            critical('--bwa-prefix is required when running from fastq')
        with parallel_view(len(fastq_samples), parallel_cfg,
                           join(work_dir, 'sge_fastq')) as view:
            num_pairs_by_sample = proc_fastq(fastq_samples,
                                             view,
                                             work_dir,
                                             bwa_prefix,
                                             downsample_to,
                                             num_pairs_by_sample,
                                             dedup=dedup)

    info()
    for s in samples:
        if s.bam:
            info(s.name + ': using alignment ' + s.bam)

    with parallel_view(len(samples), parallel_cfg, join(work_dir,
                                                        'sge_bam')) as view:
        info('Sorting BAMs...')
        sorted_bams = view.run(
            sort_bam,
            [[s.bam, safe_mkdir(join(work_dir, s.name))] for s in samples])
        for s, sorted_bam in zip(samples, sorted_bams):
            s.bam = sorted_bam

        if all(can_reuse(s.bam + '.bai', s.bam) for s in samples):
            debug('BAM indexes exist')
        else:
            info('Indexing BAMs...')
            view.run(index_bam, [[s.bam] for s in samples])

        info('Making general reports...')
        make_general_reports(view,
                             samples,
                             target,
                             genome,
                             depth_threshs,
                             padding,
                             num_pairs_by_sample,
                             is_debug=logger.is_debug,
                             reannotate=reannotate,
                             fai_fpath=fai_fpath)

    info()
    info('*' * 70)
    tsv_fpath, html_fpath = make_tarqc_html_report(output_dir,
                                                   work_dir,
                                                   samples,
                                                   bed_fpath=target_bed_fpath)
    info('TargQC summary saved in: ')
    info('  ' + html_fpath)
    info('  ' + tsv_fpath)

    info()
    with parallel_view(len(samples), parallel_cfg, join(work_dir,
                                                        'sge_bam')) as view:
        info('Making region-level reports...')
        make_region_reports(view, work_dir, samples, target, genome,
                            depth_threshs)

    info()
    info('*' * 70)
    tsv_region_rep_fpath = combined_regional_reports(work_dir, output_dir,
                                                     samples)

    info()
    info('*' * 70)
    info('TargQC summary saved in: ')
    info('  ' + html_fpath)
    info('  ' + tsv_fpath)
    info('Per-region coverage statistics saved into:')
    info('  ' + tsv_region_rep_fpath)

    return html_fpath
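For orientation, a hypothetical invocation of start_targqc as defined above; Sample and parallel_cfg are assumed stand-ins for the project's actual sample and parallel-configuration objects, which are not shown in this listing:

# Hypothetical usage sketch; Sample and parallel_cfg are illustrative stand-ins.
samples = [Sample('s1', bam='s1.bam'),
           Sample('s2', l_fpath='s2_R1.fq.gz', r_fpath='s2_R2.fq.gz')]
html_report = start_targqc(work_dir='work', output_dir='results',
                           samples=samples, target_bed_fpath='target.bed',
                           parallel_cfg=parallel_cfg, bwa_prefix='ref/GRCh37.fa')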
Example #33
def downsample(work_dir, sample_name, fastq_left_fpath, fastq_right_fpath, downsample_to, num_pairs=None):
    """ get N random headers from a fastq file without reading the
    whole thing into memory
    modified from: http://www.biostars.org/p/6544/
    """
    sample_name = sample_name or splitext(''.join(lc if lc == rc else '' for lc, rc in zip(fastq_left_fpath, fastq_right_fpath)))[0]

    l_out_fpath = make_downsampled_fpath(work_dir, fastq_left_fpath)
    r_out_fpath = make_downsampled_fpath(work_dir, fastq_right_fpath)
    if can_reuse(l_out_fpath, [fastq_left_fpath, fastq_right_fpath]):
        return l_out_fpath, r_out_fpath

    info('Processing ' + sample_name)
    if num_pairs is None:
        info(sample_name + ': counting number of reads in fastq...')
        num_pairs = _count_records_in_fastq(fastq_left_fpath)
    if num_pairs > LIMIT:
        info(sample_name + ': the number of read pairs is higher than ' + str(LIMIT) +
             ', sampling from only the first ' + str(LIMIT))
        num_pairs = LIMIT
    info(sample_name + ': ' + str(num_pairs) + ' read pairs')
    num_downsample_pairs = int(downsample_to * num_pairs) if isinstance(downsample_to, float) else downsample_to
    if num_pairs <= num_downsample_pairs:
        info(sample_name + ': no more than the requested ' + str(num_downsample_pairs) + ' pairs, so skipping downsampling.')
        return fastq_left_fpath, fastq_right_fpath
    else:
        info(sample_name + ': downsampling to ' + str(num_downsample_pairs))
        rand_records = sorted(random.sample(range(num_pairs), num_downsample_pairs))

    info('Opening ' + fastq_left_fpath)
    fh1 = open_gzipsafe(fastq_left_fpath)
    info('Opening ' + fastq_right_fpath)
    fh2 = open_gzipsafe(fastq_right_fpath) if fastq_right_fpath else None

    out_files = (l_out_fpath, r_out_fpath) if r_out_fpath else (l_out_fpath,)

    written_records = 0
    with file_transaction(work_dir, out_files) as tx_out_files:
        if isinstance(tx_out_files, six.string_types):
            tx_out_f1, tx_out_f2 = tx_out_files, None
        else:
            tx_out_f1, tx_out_f2 = tx_out_files
        info('Opening ' + str(tx_out_f1) + ' to write')
        sub1 = open_gzipsafe(tx_out_f1, "w")
        sub2 = None
        if tx_out_f2:
            info('Opening ' + str(tx_out_f2) + ' to write')
            sub2 = open_gzipsafe(tx_out_f2, "w")
        rec_no = -1
        for rr in rand_records:
            while rec_no < rr:
                rec_no += 1
                for i in range(4): fh1.readline()
                if fh2:
                    for i in range(4): fh2.readline()
            for i in range(4):
                sub1.write(fh1.readline())
                if sub2:
                    sub2.write(fh2.readline())
            written_records += 1
            if written_records % 10000 == 0:
                info(sample_name + ': written ' + str(written_records) + ', rec_no ' + str(rec_no + 1))
            if rec_no > num_pairs:
                info(sample_name + ' reached the limit of ' + str(num_pairs) + ' records, stopping.')
                break
        info(sample_name + ': done, written ' + str(written_records) + ', rec_no ' + str(rec_no))
        fh1.close()
        sub1.close()
        if fastq_right_fpath:
            fh2.close()
            sub2.close()

    info(sample_name + ': done downsampling, saved to ' + l_out_fpath + ' and ' + r_out_fpath + ', total ' + str(written_records) + ' paired reads written')
    return l_out_fpath, r_out_fpath
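The sampling trick here is worth isolating: draw the record indices up front with random.sample, sort them, then make a single sequential pass, skipping four lines per unwanted record. A self-contained sketch of just that core (function name hypothetical):

import random

def sample_fastq_records(fh, num_records, num_to_keep):
    """Yield num_to_keep random 4-line FASTQ records from fh in one pass,
    without loading the file into memory."""
    wanted = iter(sorted(random.sample(range(num_records), num_to_keep)))
    target = next(wanted, None)
    # zip on the same iterator groups four consecutive lines per record
    for rec_no, record in enumerate(zip(fh, fh, fh, fh)):
        if target is None:
            break
        if rec_no == target:
            yield ''.join(record)
            target = next(wanted, None)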
Example #34
def _try_run(_cmd, _output_fpath, _input_fpath):
    # `checks` and `env` are captured from the enclosing scope: this is a
    # nested helper excerpted from a larger function
    try:
        info(' '.join(str(x) for x in _cmd) if not isinstance(_cmd, six.string_types) else _cmd)
        _do_run(_cmd, checks, env, _output_fpath, _input_fpath)
    except:
        raise
Example #35
def annotate(input_bed_fpath, output_fpath, work_dir, genome=None,
             reannotate=True, high_confidence=False, only_canonical=False,
             coding_only=False, short=False, extended=False, is_debug=False, **kwargs):

    debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + str(genome) + ' is not supported. Supported: ' + ', '.join(ebl.SUPPORTED_GENOMES))

    if genome:
        fai_fpath = reference_data.get_fai(genome)
        chr_order = reference_data.get_chrom_order(genome)
    else:
        fai_fpath = None
        chr_order = bed_chrom_order(input_bed_fpath)

    input_bed_fpath = sort_bed(input_bed_fpath, work_dir=work_dir, chr_order=chr_order, genome=genome)

    ori_bed = BedTool(input_bed_fpath)
    ori_col_num = ori_bed.field_count()
    reannotate = reannotate or ori_col_num == 3
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    # if reannotate:
    #     bed = BedTool(input_bed_fpath).cut([0, 1, 2])
    #     keep_gene_column = False
    # else:
    #     if col_num > 4:
    #         bed = BedTool(input_bed_fpath).cut([0, 1, 2, 3])
    #     keep_gene_column = True

    # features_bed = features_bed.saveas()
    # cols = features_bed.field_count()
    # if cols < 12:
    #     features_bed = features_bed.each(lambda f: f + ['.']*(12-cols))
    if high_confidence:
        features_bed = features_bed.filter(ebl.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ebl.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ebl.protein_coding_filter)
    # unique_tx_by_gene = find_best_tx_by_gene(features_bed)

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x:
        x[ebl.BedCols.FEATURE] in ['exon', 'CDS', 'stop_codon', 'transcript'])
        # x[ebl.BedCols.ENSEMBL_ID] == unique_tx_by_gene[x[ebl.BedCols.GENE]])

    info('Overlapping regions with Ensembl data')
    if is_debug:
        ori_bed = ori_bed.saveas(join(work_dir, 'bed.bed'))
        debug(f'Saved regions to {ori_bed.fn}')
        features_bed = features_bed.saveas(join(work_dir, 'features.bed'))
        debug(f'Saved features to {features_bed.fn}')
    annotated = _annotate(ori_bed, features_bed, chr_order, fai_fpath, work_dir, ori_col_num,
                          high_confidence=False, reannotate=reannotate, is_debug=is_debug, **kwargs)

    full_header = [ebl.BedCols.names[i] for i in ebl.BedCols.cols]
    add_ori_extra_fields = ori_col_num > 3
    if not reannotate and ori_col_num == 4:
        add_ori_extra_fields = False  # no need to report the original gene field if we are not re-annotating

    info('Saving annotated regions...')
    total = 0
    with file_transaction(work_dir, output_fpath) as tx:
        with open(tx, 'w') as out:
            header = full_header[:6]
            if short:
                header = full_header[:4]
            if extended:
                header = full_header[:-1]
            if add_ori_extra_fields:
                header.append(full_header[-1])

            if extended:
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.TX_OVERLAP_PERCENTAGE] +
                          ': part of region overlapping with transcripts\n')
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.EXON_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with exons\n')
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.CDS_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with protein coding regions\n')
                out.write('\t'.join(header) + '\n')
            for full_fields in annotated:
                fields = full_fields[:6]
                if short:
                    fields = full_fields[:4]
                if extended:
                    fields = full_fields[:-1]
                if add_ori_extra_fields:
                    fields.append(full_fields[-1])

                out.write('\t'.join(map(_format_field, fields)) + '\n')
                total += 1
    
    debug('Saved ' + str(total) + ' total annotated regions')
    return output_fpath
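Both annotate() variants in this listing write through file_transaction, so a failed run never leaves a half-written output. The helper itself is not shown; a minimal single-file sketch of the idiom it appears to implement:

import os
import tempfile
from contextlib import contextmanager

@contextmanager
def file_transaction_sketch(work_dir, out_fpath):
    """Simplified, single-file stand-in for the real helper: write to a
    temporary path and move it into place only if the block succeeds."""
    fd, tx_fpath = tempfile.mkstemp(dir=work_dir)
    os.close(fd)
    try:
        yield tx_fpath
        os.replace(tx_fpath, out_fpath)  # atomic rename on POSIX
    except Exception:
        if os.path.exists(tx_fpath):
            os.remove(tx_fpath)
        raise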
Example #36
def downsample(work_dir,
               sample_name,
               fastq_left_fpath,
               fastq_right_fpath,
               downsample_to,
               num_pairs=None):
    """ get N random headers from a fastq file without reading the
    whole thing into memory
    modified from: http://www.biostars.org/p/6544/
    """
    sample_name = sample_name or splitext(''.join(
        lc if lc == rc else ''
        for lc, rc in zip(fastq_left_fpath, fastq_right_fpath)))[0]

    l_out_fpath = make_downsampled_fpath(work_dir, fastq_left_fpath)
    r_out_fpath = make_downsampled_fpath(work_dir, fastq_right_fpath)
    if can_reuse(l_out_fpath, [fastq_left_fpath, fastq_right_fpath]):
        return l_out_fpath, r_out_fpath

    info('Processing ' + sample_name)
    if num_pairs is None:
        info(sample_name + ': counting number of reads in fastq...')
        num_pairs = _count_records_in_fastq(fastq_left_fpath)
    if num_pairs > LIMIT:
        info(sample_name + ': the number of read pairs is higher than ' +
             str(LIMIT) + ', sampling from only the first ' + str(LIMIT))
        num_pairs = LIMIT
    info(sample_name + ': ' + str(num_pairs) + ' read pairs')
    num_downsample_pairs = int(downsample_to * num_pairs) if isinstance(
        downsample_to, float) else downsample_to
    if num_pairs <= num_downsample_pairs:
        info(sample_name + ': no more than the requested ' +
             str(num_downsample_pairs) + ' pairs, so skipping downsampling.')
        return fastq_left_fpath, fastq_right_fpath
    else:
        info(sample_name + ': downsampling to ' + str(num_downsample_pairs))
        rand_records = sorted(
            random.sample(range(num_pairs), num_downsample_pairs))

    info('Opening ' + fastq_left_fpath)
    fh1 = open_gzipsafe(fastq_left_fpath)
    info('Opening ' + fastq_right_fpath)
    fh2 = open_gzipsafe(fastq_right_fpath) if fastq_right_fpath else None

    out_files = (l_out_fpath, r_out_fpath) if r_out_fpath else (l_out_fpath, )

    written_records = 0
    with file_transaction(work_dir, out_files) as tx_out_files:
        if isinstance(tx_out_files, six.string_types):
            tx_out_f1, tx_out_f2 = tx_out_files, None
        else:
            tx_out_f1, tx_out_f2 = tx_out_files
        info('Opening ' + str(tx_out_f1) + ' to write')
        sub1 = open_gzipsafe(tx_out_f1, "w")
        sub2 = None
        if tx_out_f2:
            info('Opening ' + str(tx_out_f2) + ' to write')
            sub2 = open_gzipsafe(tx_out_f2, "w")
        rec_no = -1
        for rr in rand_records:
            while rec_no < rr:
                rec_no += 1
                for i in range(4):
                    fh1.readline()
                if fh2:
                    for i in range(4):
                        fh2.readline()
            for i in range(4):
                sub1.write(fh1.readline())
                if sub2:
                    sub2.write(fh2.readline())
            written_records += 1
            if written_records % 10000 == 0:
                info(sample_name + ': written ' + str(written_records) +
                     ', rec_no ' + str(rec_no + 1))
            if rec_no > num_pairs:
                info(sample_name + ' reached the limit of ' + str(num_pairs) +
                     ' records, stopping.')
                break
        info(sample_name + ': done, written ' + str(written_records) +
             ', rec_no ' + str(rec_no))
        fh1.close()
        sub1.close()
        if fastq_right_fpath:
            fh2.close()
            sub2.close()

    info(sample_name + ': done downsampling, saved to ' + l_out_fpath +
         ' and ' + r_out_fpath + ', total ' + str(written_records) +
         ' paired reads written')
    return l_out_fpath, r_out_fpath
Example #37
def check_md5(work_dir, fpath, file_type, silent=False):
    md5_fpath = join(work_dir, file_type + '_md5.txt')
    new_md5 = md5(fpath)
    info('md5 of ' + fpath + ' is ' + str(new_md5))
    prev_md5 = None
    if isfile(md5_fpath):
        with open(md5_fpath) as f:
            prev_md5 = f.read()
    else:
        info('Previous md5 file ' + md5_fpath + ' does not exist')
    info('Checking previous md5 from ' + md5_fpath + ': ' + str(prev_md5))

    if prev_md5 == new_md5:
        if not silent:
            debug('Reusing previous ' + file_type.upper() + ' files.')
        return True
    else:
        if not silent:
            info('Pre-processing input ' + file_type.upper() + ' file')
        if prev_md5:
            if not silent:
                info('Prev ' + file_type.upper() + ' md5: ' + str(prev_md5))
                info('New ' + file_type.upper() + ' md5: ' + str(new_md5))

        with open(md5_fpath, 'w') as f:
            f.write(str(new_md5))
        return False
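The md5() helper called here is not shown in this listing. A chunked implementation along these lines avoids reading large FastQ or BAM files into memory at once; this is a sketch, not necessarily the project's version:

import hashlib

def md5(fpath, chunk_size=1 << 20):
    """Sketch of a chunked md5: hash the file 1 MB at a time and return
    the hex digest, so arbitrarily large files stay out of memory."""
    h = hashlib.md5()
    with open(fpath, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()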
Example #38
def proc_fastq(samples,
               parall_view,
               work_dir,
               bwa_prefix,
               downsample_to,
               num_pairs_by_sample=None,
               dedup=True):
    num_pairs_by_sample = num_pairs_by_sample or dict()
    if downsample_to:
        # Read pairs counts
        debug()
        if all(s.name in num_pairs_by_sample for s in samples):
            debug('Using read pairs counts extracted from FastQC reports')
        elif all(
                can_reuse(make_pair_counts_fpath(join(work_dir, s.name)),
                          s.l_fpath) for s in samples):
            debug('Reusing pairs counts, reading from files')
            num_pairs_by_sample = {
                s.name: int(
                    open(make_pair_counts_fpath(join(work_dir,
                                                     s.name))).read().strip())
                for s in samples
            }
        else:
            info('Counting read pairs')
            num_pairs = parall_view.run(
                count_read_pairs,
                [[s.name,
                  safe_mkdir(join(work_dir, s.name)), s.l_fpath]
                 for s in samples])
            num_pairs_by_sample = {
                s.name: pairs_count
                for s, pairs_count in zip(samples, num_pairs)
            }

        # Downsampling
        debug()
        if all(
                can_reuse(
                    make_downsampled_fpath(join(work_dir, s.name), s.l_fpath),
                    s.l_fpath) and can_reuse(
                        make_downsampled_fpath(join(work_dir, s.name),
                                               s.r_fpath), s.r_fpath)
                for s in samples):
            debug('Reusing downsampled FastQ')
            for s in samples:
                s.l_fpath = make_downsampled_fpath(join(work_dir, s.name),
                                                   s.l_fpath)
                s.r_fpath = make_downsampled_fpath(join(work_dir, s.name),
                                                   s.r_fpath)
        else:
            if isinstance(downsample_to, float):
                info('Downsampling FastQ to ' + str(float(downsample_to)) +
                     ' fraction of reads')
            else:
                info('Downsampling FastQ to ' + str(int(downsample_to)) +
                     ' read pairs')
            fastq_pairs = parall_view.run(downsample, [[
                join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath,
                downsample_to,
                num_pairs_by_sample.get(s.name)
            ] for s in samples])
            for s, (l_r, r_r) in zip(samples, fastq_pairs):
                s.l_fpath = l_r
                s.r_fpath = r_r
    else:
        info('Skipping downsampling')

    debug()
    if all(
            can_reuse(make_bam_fpath(join(work_dir, s.name)),
                      [s.l_fpath, s.r_fpath]) for s in samples):
        debug('All downsampled BAM exists, reusing')
        for s in samples:
            s.bam = make_bam_fpath(join(work_dir, s.name))
    else:
        bwa = which('bwa')
        smb = sambamba.get_executable()
        if not (bwa and smb):
            if not bwa:
                err('Error: bwa is required for the alignment pipeline')
            if not smb:
                err('Error: sambamba is required for the alignment pipeline')
            critical('Tools required for alignment not found')
        if not isfile(bwa):
            critical('BWA not found under ' + bwa)
        info('Aligning reads to the reference')
        bam_fpaths = parall_view.run(align, [[
            join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, bwa, smb,
            bwa_prefix, dedup, parall_view.cores_per_job
        ] for s in samples])

        bam_fpaths = [verify_bam(b) for b in bam_fpaths]
        if not all(bam_fpaths):  # assumes verify_bam returns falsy on failure
            critical('Some samples were not aligned successfully.')
        for bam, s in zip(bam_fpaths, samples):
            s.bam = bam

    return num_pairs_by_sample
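Both proc_fastq variants rely on a view object exposing run(fn, arg_lists) and a cores_per_job attribute. When debugging, it can help to swap in a serial stand-in with the same contract; this is inferred from the calls above, not taken from the project:

class SerialView:
    """Assumed-minimal replacement for parall_view: same run() contract,
    but executes the jobs one after another in the current process."""
    cores_per_job = 1

    def run(self, fn, arg_lists):
        # Each element of arg_lists is the positional-argument list for one job
        return [fn(*args) for args in arg_lists]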
Example #39
def annotate(input_bed_fpath,
             output_fpath,
             work_dir,
             genome=None,
             reannotate=True,
             high_confidence=False,
             only_canonical=False,
             coding_only=False,
             short=False,
             extended=False,
             is_debug=False,
             **kwargs):

    debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + str(genome) + ' is not supported. Supported: ' +
                 ', '.join(ebl.SUPPORTED_GENOMES))

    if genome:
        fai_fpath = reference_data.get_fai(genome)
        chr_order = reference_data.get_chrom_order(genome)
    else:
        fai_fpath = None
        chr_order = bed_chrom_order(input_bed_fpath)

    input_bed_fpath = sort_bed(input_bed_fpath,
                               work_dir=work_dir,
                               chr_order=chr_order,
                               genome=genome)

    ori_bed = BedTool(input_bed_fpath)
    ori_col_num = ori_bed.field_count()
    reannotate = reannotate or ori_col_num == 3
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    # if reannotate:
    #     bed = BedTool(input_bed_fpath).cut([0, 1, 2])
    #     keep_gene_column = False
    # else:
    #     if col_num > 4:
    #         bed = BedTool(input_bed_fpath).cut([0, 1, 2, 3])
    #     keep_gene_column = True

    # features_bed = features_bed.saveas()
    # cols = features_bed.field_count()
    # if cols < 12:
    #     features_bed = features_bed.each(lambda f: f + ['.']*(12-cols))
    if high_confidence:
        features_bed = features_bed.filter(ebl.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(
            ebl.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ebl.protein_coding_filter)
    # unique_tx_by_gene = find_best_tx_by_gene(features_bed)

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x: x[
        ebl.BedCols.FEATURE] in ['exon', 'CDS', 'stop_codon', 'transcript'])
    # x[ebl.BedCols.ENSEMBL_ID] == unique_tx_by_gene[x[ebl.BedCols.GENE]])

    info('Overlapping regions with Ensembl data')
    if is_debug:
        ori_bed = ori_bed.saveas(join(work_dir, 'bed.bed'))
        debug(f'Saved regions to {ori_bed.fn}')
        features_bed = features_bed.saveas(join(work_dir, 'features.bed'))
        debug(f'Saved features to {features_bed.fn}')
    annotated = _annotate(ori_bed,
                          features_bed,
                          chr_order,
                          fai_fpath,
                          work_dir,
                          ori_col_num,
                          high_confidence=False,
                          reannotate=reannotate,
                          is_debug=is_debug,
                          **kwargs)

    full_header = [ebl.BedCols.names[i] for i in ebl.BedCols.cols]
    add_ori_extra_fields = ori_col_num > 3
    if not reannotate and ori_col_num == 4:
        add_ori_extra_fields = False  # no need to report the original gene field if we are not re-annotating

    info('Saving annotated regions...')
    total = 0
    with file_transaction(work_dir, output_fpath) as tx:
        with open(tx, 'w') as out:
            header = full_header[:6]
            if short:
                header = full_header[:4]
            if extended:
                header = full_header[:-1]
            if add_ori_extra_fields:
                header.append(full_header[-1])

            if extended:
                out.write(
                    '## ' +
                    ebl.BedCols.names[ebl.BedCols.TX_OVERLAP_PERCENTAGE] +
                    ': part of region overlapping with transcripts\n')
                out.write(
                    '## ' +
                    ebl.BedCols.names[ebl.BedCols.EXON_OVERLAPS_PERCENTAGE] +
                    ': part of region overlapping with exons\n')
                out.write(
                    '## ' +
                    ebl.BedCols.names[ebl.BedCols.CDS_OVERLAPS_PERCENTAGE] +
                    ': part of region overlapping with protein coding regions\n'
                )
                out.write('\t'.join(header) + '\n')
            for full_fields in annotated:
                fields = full_fields[:6]
                if short:
                    fields = full_fields[:4]
                if extended:
                    fields = full_fields[:-1]
                if add_ori_extra_fields:
                    fields.append(full_fields[-1])

                out.write('\t'.join(map(_format_field, fields)) + '\n')
                total += 1

    debug('Saved ' + str(total) + ' total annotated regions')
    return output_fpath
Example #40
def _annotate(bed, ref_bed, chr_order, fai_fpath, work_dir, ori_col_num,
              high_confidence=False, reannotate=False, is_debug=False, **kwargs):
    # if genome:
        # genome_fpath = cut(fai_fpath, 2, output_fpath=intermediate_fname(work_dir, fai_fpath, 'cut2'))
        # intersection = bed.intersect(ref_bed, sorted=True, wao=True, g='<(cut -f1,2 ' + fai_fpath + ')')
        # intersection = bed.intersect(ref_bed, sorted=True, wao=True, genome=genome.split('-')[0])
    # else:

    intersection_bed = None
    intersection_fpath = None
    
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    if is_debug:
        intersection_fpath = join(work_dir, 'intersection.bed')
        if isfile(intersection_fpath):
            info('Loading from ' + intersection_fpath)
            intersection_bed = BedTool(intersection_fpath)
    if not intersection_bed:
        if fai_fpath and count_bed_cols(fai_fpath) == 2:
            debug('Fai has 2 fields: ' + fai_fpath)
            intersection_bed = bed.intersect(ref_bed, wao=True, g=fai_fpath)
        else:
            debug('No 2-column fai available (' + str(fai_fpath) + '), intersecting without a genome file')
            intersection_bed = bed.intersect(ref_bed, wao=True)
    if is_debug and not isfile(intersection_fpath):
        intersection_bed.saveas(intersection_fpath)
        debug('Saved intersection to ' + intersection_fpath)

    total_annotated = 0
    total_uniq_annotated = 0
    total_off_target = 0

    met = set()

    overlaps_by_tx_by_gene_by_loc = OrderedDefaultDict(lambda: OrderedDefaultDict(lambda: defaultdict(list)))
    # off_targets = list()

    expected_fields_num = ori_col_num + len(ebl.BedCols.cols[:-4]) + 1
    for i, intersection_fields in enumerate(intersection_bed):
        inters_fields_list = list(intersection_fields)
        if len(inters_fields_list) < expected_fields_num:
            critical(f'Cannot parse the reference BED file - unexpected number of fields '
                     f'({len(inters_fields_list)}) in {inters_fields_list} '
                     f'(less than {expected_fields_num})')

        a_chr, a_start, a_end = intersection_fields[:3]
        a_extra_columns = intersection_fields[3:ori_col_num]

        overlap_fields = [None for _ in ebl.BedCols.cols]

        overlap_fields[:len(intersection_fields[ori_col_num:-1])] = intersection_fields[ori_col_num:-1]
        keep_gene_column = not reannotate
        a_gene = None
        if keep_gene_column:
            a_gene = a_extra_columns[0]

        e_chr = overlap_fields[0]
        overlap_size = int(intersection_fields[-1])
        assert e_chr == '.' or a_chr == e_chr, f'Error on line {i}: chromosomes don\'t match ({a_chr} vs {e_chr}). Line: {intersection_fields}'

        # fs = [None for _ in ebl.BedCols.cols]
        # fs[:3] = [a_chr, a_start, a_end]
        reg = (a_chr, int(a_start), int(a_end), tuple(a_extra_columns))

        if e_chr == '.':
            total_off_target += 1
            # off_targets.append(fs)
            overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)

        else:
            # fs[3:-1] = db_feature_fields[3:-1]
            total_annotated += 1
            if (a_chr, a_start, a_end) not in met:
                total_uniq_annotated += 1
                met.add((a_chr, a_start, a_end))

            e_gene = overlap_fields[ebl.BedCols.GENE]
            if keep_gene_column and e_gene != a_gene:
                overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
            else:
                transcript_id = overlap_fields[ebl.BedCols.ENSEMBL_ID]
                overlaps_by_tx_by_gene_by_loc[reg][e_gene][transcript_id].append((overlap_fields, overlap_size))

    info('  Total annotated regions: ' + str(total_annotated))
    info('  Total unique annotated regions: ' + str(total_uniq_annotated))
    info('  Total off target regions: ' + str(total_off_target))
    info('Resolving ambiguities...')
    annotated = _resolve_ambiguities(overlaps_by_tx_by_gene_by_loc, chr_order, **kwargs)

    return annotated
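_annotate leans on OrderedDefaultDict, a helper that is not in the standard library and not shown here. A plausible reconstruction, assuming it behaves like defaultdict while preserving insertion order (which plain dicts only guarantee from CPython 3.7, hence an explicit class):

from collections import OrderedDict

class OrderedDefaultDict(OrderedDict):
    """Hypothetical reconstruction: an OrderedDict that creates missing
    values from a factory, mirroring collections.defaultdict."""
    def __init__(self, default_factory=None, *args, **kwargs):
        super(OrderedDefaultDict, self).__init__(*args, **kwargs)
        self.default_factory = default_factory

    def __missing__(self, key):
        # Create, store, and return the default value for an absent key
        if self.default_factory is None:
            raise KeyError(key)
        self[key] = value = self.default_factory()
        return value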
Example #41
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome +
             ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug(
                'The male non-PAR region does not overlap with the capture target - cannot determine sex.'
            )
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info(
        'Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.'
    )
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' +
              str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
              ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        debug('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' +
                  str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            debug('Sample depth is not more than ' +
                  str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s male')
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
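Stripped of I/O, the decision rule in both determine_sex variants reduces to a small pure function that is easy to unit-test. The numeric defaults below are illustrative stand-ins for the module constants, whose values are not shown in this listing:

def call_sex(avg_depth, chry_depth, min_avg_depth=5.0, female_factor=10.0):
    """Distilled decision rule; min_avg_depth and female_factor stand in for
    AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX and FEMALE_Y_COVERAGE_FACTOR."""
    if float(avg_depth) < min_avg_depth:
        return None          # coverage too shallow to make any call
    if chry_depth == 0:
        return 'F'
    # Female if the overall depth dwarfs the chrY key-region depth
    return 'F' if float(avg_depth) / chry_depth > female_factor else 'M'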
Example #42
def determine_sex(work_dir, bam_fpath, ave_depth, genome, target_bed=None):
    info()
    info('Determining sex')

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = male_bed.count()  # interval count, used as a size proxy below
    info('Male key region count: ' + str(male_area_size))

    if target_bed:
        male_bed = BedTool(target_bed).intersect(male_bed).merge()
        target_male_area_size = male_bed.count()
        if target_male_area_size < male_area_size * MALE_TARGET_REGIONS_FACTOR:
            info('Target male region count is ' + str(target_male_area_size) + ', which is less than the ' +
                 'checked male region count * ' + str(MALE_TARGET_REGIONS_FACTOR) +
                 ' (' + str(male_area_size * MALE_TARGET_REGIONS_FACTOR) + ') - cannot determine sex')
            return None
        else:
            info('Target male region count is ' + str(target_male_area_size) + ', which is not less than the ' +
                 'checked male region count * ' + str(MALE_TARGET_REGIONS_FACTOR) +
                 ' (' + str(male_area_size * MALE_TARGET_REGIONS_FACTOR) + '). ' +
                 'Determining sex based on coverage in those regions.')
    else:
        info('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_cov_output_fpath = sambamba_depth(work_dir, male_bed, bam_fpath, [])
    chry_mean_coverage = get_mean_cov(chry_cov_output_fpath)
    info('Y key regions average depth: ' + str(chry_mean_coverage))
    ave_depth = float(ave_depth)
    info('Sample average depth: ' + str(ave_depth))
    if ave_depth < AVE_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        info('Sample average depth is too low (less than ' + str(AVE_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
             ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        info('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = ave_depth / chry_mean_coverage
        info('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            info('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            info('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + ' times higher than Y depth - it\'s male')
            sex = 'M'
    info('Sex is ' + sex)
    info()
    return sex