Exemplo n.º 1
0
def bgzip_and_tabix(fpath, reuse=False, tabix_parameters='', **kwargs):
    """Compress `fpath` with bgzip and index the compressed file with tabix.

    :param fpath: path of the file to compress; the output is `fpath + '.gz'`.
    :param reuse: when true, return early if `<fpath>.gz` and `<fpath>.gz.tbi`
        both exist and neither is older (by ctime) than the file it was
        derived from.
    :param tabix_parameters: extra options placed between `tabix` and the
        compressed file path on the command line.
    :param kwargs: accepted for call compatibility; not used here.
    :return: path to the compressed file, or `fpath` unchanged when bgzip or
        tabix cannot be located.
    """
    gzipped_fpath = fpath + '.gz'
    tbi_fpath = gzipped_fpath + '.tbi'

    if reuse and file_exists(gzipped_fpath):
        # The uncompressed source may already be gone; in that case only the
        # index needs to be checked against the compressed file.
        gz_is_fresh = (not file_exists(fpath)
                       or getctime(gzipped_fpath) >= getctime(fpath))
        if gz_is_fresh and file_exists(tbi_fpath) \
                and getctime(tbi_fpath) >= getctime(gzipped_fpath):
            info('Actual compressed file and index exist, reusing')
            return gzipped_fpath

    info('Compressing and tabixing file, writing ' + gzipped_fpath + '(.tbi)')
    bgzip = which('bgzip')
    tabix = which('tabix')
    if not bgzip:
        err('Cannot index file because bgzip is not found')
    if not tabix:
        err('Cannot index file because tabix is not found')
    # Both tools are required. Bailing out only when *both* are missing would
    # let a command line be built from a None executable further down.
    if not bgzip or not tabix:
        return fpath

    # Drop stale outputs so the tools do not trip over existing files.
    if isfile(gzipped_fpath):
        os.remove(gzipped_fpath)
    if isfile(tbi_fpath):
        os.remove(tbi_fpath)

    info('BGzipping ' + fpath)
    # bgzip is called without `-c`, so it is expected to create `<fpath>.gz` itself.
    cmdline = '{bgzip} {fpath}'.format(bgzip=bgzip, fpath=fpath)
    call_process.run(cmdline)

    info('Tabixing ' + gzipped_fpath)
    cmdline = '{tabix} {tabix_parameters} {gzipped_fpath}'.format(
        tabix=tabix,
        tabix_parameters=tabix_parameters,
        gzipped_fpath=gzipped_fpath)
    call_process.run(cmdline)

    return gzipped_fpath
Exemplo n.º 2
0
def bgzip_and_tabix(fpath, reuse=False, tabix_parameters='', **kwargs):
    """Bgzip-compress `fpath` and build a tabix index for the result.

    :param fpath: path of the file to compress; the output is `fpath + '.gz'`.
    :param reuse: when true, skip all work if the compressed file and its
        `.tbi` index exist and each is at least as new (ctime) as its source.
    :param tabix_parameters: additional command-line options for tabix.
    :param kwargs: ignored; kept so existing callers passing extras still work.
    :return: path to the compressed file, or the original `fpath` if either
        bgzip or tabix is not available.
    """
    gzipped_fpath = fpath + '.gz'
    tbi_fpath = gzipped_fpath + '.tbi'

    if reuse and file_exists(gzipped_fpath):
        # When the uncompressed source no longer exists there is nothing to
        # compare the compressed file against, so it counts as fresh.
        gz_is_fresh = (not file_exists(fpath)
                       or getctime(gzipped_fpath) >= getctime(fpath))
        if gz_is_fresh and file_exists(tbi_fpath) \
                and getctime(tbi_fpath) >= getctime(gzipped_fpath):
            info('Actual compressed file and index exist, reusing')
            return gzipped_fpath

    info('Compressing and tabixing file, writing ' + gzipped_fpath + '(.tbi)')
    bgzip = which('bgzip')
    tabix = which('tabix')
    if not bgzip:
        err('Cannot index file because bgzip is not found')
    if not tabix:
        err('Cannot index file because tabix is not found')
    # Either tool missing makes the job impossible; the previous `and` check
    # only returned when both were absent and otherwise ran a "None ..." command.
    if not bgzip or not tabix:
        return fpath

    # Remove leftovers from earlier runs before regenerating them.
    if isfile(gzipped_fpath):
        os.remove(gzipped_fpath)
    if isfile(tbi_fpath):
        os.remove(tbi_fpath)

    info('BGzipping ' + fpath)
    # No `-c` flag: bgzip itself is expected to write `<fpath>.gz`.
    cmdline = '{bgzip} {fpath}'.format(bgzip=bgzip, fpath=fpath)
    call_process.run(cmdline)

    info('Tabixing ' + gzipped_fpath)
    cmdline = '{tabix} {tabix_parameters} {gzipped_fpath}'.format(
        tabix=tabix,
        tabix_parameters=tabix_parameters,
        gzipped_fpath=gzipped_fpath)
    call_process.run(cmdline)

    return gzipped_fpath
Exemplo n.º 3
0
def sort_bed_gsort(input_bed_fpath,
                   output_bed_fpath=None,
                   work_dir=None,
                   fai_fpath=None,
                   genome=None):
    """Sort a BED file with `gsort`, using a .fai index for chromosome order.

    :param input_bed_fpath: BED file to sort; validated with `verify_bed`.
    :param output_bed_fpath: destination path; when omitted, an intermediate
        name with the 'sorted' suffix is generated under `work_dir`.
    :param work_dir: working directory for the intermediate name and the
        file transaction.
    :param fai_fpath: explicit path to the .fai index.
    :param genome: genome build name used to look up the index via
        `ref.get_fai` when `fai_fpath` is not given.
    :return: path to the sorted BED file.
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    # An explicit index path wins over a genome-name lookup.
    if not (fai_fpath or genome):
        critical('Either of fai_fpath or genome build name must be specified')
    else:
        fai_source = fai_fpath if fai_fpath else ref.get_fai(genome)
        fai_fpath = verify_file(fai_source)

    gsort_cmd = 'gsort {} {}'.format(input_bed_fpath, fai_fpath)
    with file_transaction(work_dir, output_bed_fpath) as tx_fpath:
        run(gsort_cmd, output_fpath=tx_fpath)

    return output_bed_fpath
Exemplo n.º 4
0
def build_snps_panel(bcbio_projs=None, bed_files=None, output_dir=None, genome=None):
    """Build `<output_dir>/snps.bed`: dbSNP records falling into the overlap of all BED inputs.

    BED inputs are collected from each project (`coverage_bed`, or `call` when
    the project has no coverage BED) plus any explicit `bed_files`, overlapped
    with `overlap_bed_files`, intersected with the dbSNP file for `genome`,
    and then passed through `_make_snp_file`.

    :param bcbio_projs: optional iterable of project objects.
    :param bed_files: optional iterable of extra BED file paths.
    :param output_dir: directory receiving `snps.bed` and the `work` subfolder.
    :param genome: genome build name handed to `get_dbsnp` and `_make_snp_file`.
    :return: path to the resulting `snps.bed`.
    """
    selected_snps_file = join(output_dir, 'snps.bed')
    if can_reuse(selected_snps_file, bed_files):
        return selected_snps_file

    work_dir = safe_mkdir(join(output_dir, 'work'))

    log.info('Intersecting BED files for projects.')
    all_bed_files = set()
    for proj in bcbio_projs or []:
        if not proj.coverage_bed:
            all_bed_files.add(proj.call)
            continue
        log.info(proj.project_name + ': selecting ' + proj.coverage_bed)
        all_bed_files.add(proj.coverage_bed)
    all_bed_files.update(bed_files or [])

    merged_bed = join(work_dir, 'merged_bed_files.bed')
    log.info(f'BED files: {all_bed_files}, mergin, writing {merged_bed}')
    overlap_bed_files(all_bed_files, merged_bed)

    # Keep only the dbSNP records located inside the merged regions.
    dbsnp_file = get_dbsnp(genome)
    dbsnp_snps_file = join(work_dir, 'snps_in_merged_bed_files.bed')
    intersect_is_stale = not can_reuse(dbsnp_snps_file, [dbsnp_file, merged_bed])
    if intersect_is_stale:
        intersect_cmd = f'bedtools intersect -header -a {dbsnp_file} -b {merged_bed}'
        call_process.run(intersect_cmd, dbsnp_snps_file)

    subset_bed_file = add_suffix(dbsnp_snps_file, 'subset')
    _make_snp_file(dbsnp_snps_file, genome, subset_bed_file)

    shutil.copyfile(subset_bed_file, selected_snps_file)
    return selected_snps_file
Exemplo n.º 5
0
def cut(fpath, col_num, output_fpath=None):
    """Write the first `col_num` columns of `fpath` to a new file using `cut`.

    :param fpath: input file path.
    :param col_num: number of leading columns to keep (fields 1..col_num).
    :param output_fpath: destination; defaults to `fpath` with a 'cut' suffix.
    :return: path to the output file.
    """
    if not output_fpath:
        output_fpath = add_suffix(fpath, 'cut')
    if can_reuse(output_fpath, fpath):
        return output_fpath

    field_list = ','.join(str(idx) for idx in range(1, col_num + 1))
    # NOTE(review): the command already redirects to output_fpath and the same
    # path is passed as output_fpath= below - confirm run() copes with both.
    cmdline = 'cut -f' + field_list + ' ' + fpath + ' > ' + output_fpath
    call_process.run(cmdline, output_fpath=output_fpath)
    return output_fpath
Exemplo n.º 6
0
def lift_over(fpath, from_genome, to_genome):
    """Convert coordinates in `fpath` between genome builds with `liftOver`.

    The chain file is looked up in the `over.chain` folder next to this module
    as `<from_genome>To<To_genome>.over.chain.gz`.

    :param fpath: input file path.
    :param from_genome: source genome build name.
    :param to_genome: target genome build name (title-cased in the chain file name).
    :return: path of the converted file (`fpath` with a `to_genome` suffix);
        unmapped records go to `<that path>.unMapped`.
    """
    chain_fname = f'{from_genome}To{to_genome.title()}.over.chain.gz'
    chain_file = join(dirname(__file__), 'over.chain', chain_fname)
    if not verify_file(chain_file):
        log.critical(f'Error: conversion from {from_genome} to {to_genome} is not supported!')

    out_fpath = add_suffix(fpath, to_genome)
    unmapped_fpath = f'{out_fpath}.unMapped'
    call_process.run(f'liftOver {fpath} {chain_file} {out_fpath} {unmapped_fpath}')
    return out_fpath
Exemplo n.º 7
0
def get_padded_bed_file(work_dir, bed, padding, fai_fpath):
    """Extend every region of `bed` by `padding` on both sides via `bedtools slop`.

    :param work_dir: directory used for the intermediate output name.
    :param bed: input BED file path.
    :param padding: value passed to `bedtools slop -b`.
    :param fai_fpath: file passed to `bedtools slop -g` as the genome file.
    :return: path to the padded BED file (intermediate name with 'padded' suffix).
    """
    info('Making bed file for padded regions...')
    bedtools = which('bedtools')
    slop_cmd = '{} slop -i {} -g {} -b {}'.format(bedtools, bed, fai_fpath, padding)
    padded_fpath = intermediate_fname(work_dir, bed, 'padded')
    call_process.run(slop_cmd, output_fpath=padded_fpath)
    return padded_fpath
Exemplo n.º 8
0
def cut(fpath, col_num, output_fpath=None):
    """Extract fields 1..`col_num` from `fpath` with the `cut` utility.

    :param fpath: input file path.
    :param col_num: how many leading columns to keep.
    :param output_fpath: where to write; defaults to `fpath` plus a 'cut' suffix.
    :return: path to the output file.
    """
    target_fpath = output_fpath if output_fpath else add_suffix(fpath, 'cut')
    if can_reuse(target_fpath, fpath):
        return target_fpath

    wanted_fields = [str(number) for number in range(1, col_num + 1)]
    # NOTE(review): output is redirected inside the command and also passed as
    # output_fpath= to run() - confirm the two do not conflict.
    cmdline = 'cut -f' + ','.join(wanted_fields) + ' ' + fpath + ' > ' + target_fpath
    call_process.run(cmdline, output_fpath=target_fpath)
    return target_fpath
Exemplo n.º 9
0
def get_padded_bed_file(work_dir, bed, padding, fai_fpath):
    """Produce a copy of `bed` whose regions are padded with `bedtools slop`.

    :param work_dir: directory used for the intermediate output name.
    :param bed: input BED file path.
    :param padding: value for the `-b` option of `bedtools slop`.
    :param fai_fpath: file given to `-g` as the genome file.
    :return: path to the padded BED file.
    """
    info('Making bed file for padded regions...')
    slop_args = [which('bedtools'), 'slop', '-i', bed, '-g', fai_fpath, '-b', padding]
    cmdline = ' '.join('{}'.format(arg) for arg in slop_args)
    output_fpath = intermediate_fname(work_dir, bed, 'padded')
    call_process.run(cmdline, output_fpath=output_fpath)
    return output_fpath
Exemplo n.º 10
0
def overlap_bed_files(bed_files, output_bed_file):
    """Write the intersection of several BED files to `output_bed_file`.

    A single input is copied as is; otherwise every input is passed through
    `sort-bed` and combined with `bedops --intersect`.

    :param bed_files: sized collection (list, set, tuple, ...) of BED file paths.
        It is never modified by this function.
    :param output_bed_file: destination path.
    :return: `output_bed_file`.
    """
    if can_reuse(output_bed_file, bed_files):
        return output_bed_file

    if len(bed_files) == 1:
        # Peek at the only element instead of pop(): pop() would silently
        # empty the caller's collection (and does not exist on tuples).
        only_bed_file = next(iter(bed_files))
        shutil.copy(only_bed_file, output_bed_file)
        return output_bed_file

    # NOTE(review): `<(...)` is shell process substitution - assumes run()
    # executes the command in a shell that supports it (e.g. bash); confirm.
    cmdl = 'bedops --intersect' + ''.join(' <(sort-bed ' + bf + ')' for bf in bed_files)
    call_process.run(cmdl, output_bed_file)
    return output_bed_file
Exemplo n.º 11
0
def intersect_bed(work_dir, bed1, bed2):
    """Report the regions of `bed1` that overlap `bed2` (`bedtools intersect -u`).

    :param work_dir: directory where the result is written.
    :param bed1: BED file passed as `-a`.
    :param bed2: BED file passed as `-b`.
    :return: path to `<work_dir>/<bed1 stem>__<bed2 stem>.bed`.
    """
    stem1 = splitext_plus(basename(bed1))[0]
    stem2 = splitext_plus(basename(bed2))[0]
    output_fpath = join(work_dir, stem1 + '__' + stem2 + '.bed')
    if can_reuse(output_fpath, [bed1, bed2]):
        return output_fpath

    bedtools = which('bedtools')
    intersect_cmd = '{} intersect -u -a {} -b {}'.format(bedtools, bed1, bed2)
    call_process.run(intersect_cmd,
                     output_fpath=output_fpath,
                     checks=[call_process.file_exists_check])
    return output_fpath
Exemplo n.º 12
0
def call_sambamba(cmdl,
                  bam_fpath,
                  output_fpath=None,
                  command_name='',
                  no_index=False):
    """Run a sambamba command, indexing the BAM first unless told not to.

    :param cmdl: sambamba arguments (everything after the executable).
    :param bam_fpath: BAM file to index before running.
    :param output_fpath: forwarded to `run` as `output_fpath`.
    :param command_name: not used by this function.
    :param no_index: when true, skip the `index_bam` step.
    :return: `output_fpath`, as passed in.
    """
    needs_index = not no_index
    if needs_index:
        index_bam(bam_fpath)

    full_cmdl = ' '.join((get_executable(), cmdl))
    run(full_cmdl, output_fpath=output_fpath)
    return output_fpath
Exemplo n.º 13
0
def bam_to_bed(bam_fpath, to_gzip=True):
    """Convert a BAM file to BED with `bedtools bamtobed`, optionally gzipped.

    :param bam_fpath: input BAM path.
    :param to_gzip: when true, pipe the output through gzip and use a
        `.bed.gz` extension; otherwise write plain `.bed`.
    :return: path to the BED file, placed next to the BAM.
    """
    # Approach from: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    debug('Converting the BAM to BED to save some memory.')
    extension = '.bed.gz' if to_gzip else '.bed'
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + extension
    if can_reuse(bam_bed_fpath, bam_fpath):
        return bam_bed_fpath

    bedtools = which('bedtools')
    gzip_exec = which('gzip')
    cmdline = '{} bamtobed -i {}'.format(bedtools, bam_fpath)
    if to_gzip:
        cmdline += ' | {}'.format(gzip_exec)
    call_process.run(cmdline, output_fpath=bam_bed_fpath)
    return bam_bed_fpath
Exemplo n.º 14
0
def intersect_bed(work_dir, bed1, bed2):
    """Keep entries of `bed1` overlapping `bed2`, using `bedtools intersect -u`.

    :param work_dir: directory where the result is written.
    :param bed1: BED file given as `-a`.
    :param bed2: BED file given as `-b`.
    :return: path to the result, named `<bed1 stem>__<bed2 stem>.bed`.
    """
    stems = [splitext_plus(basename(bed))[0] for bed in (bed1, bed2)]
    output_fpath = join(work_dir, stems[0] + '__' + stems[1] + '.bed')
    if can_reuse(output_fpath, [bed1, bed2]):
        return output_fpath

    cmdline = ' '.join('{}'.format(part) for part in (
        which('bedtools'), 'intersect', '-u', '-a', bed1, '-b', bed2))
    call_process.run(cmdline, output_fpath=output_fpath, checks=[call_process.file_exists_check])
    return output_fpath
Exemplo n.º 15
0
def bam_to_bed(bam_fpath, to_gzip=True):
    """Dump a BAM file as BED via `bedtools bamtobed`, gzip-compressed by default.

    :param bam_fpath: input BAM path.
    :param to_gzip: pipe through gzip and name the output `.bed.gz` when true;
        write an uncompressed `.bed` otherwise.
    :return: path to the BED file.
    """
    # Idea taken from: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    debug('Converting the BAM to BED to save some memory.')
    bam_stem = splitext_plus(bam_fpath)[0]
    if to_gzip:
        bam_bed_fpath = bam_stem + '.bed.gz'
    else:
        bam_bed_fpath = bam_stem + '.bed'
    if can_reuse(bam_bed_fpath, bam_fpath):
        return bam_bed_fpath

    bedtools = which('bedtools')
    gzip_exec = which('gzip')
    pipeline = ['{} bamtobed -i {}'.format(bedtools, bam_fpath)]
    if to_gzip:
        pipeline.append(' | {}'.format(gzip_exec))
    call_process.run(''.join(pipeline), output_fpath=bam_bed_fpath)
    return bam_bed_fpath
Exemplo n.º 16
0
def annotate_target(work_dir, target_bed, genome_build):
    """Annotate a target BED file with the external bed_annotation tool.

    :param work_dir: directory for the intermediate output and for `clean_bed`.
    :param target_bed: BED file to annotate.
    :param genome_build: genome build name passed to the tool with `-g`.
    :return: path to the annotated BED after `clean_bed` post-processing, or
        the existing annotated file when `can_reuse` accepts it.
    """
    output_fpath = intermediate_fname(work_dir, target_bed, 'ann')
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    # Try `annotate_bed.py` first, then `bed_annotation`. Report an error only
    # when neither is found - previously the error fired even if the second
    # lookup succeeded.
    bed_annotation = which('annotate_bed.py') or which('bed_annotation')
    if not bed_annotation:
        critical('Error: bed_annotation not found in PATH, please install `conda install -c vladsaveliev bed_annotation`.')

    cmdline = '{bed_annotation} {target_bed} -g {genome_build} -o {output_fpath}'.format(
        bed_annotation=bed_annotation,
        target_bed=target_bed,
        genome_build=genome_build,
        output_fpath=output_fpath)
    # The tool writes the file itself (-o), so stdout is not captured into it.
    run(cmdline, output_fpath, stdout_to_outputfile=False)
    output_fpath = clean_bed(output_fpath, work_dir)
    return output_fpath
Exemplo n.º 17
0
def _slice_vcf_fn(work_dir, label, vcf_file, overlapped_bed):
    """Subset `vcf_file` to the regions of `overlapped_bed` with `bcftools view`.

    :param work_dir: directory receiving `<label>.sliced.vcf`.
    :param label: name used for the output file and echoed back to the caller.
    :param vcf_file: input VCF.
    :param overlapped_bed: BED file passed as `--targets-file`.
    :return: tuple `(label, path to the sliced VCF)`.
    """
    sliced_vcf_file = join(work_dir, label + '.sliced.vcf')
    slice_is_stale = not can_reuse(sliced_vcf_file, [vcf_file])
    if slice_is_stale:
        slice_cmd = f'bcftools view {vcf_file} --targets-file {overlapped_bed} -o {sliced_vcf_file}'
        run(slice_cmd)

    # NOTE: a disabled `bcftools annotate` step (adding CHROM/FROM/TO INFO
    # fields taken from the BED) used to follow here as commented-out code.
    return label, sliced_vcf_file
Exemplo n.º 18
0
def annotate_target(work_dir, target_bed, genome_build):
    """Run the external bed_annotation tool on `target_bed`.

    :param work_dir: directory for the intermediate output and for `clean_bed`.
    :param target_bed: BED file to annotate.
    :param genome_build: genome build name, passed via `-g`.
    :return: path to the annotated, `clean_bed`-processed BED file; the
        existing annotated file is returned directly when it can be reused.
    """
    output_fpath = intermediate_fname(work_dir, target_bed, 'ann')
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    bed_annotation = which('annotate_bed.py')
    if not bed_annotation:
        bed_annotation = which('bed_annotation')
    # Fail only if both executable names are missing; the error used to be
    # raised unconditionally once `annotate_bed.py` was not found.
    if not bed_annotation:
        critical(
            'Error: bed_annotation not found in PATH, please install `conda install -c vladsaveliev bed_annotation`.'
        )

    cmdline = '{bed_annotation} {target_bed} -g {genome_build} -o {output_fpath}'.format(
        bed_annotation=bed_annotation,
        target_bed=target_bed,
        genome_build=genome_build,
        output_fpath=output_fpath)
    # Output is written by the tool through -o rather than captured from stdout.
    run(cmdline, output_fpath, stdout_to_outputfile=False)
    output_fpath = clean_bed(output_fpath, work_dir)
    return output_fpath
Exemplo n.º 19
0
def index_bam(bam_fpath, sambamba=None, samtools=None):
    """Create `<bam_fpath>.bai` with `sambamba index` unless it can be reused.

    :param bam_fpath: BAM file to index.
    :param sambamba: sambamba executable; looked up with `get_executable`
        when not provided.
    :param samtools: not used by this function.
    :return: None.
    """
    if not sambamba:
        sambamba = get_executable()

    bai_fpath = bam_fpath + '.bai'
    if can_reuse(bai_fpath, cmp_f=bam_fpath, silent=True):
        return

    index_cmd = '{} index {}'.format(sambamba, bam_fpath)
    run(index_cmd,
        output_fpath=bai_fpath,
        stdout_to_outputfile=False,
        stdout_tx=False)
Exemplo n.º 20
0
def ungzip_if_needed(cnf, fpath, silent=False):
    """Make sure the uncompressed version of `fpath` exists, gunzipping if required.

    A trailing `.gz` is stripped from `fpath` first. If the uncompressed file
    is missing while `<fpath>.gz` exists, it is produced with `gunzip -c`.

    :param cnf: not used by this function.
    :param fpath: path to the file, with or without the `.gz` extension.
    :param silent: when false, `info()` is called after decompression.
    :return: path to the uncompressed file, or None when the gunzip run
        returned a falsy result.
    """
    if fpath.endswith('.gz'):
        fpath = fpath[:-3]

    gz_fpath = fpath + '.gz'
    if file_exists(fpath) or not file_exists(gz_fpath):
        return fpath

    res = run('gunzip -c ' + gz_fpath, output_fpath=fpath)
    if not silent:
        info()
    return fpath if res else None
Exemplo n.º 21
0
def sort_bed_gsort(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, genome=None):
    """Run `gsort` on a BED file, ordering it according to a .fai index.

    :param input_bed_fpath: BED file to sort; checked with `verify_bed`.
    :param output_bed_fpath: optional destination; otherwise an intermediate
        'sorted' file name is derived under `work_dir`.
    :param work_dir: working directory for intermediate and transaction files.
    :param fai_fpath: path to the .fai index, if known.
    :param genome: genome build name for `ref.get_fai`, used when no
        `fai_fpath` is given.
    :return: path to the sorted BED file.
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    # Resolve the index: explicit path first, genome lookup second.
    if not (fai_fpath or genome):
        critical('Either of fai_fpath or genome build name must be specified')
    else:
        fai_source = fai_fpath if fai_fpath else ref.get_fai(genome)
        fai_fpath = verify_file(fai_source)

    gsort_cmd = ' '.join(['gsort', '{}'.format(input_bed_fpath), '{}'.format(fai_fpath)])
    with file_transaction(work_dir, output_bed_fpath) as tx_fpath:
        run(gsort_cmd, output_fpath=tx_fpath)

    return output_bed_fpath
Exemplo n.º 22
0
def _calculate(bam_file, work_dir, genome_fasta_file, min_depth):
    """Compute callable regions for a BAM file with `goleft depth`.

    goleft runs samtools depth in parallel; samtools depth leaves duplicates
    and secondary reads out of the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    :param bam_file: input BAM path.
    :param work_dir: directory for the goleft outputs (prefixed with the
        sample name derived from the BAM).
    :param genome_fasta_file: reference passed as `--reference`.
    :param min_depth: value passed as `--mincov`.
    :return: path to the BED file holding only the records named 'CALLABLE'.
    """
    prefix = os.path.join(work_dir, bam_samplename(bam_file))

    annotated_bed = prefix + '.callable.bed'
    if not can_reuse(annotated_bed, bam_file):
        info(f'Calculating coverage at {bam_file}')
        goleft_cmd = (
            f'goleft depth --q 1 --mincov {min_depth} --reference {genome_fasta_file} --ordered'
            f' --prefix {prefix} {bam_file}'
        )
        run(goleft_cmd)

    callable_file = prefix + '.callable.CALLABLE.bed'
    if can_reuse(callable_file, annotated_bed):
        return callable_file

    def _is_callable(region):
        return region.name == 'CALLABLE'

    with file_transaction(None, callable_file) as tx:
        regions = pybedtools.BedTool(annotated_bed)
        regions.filter(_is_callable).saveas(tx)

    return callable_file
Exemplo n.º 23
0
def build_tree(run):
    """Write one genotype sequence per sample to a FASTA file and build a tree with prank.

    For every sample of every project in `run`, the genotypes at all of the
    run's locations are concatenated into a single FASTA record. prank is then
    run on that FASTA; its `.best.dnd` output becomes the run's tree file and
    its `.best.fas` output is removed.

    :param run: run object providing projects, locations and the
        fasta/tree/work-dir paths.
    :return: path to the FASTA file.
    """
    info('Writing fasta to ' + run.fasta_file_path())
    samples = []
    for project in run.projects:
        samples.extend(project.samples)

    with open(run.fasta_file_path(), 'w') as fasta_out:
        for sample in samples:
            snps_by_rsid = sample.snps_from_run(run)
            fasta_out.write('>' + sample.long_name() + '\n')
            genotypes = [snps_by_rsid[loc.rsid].get_gt() for loc in run.locations.all()]
            fasta_out.write(''.join(genotypes) + '\n')
    info('All fasta saved to ' + run.fasta_file_path())

    info()
    info('Building phylogeny tree using prank...')
    work_dir = run.work_dir_path()
    fasta_stem = splitext(basename(run.fasta_file_path()))[0]
    prank_out = join(work_dir, fasta_stem)
    prank_cmd = 'prank -d=' + run.fasta_file_path() + ' -o=' + prank_out + ' -showtree'
    call_process.run(prank_cmd)

    tree_src = prank_out + '.best.dnd'
    if not verify_file(tree_src):
        critical('Prank failed to run')
    os.rename(tree_src, run.tree_file_path())
    os.remove(prank_out + '.best.fas')

    return run.fasta_file_path()
Exemplo n.º 24
0
def index_bam(bam_fpath, sambamba=None, samtools=None):
    """Index a BAM file with `sambamba index`, writing `<bam_fpath>.bai`.

    Nothing is run when `can_reuse` accepts the existing index.

    :param bam_fpath: BAM file to index.
    :param sambamba: sambamba executable; defaults to `get_executable()`.
    :param samtools: not used by this function.
    :return: None.
    """
    sambamba_exec = sambamba if sambamba else get_executable()
    indexed_bam = bam_fpath + '.bai'
    index_is_stale = not can_reuse(indexed_bam, cmp_f=bam_fpath, silent=True)
    if index_is_stale:
        run(' '.join(['{}'.format(sambamba_exec), 'index', '{}'.format(bam_fpath)]),
            output_fpath=indexed_bam, stdout_to_outputfile=False, stdout_tx=False)
Exemplo n.º 25
0
def call_sambamba(cmdl, bam_fpath, output_fpath=None, command_name='', no_index=False):
    """Execute sambamba with the given arguments, indexing the BAM beforehand by default.

    :param cmdl: argument string appended after the sambamba executable.
    :param bam_fpath: BAM file passed to `index_bam` unless `no_index` is set.
    :param output_fpath: forwarded to `run` as `output_fpath`.
    :param command_name: not used by this function.
    :param no_index: set to true to skip indexing.
    :return: `output_fpath`, unchanged.
    """
    if no_index:
        pass  # caller asked to skip indexing
    else:
        index_bam(bam_fpath)

    executable = get_executable()
    run(' '.join([executable, cmdl]), output_fpath=output_fpath)
    return output_fpath
Exemplo n.º 26
0
def _vardict_pileup_sample(sample, work_dir, output_dir, genome_fasta_file,
                           snp_file):
    """Run VarDict on one sample over `snp_file` and produce an annotated VCF.

    Steps, in order:
      1. run vardict on `sample.bam`, writing `<work_dir>/<sample.name>_vars.txt`;
      2. convert that table to VCF through cut / awk / teststrandbias.R /
         var2vcf_valid.pl;
      3. rewrite records so the VCF can be parsed (fill REF/ALT of non-call
         records from the reference fasta, normalise empty / NA INFO values);
      4. set each record's ID and INFO/ANNOTATION from the annotation column
         of the vardict table;
      5. add the ANNOTATION header line with `bcftools annotate` and move the
         result to `<output_dir>/<sample.name>.vcf`.

    :param sample: object exposing `.name` and `.bam`.
    :param work_dir: directory for intermediate files.
    :param output_dir: directory for the final VCF.
    :param genome_fasta_file: reference fasta, passed to vardict via `-G` and
        used to look up reference alleles.
    :param snp_file: regions file given to vardict as its last argument.
    :return: path to the final VCF. Returned early, without running anything,
        when both the vardict table and the VCF can be reused.
    """
    vardict_snp_vars = join(work_dir, sample.name + '_vars.txt')
    vcf_file = join(output_dir, sample.name + '.vcf')
    if can_reuse(vardict_snp_vars, [sample.bam, snp_file]) and can_reuse(
            vcf_file, vardict_snp_vars):
        return vcf_file

    vardict_exec = which('vardict')
    if not vardict_exec:
        critical(
            'Error: vardict is not in PATH. Please install it with `conda install -c bioconda vardict`'
        )
    # NOTE(review): vardict_bin_dir is computed but never used below; the
    # helper scripts are referenced by bare name - confirm whether they were
    # meant to be resolved relative to this directory.
    vardict_bin_dir = dirname(vardict_exec)

    # Run VarDict
    index_bam(sample.bam)
    cmdl = '{vardict_exec} -G {genome_fasta_file} -N {sample.name} -b {sample.bam} -p -D {snp_file}'.format(
        **locals())
    call_process.run(cmdl, output_fpath=vardict_snp_vars)

    # Complex variants might have a shifted start position with respect to the rsid, so we are
    # associating starts with rsid for further snp identification
    # NOTE(review): declared as defaultdict(list) but the values stored are
    # strings; a key missing at lookup time would yield [] and the later
    # `.split('|')` would fail - confirm every record is expected to be present.
    ann_by_var = defaultdict(list)
    with open(vardict_snp_vars) as f:
        for l in f:
            fs = l.split('\t')
            # Columns 2-4 of the vardict table: annotation, chromosome, start.
            ann, chrom, start = fs[1], fs[2], fs[3]
            ann_by_var[(chrom, start)] = ann

    info()
    info('Converting to VCF')
    work_vcf_file = join(work_dir, sample.name + '_vars.vcf')
    # Keep the first 34 columns, replace empty fields with "0", then pipe
    # through the strand-bias test and the VCF converter.
    cmdl = (
        'cut -f-34 ' + vardict_snp_vars +
        ' | awk -F"\\t" -v OFS="\\t" \'{for (i=1;i<=NF;i++) { if ($i=="") $i="0" } print $0 }\''
        ' | ' + join('teststrandbias.R') + ' | ' + join('var2vcf_valid.pl') +
        ' -A -f 0.2' + '')
    call_process.run(cmdl, output_fpath=work_vcf_file)

    # Fix non-call records with empty REF and ALT, and "NA" values assigned to INFO's SN and HICOV
    fixed_vcf_file = add_suffix(work_vcf_file, 'fixed')
    info('Fixing VCF for parsing, writing to ' + fixed_vcf_file)
    with open(work_vcf_file) as inp, open(fixed_vcf_file, 'w') as out_f:
        for l in inp:
            if l.startswith('#'):
                # Header lines are copied through untouched.
                out_f.write(l)
            else:
                fs = l.split('\t')
                chrom, pos, _, ref, alt = fs[0], int(
                    fs[1]), fs[2], fs[3], fs[4]
                if alt in ['.', '']:
                    fs[4] = fs[3] = _get_fasta_ref(
                        genome_fasta_file, chrom,
                        pos)  # Reading the reference allele from fasta
                l = '\t'.join(fs)
                # Normalise INFO values: "NA" and empty values become ".",
                # and a zeroed TYPE becomes REF.
                l = l.replace('=NA;', '=.;')
                l = l.replace('=;', '=.;')
                l = l.replace('TYPE=0', 'TYPE=REF')
                out_f.write(l)
    assert verify_file(fixed_vcf_file)

    info('Annotating VCF with gene names and rsIDs')
    ann_vcf_file = add_suffix(fixed_vcf_file, 'ann')
    with open(fixed_vcf_file) as f, open(ann_vcf_file, 'w') as out:
        vcf_reader = vcf.Reader(f)
        vcf_writer = vcf.Writer(out, vcf_reader)
        for rec in vcf_reader:
            # Keys were stored with the start position as text, hence str().
            ann = ann_by_var[(rec.CHROM, str(rec.POS))]
            # The first '|'-separated field of the annotation is used as the record ID.
            rec.ID = ann.split('|')[0]
            rec.INFO['ANNOTATION'] = ann
            vcf_writer.write_record(rec)
    assert verify_file(ann_vcf_file), ann_vcf_file

    # Add the header definition for the ANNOTATION INFO field; the input is
    # bgzipped and tabix-indexed first.
    ann_hdr_vcf_file = add_suffix(ann_vcf_file, 'hdr')
    cmdl = 'bcftools annotate -h <(echo ' \
           '\'##INFO=<ID=ANNOTATION,Number=1,Type=String,Description="rsid|gene_name|ref|alts">\') ' + \
           bgzip_and_tabix(ann_vcf_file)
    call_process.run(cmdl, output_fpath=ann_hdr_vcf_file)

    debug('Renaming ' + ann_hdr_vcf_file + ' -> ' + vcf_file)
    os.rename(ann_hdr_vcf_file, vcf_file)
    return vcf_file