示例#1
0
def sort_bed_gsort(input_bed_fpath,
                   output_bed_fpath=None,
                   work_dir=None,
                   fai_fpath=None,
                   genome=None):
    """Sort a BED file by running the external ``gsort`` command.

    The .fai index handed to ``gsort`` is ``fai_fpath`` when given, otherwise
    the one returned by ``ref.get_fai(genome)``. ``critical`` is called when
    neither is provided.

    Returns the path of the sorted file: ``output_bed_fpath`` when given,
    otherwise an intermediate name built from ``work_dir`` and the input.
    An output accepted by ``can_reuse`` is returned without re-sorting.
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        critical('Either of fai_fpath or genome build name must be specified')

    gsort_cmd = 'gsort ' + str(input_bed_fpath) + ' ' + str(fai_fpath)
    with file_transaction(work_dir, output_bed_fpath) as tx:
        run(gsort_cmd, output_fpath=tx)

    return output_bed_fpath
示例#2
0
def partition_gtf(gtf, coding=False, out_file=False):
    """Write a GTF holding only the coding or only the non-coding features.

    The biotype of each feature is obtained from the lookup function built by
    ``_biotype_lookup_fn(gtf)``. Features with no biotype are always dropped.

    Args:
        gtf: path to the input GTF.
        coding: True keeps features whose biotype is "protein_coding";
            False keeps features with any other (non-empty) biotype.
        out_file: output path. When falsy, a temporary ``.gtf`` file is
            created and its path is used instead.

    Returns:
        The path of the output file. An existing ``out_file`` is returned
        unchanged.
    """
    if out_file and file_exists(out_file):
        return out_file
    if not out_file:
        # Only the name is needed: close the handle right away instead of
        # leaking the open descriptor. delete=False keeps the file on disk.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".gtf") as tmp:
            out_file = tmp.name

    want_coding = bool(coding)

    def keep(biotype):
        # Features without a biotype belong to neither partition.
        if not biotype:
            return False
        return (biotype == "protein_coding") == want_coding

    biotype_lookup = _biotype_lookup_fn(gtf)

    db = get_gtf_db(gtf)
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for feature in db.all_features():
                if keep(biotype_lookup(feature)):
                    out_handle.write(str(feature) + "\n")
    return out_file
示例#3
0
def batch_callable_bed(bam_files, output_bed_file, work_dir, genome_fasta_file, min_depth,
                       parall_view=None):
    """Build a BED of regions that are callable in most samples of a batch.

    ``_calculate`` is run for every BAM in ``bam_files`` (all of them; no
    sub-sampling is done), through ``parall_view`` when given, otherwise
    through a temporary parallel view with one thread per BAM. The per-sample
    BEDs are multi-intersected and only the intervals whose 5th column lists
    at least ``max(1, 0.8 * number_of_beds)`` comma-separated entries are
    saved to ``output_bed_file``.

    Returns ``output_bed_file``; an output accepted by ``can_reuse`` is
    returned without recomputing.
    """
    if can_reuse(output_bed_file, bam_files):
        return output_bed_file

    work_dir = safe_mkdir(join(work_dir, 'callable_work'))

    # One argument list per BAM, shared by both execution paths below.
    jobs = [[bam, work_dir, genome_fasta_file, min_depth] for bam in bam_files]

    if parall_view:
        callable_beds = parall_view.run(_calculate, jobs)
    else:
        n_bams = len(bam_files)
        with parallel_view(n_bams, ParallelCfg(threads=n_bams), work_dir) as view:
            callable_beds = view.run(_calculate, jobs)

    # We want to pick those regions that have coverage at 80% of samples.
    good_overlap_sample_fraction = 0.8
    good_overlap_count = max(1, good_overlap_sample_fraction * len(callable_beds))
    info(f'Intersecting callable regions and picking good overlaps with >={good_overlap_count} '
         f'samples ({100 * good_overlap_sample_fraction}% of {len(callable_beds)})')

    def _in_enough_samples(r):
        # Column 5 is split on commas and its entries counted.
        return len(r[4].split(',')) >= good_overlap_count

    with file_transaction(work_dir, output_bed_file) as tx:
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        multi = pybedtools.BedTool().multi_intersect(i=callable_beds)
        multi.filter(_in_enough_samples).saveas(tx)
    info(f'Saved to {output_bed_file}')
    return output_bed_file
def main():
    """Command-line entry point: save the canonical CDS regions of a genome.

    Reads the genome build from ``-g/--genome``, fetches all features with
    ``ba.get_all_features``, keeps the 'CDS' features that pass the canonical
    filter, and writes the first six columns to
    ``<this dir>/../<genome>/bed/CDS-canonical.bed``.
    """
    parser = OptionParser()
    parser.add_option(
        '-g', '--genome',
        dest='genome',
        help='Genome build. Accepted values: ' + ', '.join(ba.SUPPORTED_GENOMES),
    )
    opts, _ = parser.parse_args()

    genome = opts.genome
    if not genome:
        critical('Error: please, specify genome build name with -g (e.g. `-g hg19`)')

    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ba.SUPPORTED_GENOMES))

    info('Extracting features from Ensembl GTF')

    def _is_cds(feature):
        return feature[ba.BedCols.FEATURE] == 'CDS'

    cds_bed = features_bed.filter(_is_cds).filter(ba.get_only_canonical_filter(genome))

    info('Saving CDS regions...')
    output_fpath = adjust_path(join(dirname(__file__), pardir, genome, 'bed', 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        cds_bed.cut(range(6)).saveas(tx)
    info('Done, saved to ' + output_fpath)
示例#5
0
def filter_bed_with_gene_set(bed_fpath, gene_keys_set, output_fpath):
    """Copy the lines of a 4-column BED whose (gene, chrom) key is selected.

    Every non-empty line of ``bed_fpath`` must split on tabs into exactly
    chrom, start, end and gene. Lines whose ``(gene, chrom)`` pair is in
    ``gene_keys_set`` are written unchanged to ``output_fpath``.
    """
    with file_transaction(None, output_fpath) as tx:
        with open(bed_fpath) as inp, open(tx, 'w') as out:
            for line in inp:
                record = line.strip('\n')
                if not record:
                    continue
                chrom, _start, _end, gene = record.split('\t')
                if (gene, chrom) in gene_keys_set:
                    out.write(line)
示例#6
0
def filter_bed_with_gene_set(bed_fpath, gene_keys_set, output_fpath):
    """Keep only the BED lines whose (gene, chrom) pair is in the given set.

    The input is read as tab-separated lines with exactly four fields
    (chrom, start, end, gene); empty lines are skipped. Matching lines are
    written as-is to ``output_fpath``.
    """
    def _selected(text):
        # Exactly four tab-separated fields are expected on each line.
        chrom, _, _, gene = text.split('\t')
        return (gene, chrom) in gene_keys_set

    with file_transaction(None, output_fpath) as tx:
        with open(bed_fpath) as inp, open(tx, 'w') as out:
            for raw in inp:
                text = raw.strip('\n')
                if text and _selected(text):
                    out.write(raw)
示例#7
0
文件: sex.py 项目: vladsaveliev/Utils
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    """Infer the sample sex from the coverage of chrY key regions.

    The key regions come from ``chry_key_regions_by_genome``, using the first
    key ``k`` for which ``k in genome`` holds. When ``target_bed`` is given,
    the key regions are intersected with it first. The mean coverage of the
    resulting regions is then compared with ``avg_depth``.

    Args:
        work_dir: directory for intermediate files.
        bam_fpath: BAM file of the sample; ``critical`` is called if falsy.
        avg_depth: average depth of the sample (converted with ``float``).
        genome: genome build name.
        target_bed: optional capture target BED.

    Returns:
        'F' or 'M', or None when sex cannot be determined: no key regions
        for the genome, no overlap between the target and the key regions,
        or an average depth below AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX.
    """
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
              ') - cannot determine sex')
        return None

    # The messages below used to contain the invalid escape "\s", which
    # logged a literal backslash; they now read "it's".
    if chry_mean_coverage == 0:
        # Checked separately to avoid dividing by zero below.
        debug("Y depth is 0 - it's female")
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  " times higher than Y depth - it's female")
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  " times higher than Y depth - it's male")
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
示例#8
0
def sample_callable_bed(bam_file, output_bed_file, work_dir, genome_fasta_file, min_depth):
    """Write the CALLABLE intervals of one sample to ``output_bed_file``.

    The per-sample BED is produced by ``_calculate``; intervals whose name is
    'CALLABLE' are saved unless ``can_reuse`` accepts the existing output.
    Returns ``output_bed_file``.
    """
    callable_bed = _calculate(bam_file, work_dir, genome_fasta_file, min_depth)
    if can_reuse(output_bed_file, callable_bed):
        return output_bed_file

    def _is_callable(interval):
        return interval.name == 'CALLABLE'

    with file_transaction(work_dir, output_bed_file) as tx_out_file:
        pybedtools.BedTool(callable_bed).filter(_is_callable).saveas(tx_out_file)
    return output_bed_file
示例#9
0
def sort_bed(input_bed_fpath,
             output_bed_fpath=None,
             work_dir=None,
             fai_fpath=None,
             chr_order=None,
             genome=None):
    """Sort the regions of a BED file in Python.

    Regions are ordered by ``Region.get_key()``, each region carrying the
    rank of its chromosome taken from ``chr_order`` (-1 for chromosomes not
    in it). When ``chr_order`` is not given it is derived from ``fai_fpath``
    or from the .fai of ``genome``; ``critical`` is called if none of the
    three is available.

    Lines starting with '#' are copied to the output first, in their original
    order; blank lines are dropped. Returns the path of the sorted file.
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical(
                'Either of chr_order, fai_fpath, or genome build name must be specified'
            )
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    regions = []
    with open(input_bed_fpath) as bed_in, \
            file_transaction(work_dir, output_bed_fpath) as tx, \
            open(tx, 'w') as bed_out:
        for line in bed_in:
            stripped = line.strip()
            if not stripped:
                continue
            if stripped.startswith('#'):
                # Header/comment lines go straight to the output.
                bed_out.write(line)
                continue

            cols = stripped.split('\t')
            regions.append(Region(cols[0], int(cols[1]), int(cols[2]), cols[3:],
                                  chr_order.get(cols[0], -1)))

        for region in sorted(regions, key=lambda r: r.get_key()):
            out_cols = [region.chrom, str(region.start), str(region.end), *region.other_fields]
            bed_out.write('\t'.join(out_cols) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' +
          output_bed_fpath)
    return output_bed_fpath
示例#10
0
def run(cmd, output_fpath=None, input_fpaths=None, checks=None, stdout_to_outputfile=True,
        stdout_tx=True, reuse=False, env_vars=None):
    """Run the provided command, logging details and checking for errors.

    Args:
        cmd: the command to run.
        output_fpath: file the command produces. An existing file at this
            path is removed before the command runs.
        input_fpaths: a path or a list of paths; each is verified with
            ``verify_file(..., is_critical=True)`` before running.
        checks: checks passed to ``_do_run``; defaults to
            ``[file_nonempty_check]``.
        stdout_to_outputfile: when True, stdout is redirected to the
            (transactional) output file; when False, occurrences of
            ``output_fpath`` inside ``cmd`` are replaced by the transactional
            path instead.
        stdout_tx: when True, the output is written through
            ``file_transaction``; when False, ``cmd`` runs as is.
        reuse: when True and ``output_fpath`` (or ``output_fpath + '.gz'``)
            already verifies, nothing is run.
        env_vars: passed to ``_get_env`` to build the environment.

    Returns:
        ``output_fpath`` when an existing output is reused, otherwise None.
    """
    if output_fpath and reuse:
        if verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            return output_fpath
        if not output_fpath.endswith('.gz') and verify_file(output_fpath + '.gz', silent=True):
            info(output_fpath + '.gz exists, reusing')
            return output_fpath

    if input_fpaths is not None:
        if isinstance(input_fpaths, str):
            input_fpaths = [input_fpaths]
        for fpath in input_fpaths:
            verify_file(fpath, is_critical=True)

    env = _get_env(env_vars)

    if checks is None:
        checks = [file_nonempty_check]

    def _try_run(_cmd, _output_fpath, _input_fpaths):
        info(_cmd if isinstance(_cmd, str) else ' '.join(str(x) for x in _cmd))
        _do_run(_cmd, checks, env, _output_fpath, _input_fpaths)

    if not output_fpath:
        _try_run(cmd, None, input_fpaths)
        return None

    if isfile(output_fpath):
        os.remove(output_fpath)

    if not stdout_tx:
        _try_run(cmd, output_fpath, input_fpaths)
        return None

    with file_transaction(None, output_fpath) as tx_out_file:
        if stdout_to_outputfile:
            cmd += ' > ' + tx_out_file
        else:
            # Swap the output path for the transactional one wherever it
            # appears as a whole argument: bare, double- or single-quoted,
            # followed by a space or by the end of the command (marked with
            # a temporary '\n'). Both quotes are kept around the new path;
            # previously the opening quote was dropped, leaving the shell
            # command with unbalanced quotes.
            cmd += '\n'
            for quote in ('', '"', "'"):
                for terminator in (' ', '\n'):
                    cmd = cmd.replace(
                        ' ' + quote + output_fpath + quote + terminator,
                        ' ' + quote + tx_out_file + quote + terminator)
            cmd = cmd.replace('\n', '')
        _try_run(cmd, tx_out_file, input_fpaths)
    return None
示例#11
0
def merge_overlaps(work_dir, bed_fpath, distance=None):
    """Collapse overlapping intervals of a BED file into single regions.

    Some callers (FreeBayes is the example given by the original author) do
    not merge overlapping regions such as 1:1-100 and 1:90-100 themselves.
    A truthy ``distance`` is passed to the merge as ``d``.

    Returns the path of the merged file; an existing one that verifies
    against ``bed_fpath`` is returned as is.
    """
    output_fpath = intermediate_fname(work_dir, bed_fpath, 'merged')
    up_to_date = isfile(output_fpath) and verify_file(output_fpath, cmp_f=bed_fpath)
    if up_to_date:
        return output_fpath

    merge_opts = {}
    if distance:
        merge_opts['d'] = distance

    with file_transaction(work_dir, output_fpath) as tx:
        BedTool(bed_fpath).merge(**merge_opts).saveas(tx)
    return output_fpath
示例#12
0
def clean_bed(bed_fpath, work_dir):
    """Write a copy of a BED file without header-like and invalid records.

    Records with an empty chromosome field, or whose chromosome starts with
    '#', ' ', 'track' or 'browser', are dropped, then ``remove_invalid`` is
    applied. The result is verified with ``verify_bed``. Returns the path of
    the clean file; an output accepted by ``can_reuse`` is returned as is.
    """
    clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
    if can_reuse(clean_fpath, bed_fpath):
        return clean_fpath

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    junk_prefixes = ('#', ' ', 'track', 'browser')

    def _is_data_record(interval):
        return interval.chrom and not interval.chrom.startswith(junk_prefixes)

    cleaned = BedTool(bed_fpath).filter(_is_data_record).remove_invalid()
    with file_transaction(work_dir, clean_fpath) as tx_out_file:
        cleaned.saveas(tx_out_file)
    verify_bed(clean_fpath, is_critical=True)
    debug('Saved clean BED file into ' + clean_fpath)
    return clean_fpath
示例#13
0
def merge_overlaps(work_dir, bed_fpath, distance=None):
    """Merge the intervals of a BED file so that no regions overlap.

    Overlapping regions (e.g. 1:1-100 and 1:90-100) are reported by the
    original author to cause issues with callers that do not collapse BEDs
    themselves. A truthy ``distance`` is forwarded to the merge as ``d``.

    Returns the merged file path, reusing an existing file that verifies
    against ``bed_fpath``.
    """
    output_fpath = intermediate_fname(work_dir, bed_fpath, 'merged')
    if not isfile(output_fpath) or not verify_file(output_fpath, cmp_f=bed_fpath):
        with file_transaction(work_dir, output_fpath) as tx:
            import pybedtools
            merge_opts = {'d': distance} if distance else {}
            merged = pybedtools.BedTool(bed_fpath).merge(**merge_opts)
            merged.saveas(tx)
    return output_fpath
示例#14
0
def tx2genefile(gtf, out_file=None):
    """Return a CSV file of transcript,gene pairs for a GTF.

    A ``tx2gene.csv`` sitting next to ``gtf`` is preferred, then an existing
    ``out_file``. Otherwise the pairs returned by ``transcript_to_gene(gtf)``
    are written to ``out_file``, one "transcript,gene" line each.
    """
    installed_tx2gene = os.path.join(os.path.dirname(gtf), "tx2gene.csv")
    for candidate in (installed_tx2gene, out_file):
        if file_exists(candidate):
            return candidate

    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            mapping = transcript_to_gene(gtf)
            out_handle.writelines(",".join(pair) + "\n" for pair in mapping.items())
    return out_file
示例#15
0
def clean_bed(bed_fpath, work_dir):
    """Produce a cleaned copy of a BED file and return its path.

    Drops records whose chromosome field is empty or begins with '#', ' ',
    'track' or 'browser', applies ``remove_invalid``, saves the result and
    verifies it with ``verify_bed``. Nothing is recomputed when ``can_reuse``
    accepts the existing output.
    """
    clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
    if can_reuse(clean_fpath, bed_fpath):
        return clean_fpath

    import pybedtools
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    skipped_prefixes = ('#', ' ', 'track', 'browser')

    def _keep(rec):
        return rec.chrom and not rec.chrom.startswith(skipped_prefixes)

    bed = pybedtools.BedTool(bed_fpath)
    bed = bed.filter(_keep)
    bed = bed.remove_invalid()
    with file_transaction(work_dir, clean_fpath) as tx_out_file:
        bed.saveas(tx_out_file)
    verify_bed(clean_fpath, is_critical=True)
    debug('Saved clean BED file into ' + clean_fpath)
    return clean_fpath
示例#16
0
def run(cmd, output_fpath=None, input_fpath=None, checks=None, stdout_to_outputfile=True,
        stdout_tx=True, reuse=False, env_vars=None):
    """Run the provided command, logging details and checking for errors.

    Args:
        cmd: the command to run.
        output_fpath: file the command produces. An existing file at this
            path is removed before the command runs.
        input_fpath: forwarded unchanged to ``_do_run``.
        checks: checks passed to ``_do_run``; defaults to
            ``[file_nonempty_check]``.
        stdout_to_outputfile: when True, stdout is redirected to the
            (transactional) output file; when False, occurrences of
            ``output_fpath`` inside ``cmd`` are replaced by the transactional
            path instead.
        stdout_tx: when True, the output is written through
            ``file_transaction``; when False, ``cmd`` runs as is.
        reuse: when True and ``output_fpath`` (or ``output_fpath + '.gz'``)
            already verifies, nothing is run.
        env_vars: passed to ``_get_env`` to build the environment.

    Returns:
        ``output_fpath`` when an existing output is reused, otherwise None.
    """
    if output_fpath and reuse:
        if verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            return output_fpath
        if not output_fpath.endswith('.gz') and verify_file(output_fpath + '.gz', silent=True):
            info(output_fpath + '.gz exists, reusing')
            return output_fpath

    env = _get_env(env_vars)

    if checks is None:
        checks = [file_nonempty_check]

    def _try_run(_cmd, _output_fpath, _input_fpath):
        info(_cmd if isinstance(_cmd, str) else ' '.join(str(x) for x in _cmd))
        _do_run(_cmd, checks, env, _output_fpath, _input_fpath)

    if not output_fpath:
        _try_run(cmd, None, input_fpath)
        return None

    if isfile(output_fpath):
        os.remove(output_fpath)

    if not stdout_tx:
        _try_run(cmd, output_fpath, input_fpath)
        return None

    with file_transaction(None, output_fpath) as tx_out_file:
        if stdout_to_outputfile:
            cmd += ' > ' + tx_out_file
        else:
            # Swap the output path for the transactional one wherever it
            # appears as a whole argument: bare, double- or single-quoted,
            # followed by a space or by the end of the command (marked with
            # a temporary '\n'). Both quotes are kept around the new path;
            # previously the opening quote was dropped, leaving the shell
            # command with unbalanced quotes.
            cmd += '\n'
            for quote in ('', '"', "'"):
                for terminator in (' ', '\n'):
                    cmd = cmd.replace(
                        ' ' + quote + output_fpath + quote + terminator,
                        ' ' + quote + tx_out_file + quote + terminator)
            cmd = cmd.replace('\n', '')
        _try_run(cmd, tx_out_file, input_fpath)
    return None
示例#17
0
def sort_bed(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, chr_order=None, genome=None):
    """Sort BED regions using a chromosome order and ``Region.get_key()``.

    ``chr_order`` maps chromosome names to ranks; chromosomes missing from it
    get rank -1. Without it, the order is read from ``fai_fpath`` or from the
    .fai of ``genome`` (``critical`` is called if all three are missing).

    '#' lines are written to the output before the sorted regions, in input
    order, and blank lines are skipped. Returns the output path; an output
    accepted by ``can_reuse`` is returned without re-sorting.
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = (
        adjust_path(output_bed_fpath) if output_bed_fpath
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted'))

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical('Either of chr_order, fai_fpath, or genome build name must be specified')
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    def _parse(text):
        # chrom, start, end, then any number of extra columns.
        cols = text.split('\t')
        return Region(cols[0], int(cols[1]), int(cols[2]), cols[3:], chr_order.get(cols[0], -1))

    def _render(reg):
        return '\t'.join([reg.chrom, str(reg.start), str(reg.end)] + list(reg.other_fields)) + '\n'

    regions = []
    with open(input_bed_fpath) as f:
        with file_transaction(work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for raw in f:
                    text = raw.strip()
                    if text.startswith('#'):
                        out.write(raw)
                    elif text:
                        regions.append(_parse(text))

                regions.sort(key=lambda r: r.get_key())
                for reg in regions:
                    out.write(_render(reg))

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' + output_bed_fpath)
    return output_bed_fpath
示例#18
0
def main():
    """Command-line entry point: save the CDS regions of a genome build.

    Options: ``-g/--genome`` (required) selects the build; ``-c/--canonical``
    additionally applies the canonical filter. The first six columns of the
    'CDS' features are written to
    ``<this dir>/../<genome>/bed/CDS-canonical.bed`` whether or not
    ``--canonical`` was given.
    """
    parser = OptionParser()
    parser.add_option(
        '-g', '--genome',
        dest='genome',
        help='Genome build. Accepted values: ' + ', '.join(ebl.SUPPORTED_GENOMES),
    )
    parser.add_option(
        '-c', '--canonical',
        dest='canonical',
        action='store_true',
        help='Use canonical only',
    )
    opts, _ = parser.parse_args()

    genome = opts.genome
    if not genome:
        logger.critical(
            'Error: please, specify genome build name with -g (e.g. `-g hg19`)'
        )

    logger.debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        logger.critical('Genome ' + genome + ' is not supported. Supported: ' +
                        ', '.join(ebl.SUPPORTED_GENOMES))

    logger.warn('Extracting features from Ensembl GTF')

    def _is_cds(feature):
        return feature[ebl.BedCols.FEATURE] == 'CDS'

    features_bed = features_bed.filter(_is_cds)
    if opts.canonical:
        canonical_only = ebl.get_only_canonical_filter(genome)
        features_bed = features_bed.filter(canonical_only)

    logger.warn('Saving CDS regions...')
    bed_dir = join(dirname(__file__), pardir, genome, 'bed')
    output_fpath = adjust_path(join(bed_dir, 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        features_bed.cut(range(6)).saveas(tx)
    logger.warn('Done, saved to ' + output_fpath)
示例#19
0
def sort_bed_gsort(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, genome=None):
    """Sort a BED file by calling the external ``gsort`` command.

    ``gsort`` receives the input BED and a .fai index: ``fai_fpath`` if
    given, else the index returned by ``ref.get_fai(genome)``. ``critical``
    is called when neither is available.

    Returns the sorted file path, which is ``output_bed_fpath`` or, when that
    is not given, an intermediate name derived from the input. An output
    accepted by ``can_reuse`` is returned without running ``gsort``.
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = (
        adjust_path(output_bed_fpath) if output_bed_fpath
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted'))

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        critical('Either of fai_fpath or genome build name must be specified')

    with file_transaction(work_dir, output_bed_fpath) as tx:
        run(f'gsort {input_bed_fpath} {fai_fpath}', output_fpath=tx)

    return output_bed_fpath
示例#20
0
def _calculate(bam_file, work_dir, genome_fasta_file, min_depth):
    """Compute the CALLABLE regions of one BAM with ``goleft depth``.

    ``goleft depth`` is run with prefix ``<work_dir>/<sample name>``, and the
    ``<prefix>.callable.bed`` file is then filtered down to the intervals
    named 'CALLABLE', saved as ``<prefix>.callable.CALLABLE.bed``. Each step
    is skipped when ``can_reuse`` accepts its existing output.

    NOTE(review): the original author notes that the underlying samtools
    depth skips unmapped, secondary, QC-failed and duplicate reads
    (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) - not verifiable
    from this function.

    Returns the path of the CALLABLE-only BED.
    """
    output_prefix = os.path.join(work_dir, bam_samplename(bam_file))
    callability_annotation_file = output_prefix + '.callable.bed'
    callable_file = output_prefix + '.callable.CALLABLE.bed'

    if not can_reuse(callability_annotation_file, bam_file):
        info(f'Calculating coverage at {bam_file}')
        depth_cmd = (
            f'goleft depth --q 1 --mincov {min_depth} --reference {genome_fasta_file} --ordered'
            f' --prefix {output_prefix} {bam_file}'
        )
        run(depth_cmd)

    if can_reuse(callable_file, callability_annotation_file):
        return callable_file

    def _is_callable(interval):
        return interval.name == 'CALLABLE'

    with file_transaction(None, callable_file) as tx:
        annotated = pybedtools.BedTool(callability_annotation_file)
        annotated.filter(_is_callable).saveas(tx)

    return callable_file
示例#21
0
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    """Infer the sample sex from the coverage of chrY key regions.

    The key regions come from ``chry_key_regions_by_genome``, using the first
    key ``k`` for which ``k in genome`` holds. When ``target_bed`` is given,
    the key regions are intersected with it first. The mean coverage of the
    resulting regions is then compared with ``avg_depth``.

    Args:
        work_dir: directory for intermediate files.
        bam_fpath: BAM file of the sample; ``critical`` is called if falsy.
        avg_depth: average depth of the sample (converted with ``float``).
        genome: genome build name.
        target_bed: optional capture target BED.

    Returns:
        'F' or 'M', or None when sex cannot be determined: no key regions
        for the genome, no overlap between the target and the key regions,
        or an average depth below AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX.
    """
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome +
             ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug(
                'The male non-PAR region does not overlap with the capture target - cannot determine sex.'
            )
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info(
        'Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.'
    )
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' +
              str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
              ') - cannot determine sex')
        return None

    # The messages below used to contain the invalid escape "\s", which
    # logged a literal backslash; they now read "it's".
    if chry_mean_coverage == 0:
        # Checked separately to avoid dividing by zero below.
        debug("Y depth is 0 - it's female")
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' +
                  str(FEMALE_Y_COVERAGE_FACTOR) +
                  " times higher than Y depth - it's female")
            sex = 'F'
        else:
            debug('Sample depth is not more than ' +
                  str(FEMALE_Y_COVERAGE_FACTOR) +
                  " times higher than Y depth - it's male")
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
示例#22
0
def annotate(input_bed_fpath, output_fpath, work_dir, genome=None,
             reannotate=True, high_confidence=False, only_canonical=False,
             coding_only=False, short=False, extended=False, is_debug=False, **kwargs):
    """Annotate the regions of a BED file with gene features and save them.

    The features returned by ``ba.get_all_features(genome)`` are optionally
    filtered (high confidence / canonical only / protein coding only),
    restricted to exon, CDS, stop_codon and transcript records, and passed
    together with the sorted input regions to ``_annotate``.

    Args:
        input_bed_fpath: BED file to annotate; it is sorted with ``sort_bed``
            before use.
        output_fpath: where the annotated regions are written.
        work_dir: directory for intermediate and temporary files.
        genome: genome build name. When falsy, the chromosome order is taken
            from the input BED via ``bed_chrom_order`` and no .fai is used.
        reannotate: forwarded to ``_annotate``; forced to True when the input
            has exactly 3 columns.
        high_confidence, only_canonical, coding_only: enable the matching
            feature filters.
        short: write only the first 4 columns.
        extended: write all columns but the last one, preceded by '##'
            description lines and a header line. Takes precedence over
            ``short``.
        is_debug: also save the regions and the features into ``work_dir``.
        **kwargs: forwarded to ``_annotate``.

    Returns:
        ``output_fpath``.
    """

    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ba.SUPPORTED_GENOMES))

    if genome:
        fai_fpath = reference_data.get_fai(genome)
        chr_order = reference_data.get_chrom_order(genome)
    else:
        # No reference build: fall back to the order found in the input BED.
        fai_fpath = None
        chr_order = bed_chrom_order(input_bed_fpath)

    input_bed_fpath = sort_bed(input_bed_fpath, work_dir=work_dir, chr_order=chr_order, genome=genome)

    ori_bed = BedTool(input_bed_fpath)
    ori_col_num = ori_bed.field_count()
    # A 3-column BED carries no annotation of its own, so always annotate it.
    reannotate = reannotate or ori_col_num == 3
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    # Re-created after the temp dir has been set.
    ori_bed = BedTool(input_bed_fpath)

    if high_confidence:
        features_bed = features_bed.filter(ba.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ba.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ba.protein_coding_filter)
    # unique_tx_by_gene = find_best_tx_by_gene(features_bed)

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x:
                                       x[ba.BedCols.FEATURE] in ['exon', 'CDS', 'stop_codon', 'transcript'])
        # x[ebl.BedCols.ENSEMBL_ID] == unique_tx_by_gene[x[ebl.BedCols.GENE]])

    info('Overlapping regions with Ensembl data')
    if is_debug:
        ori_bed = ori_bed.saveas(join(work_dir, 'bed.bed'))
        debug(f'Saved regions to {ori_bed.fn}')
        features_bed = features_bed.saveas(join(work_dir, 'features.bed'))
        debug(f'Saved features to {features_bed.fn}')
    # NOTE(review): high_confidence is passed as False regardless of the
    # argument - presumably because the filter was already applied above;
    # confirm against _annotate.
    annotated = _annotate(ori_bed, features_bed, chr_order, fai_fpath, work_dir, ori_col_num,
                          high_confidence=False, reannotate=reannotate, is_debug=is_debug, **kwargs)

    full_header = [ba.BedCols.names[i] for i in ba.BedCols.cols]
    # The last column is appended only when the input had more than 3 columns.
    add_ori_extra_fields = ori_col_num > 3
    if not reannotate and ori_col_num == 4:
        add_ori_extra_fields = False  # no need to report the original gene field if we are not re-annotating

    info('Saving annotated regions...')
    total = 0
    with file_transaction(work_dir, output_fpath) as tx:
        with open(tx, 'w') as out:
            # Column selection: 6 by default, 4 when short, all but the last
            # when extended (extended wins over short).
            header = full_header[:6]
            if short:
                header = full_header[:4]
            if extended:
                header = full_header[:-1]
            if add_ori_extra_fields:
                header.append(full_header[-1])

            # The '##' descriptions and the header line are written only in
            # extended mode.
            if extended:
                out.write('## ' + ba.BedCols.names[ba.BedCols.TX_OVERLAP_PERCENTAGE] +
                          ': part of region overlapping with transcripts\n')
                out.write('## ' + ba.BedCols.names[ba.BedCols.EXON_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with exons\n')
                out.write('## ' + ba.BedCols.names[ba.BedCols.CDS_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with protein coding regions\n')
                out.write('\t'.join(header) + '\n')
            for full_fields in annotated:
                # Same column selection as for the header above.
                fields = full_fields[:6]
                if short:
                    fields = full_fields[:4]
                if extended:
                    fields = full_fields[:-1]
                if add_ori_extra_fields:
                    fields.append(full_fields[-1])

                out.write('\t'.join(map(_format_field, fields)) + '\n')
                total += 1
    
    debug('Saved ' + str(total) + ' total annotated regions')
    return output_fpath
示例#23
0
def _make_snp_file(dbsnp_snps_file, genome_build, output_file,
                   autosomal_locations_limit=175, min_snp_amount=30):
    """Pick a reproducible random set of autosomal SNPs and save it as a BED.

    Each record of ``dbsnp_snps_file`` must have a name of the form
    ``rsid|gene|ref|alts``. Records on sex chromosomes are skipped. Up to
    ``autosomal_locations_limit`` genes are sampled, SNPs that follow the
    previously kept one by less than 500 bp are dropped, and up to
    ``autosomal_locations_limit`` of the remaining SNPs are sampled and
    sorted by chromosome order and position.

    Args:
        dbsnp_snps_file: input BED with ``rsid|gene|ref|alts`` names.
        genome_build: passed to ``get_chrom_order`` for the final sort.
        output_file: output BED path.
        autosomal_locations_limit: maximum number of genes and of SNPs.
        min_snp_amount: currently unused; kept for interface compatibility.

    Returns:
        ``output_file``; an output accepted by ``can_reuse`` is returned
        without recomputing.
    """
    if can_reuse(output_file, dbsnp_snps_file):
        return output_file

    locs_by_gene = defaultdict(list)
    for interval in BedTool(dbsnp_snps_file):
        if is_sex_chrom(interval.chrom):
            continue
        pos = int(interval.start) + 1  # BED start is 0-based
        rsid, gene, ref, alts = interval.name.split('|')
        locs_by_gene[gene].append((interval.chrom, pos, rsid, gene, ref, alts))

    random.seed(1234)  # seeding random for reproducibility

    # Selecting random genes. random.sample() needs a sequence: passing a
    # dict view raises TypeError since Python 3.11. list() keeps insertion
    # order, so the seeded selection is unchanged.
    gnames = random.sample(list(locs_by_gene), min(len(locs_by_gene), autosomal_locations_limit))
    all_locs = [loc for g in gnames for loc in locs_by_gene[g]]

    # Selecting unclustered SNPs: skip a SNP lying less than 500 bp after
    # the previously kept one.
    non_clustered_locs = []
    prev_pos = 0
    for loc in all_locs:
        pos = loc[1]
        if 0 < pos - prev_pos < 500:
            continue
        prev_pos = pos
        non_clustered_locs.append(loc)

    # Selecting random SNPs within the limit
    selected_locs = random.sample(non_clustered_locs, min(len(non_clustered_locs), autosomal_locations_limit))

    # Sorting final locations
    chrom_order = get_chrom_order(genome_build)
    selected_locs.sort(key=lambda a: (chrom_order.get(a[0], -1), a[1:]))

    log.debug('Selected the following autosomal SNPs:')
    for (chrom, pos, rsid, gene, ref, alts) in selected_locs:
        # `alts` is already a string; joining it with ',' would put a comma
        # between every character.
        log.debug('  ' + chrom + ':' + str(pos) + '\t' + rsid + '\t' + gene + '\t' + ref + '>' + alts)

    with file_transaction(None, output_file) as tx:
        with open(tx, 'w') as out:
            for (chrom, pos, rsid, gene, ref, alts) in selected_locs:
                out.write('\t'.join([chrom, str(pos-1), str(pos), rsid + '|' + gene + '|' + ref + '|' + alts]) + '\n')
    return output_file