Example #1
def get_chrom_lengths(genome=None, fai_fpath=None):
    assert genome or fai_fpath, 'One of genome or fai_fpath should be not None: ' \
                                'genome=' + str(genome) + ' fai_fpath=' + str(fai_fpath)

    if not fai_fpath:
        check_genome(genome)
        fai_fpath = get_fai(genome)
    else:
        fai_fpath = verify_file(fai_fpath, is_critical=True)
        if not fai_fpath.endswith('.fai') and not fai_fpath.endswith('.fa'):
            critical('Error: only .fai or .fa files are accepted.')

    chr_lengths = []

    if fai_fpath.endswith('.fa'):
        debug('Reading genome sequence (.fa) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            from Bio import SeqIO
            reference_records = SeqIO.parse(handle, 'fasta')
            for record in reference_records:
                chrom = record.id
                chr_lengths.append((chrom, len(record.seq)))

    else:
        debug('Reading genome index file (.fai) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            for line in handle:
                line = line.strip()
                if line:
                    fields = line.split()
                    chrom, length = fields[0], int(fields[1])
                    chr_lengths.append((chrom, length))

    return chr_lengths
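A minimal usage sketch for the function above; the call site and file paths are hypothetical, and the function is assumed to be importable from the surrounding module:

# Hypothetical usage; either a genome build name or an explicit .fai/.fa
# path can be passed, as enforced by the assert at the top of the function.
chrom_lengths = get_chrom_lengths(fai_fpath='/path/to/hg19.fa.fai')
for chrom, length in chrom_lengths[:3]:
    print(chrom, length)  # e.g. chr1 249250621 for hg19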
Example #2
def sort_bed_gsort(input_bed_fpath,
                   output_bed_fpath=None,
                   work_dir=None,
                   fai_fpath=None,
                   genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        critical('Either fai_fpath or the genome build name must be specified')

    with file_transaction(work_dir, output_bed_fpath) as tx:
        run('gsort {input_bed_fpath} {fai_fpath}'.format(**locals()),
            output_fpath=tx)

    return output_bed_fpath
Example #3
def read_samples(args):
    bam_by_sample = find_bams(args)
    if bam_by_sample:
        info('Found ' + str(len(bam_by_sample)) + ' BAM file' +
             ('s' if len(bam_by_sample) > 1 else ''))

    input_not_bam = [
        verify_file(fpath) for fpath in args
        if adjust_path(fpath) not in bam_by_sample
    ]
    input_not_bam = [fpath for fpath in input_not_bam if fpath]
    fastqs_by_sample = dict()
    if not input_not_bam and not bam_by_sample:
        critical('No correct input files')
    if input_not_bam:
        info('Found ' + str(len(input_not_bam)) +
             ' correct non-BAM input files')
        fastqs_by_sample = find_fastq_pairs(input_not_bam)
        if fastqs_by_sample:
            info('Found ' + str(len(fastqs_by_sample)) + ' FastQ pairs')
        intersection = set(fastqs_by_sample.keys()) & set(bam_by_sample.keys())
        if intersection:
            critical('The following samples both had input BAMs and FastQ: ' +
                     ', '.join(list(intersection)))

    return fastqs_by_sample, bam_by_sample
Example #4
def tx_tmpdir(base_dir, rollback_dirpath):
    """Context manager to create and remove a transactional temporary directory.
    """
    # tmp_dir_base = join(base_dir, 'tx', str(uuid.uuid4()))
    # unique_attempts = 0
    # while os.path.exists(tmp_dir_base):
    #     if unique_attempts > 5:
    #         break
    #     tmp_dir_base = join(base_dir, 'tx', str(uuid.uuid4()))
    #     time.sleep(1)
    #     unique_attempts += 1

    # if base_dir is not None:
    #     tmp_dir_base = os.path.join(base_dir, "tx")
    # else:
    #     tmp_dir_base = os.path.join(os.getcwd(), "tx")
    if exists(rollback_dirpath):
        critical(rollback_dirpath + ' already exists')

    tmp_dir = tempfile.mkdtemp(dir=base_dir)
    safe_mkdir(tmp_dir)
    try:
        yield tmp_dir
    finally:
        if tmp_dir and exists(tmp_dir):
            os.rename(tmp_dir, rollback_dirpath)
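The function yields, so it only behaves as advertised when wrapped as a context manager; a sketch of the intended use, assuming the contextlib decorator that the snippet above omits:

import contextlib

# Assumption: in the source project the function carries this decorator,
# which turns the yield into a with-statement context manager.
tx_tmpdir = contextlib.contextmanager(tx_tmpdir)

with tx_tmpdir('/path/to/work', '/path/to/work/result') as tmp_dir:
    # write outputs into tmp_dir; on exit it is renamed to the rollback path
    pass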
Example #5
def main():
    options = [
        (['-g', '--genome'], dict(
            dest='genome',
            help='Genome build. Accepted values: ' + ', '.join(ebl.SUPPORTED_GENOMES),
        )),
    ]
    parser = OptionParser()
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()

    if not opts.genome:
        critical('Error: please specify genome build name with -g (e.g. `-g hg19`)')
    genome = opts.genome

    debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ebl.SUPPORTED_GENOMES))

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x: x[ebl.BedCols.FEATURE] == 'CDS')
    features_bed = features_bed.filter(ebl.get_only_canonical_filter(genome))

    info('Saving CDS regions...')
    output_fpath = adjust_path(join(dirname(__file__), pardir, genome, 'bed', 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        features_bed.cut(range(6)).saveas(tx)
    info('Done, saved to ' + output_fpath)
Example #6
def _get(relative_path, genome=None):
    """
    :param relative_path: relative path of the file inside the repository
    :param genome: genome name. Can contain chromosome name after comma, like hg19-chr20,
                   in case of BED, the returning BedTool will be with added filter.
    :return: BedTools object if it's a BED file, or filepath
    """
    chrom = None
    if genome:
        if '-chr' in genome:
            genome, chrom = genome.split('-')
        check_genome(genome)
        relative_path = relative_path.format(genome=genome)

    path = abspath(join(dirname(__file__), relative_path))
    if not isfile(path) and isfile(path + '.gz'):
        path += '.gz'

    if path.endswith('.bed') or path.endswith('.bed.gz'):
        if path.endswith('.bed.gz'):
            bedtools = which('bedtools')
            if not bedtools:
                critical('bedtools not found in PATH: ' + str(os.environ['PATH']))
            debug('BED is compressed, creating BedTool')
            bed = BedTool(path)
        else:
            debug('BED is uncompressed, creating BedTool')
            bed = BedTool(path)

        if chrom:
            debug('Filtering BEDTool for chrom ' + chrom)
            bed = bed.filter(lambda r: r.chrom == chrom)
        return bed
    else:
        return path
Example #7
def read_biomart(genome_name):
    features_by_ens_id = dict()
    bm_fpath = ebl.biomart_fpath(genome_name)
    if not verify_file(bm_fpath):
        warn('Warning: biomart file for genome ' + genome_name +
             ' not found, skip using the TSL values')
        return dict()

    with open(bm_fpath) as f:
        for r in csv.DictReader(f, delimiter='\t'):
            features_by_ens_id[r['Transcript ID']] = r

    # hg38 version has TSL, checking if we can populate some TSL from it
    if not genome_name.startswith('hg38'):
        bm_fpath = ebl.biomart_fpath('hg38')
        if not verify_file(bm_fpath):
            critical(
                'Biomart file for hg38 not found, but it is needed for TSL values')
        with open(bm_fpath) as f:
            for r in csv.DictReader(f, delimiter='\t'):
                if r['Transcript ID'] not in features_by_ens_id:
                    features_by_ens_id[r['Transcript ID']] = r
                else:
                    features_by_ens_id[r['Transcript ID']][
                        'Transcript Support Level (TSL)'] = r[
                            'Transcript Support Level (TSL)']
    return features_by_ens_id
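The hg38 backfill above is a plain dict merge that only copies the TSL column for transcripts already seen; a toy sketch with shortened, hypothetical column names:

features = {'ENST1': {'Transcript ID': 'ENST1', 'TSL': None}}
hg38_rows = [{'Transcript ID': 'ENST1', 'TSL': 'tsl1'},
             {'Transcript ID': 'ENST2', 'TSL': 'tsl2'}]

for r in hg38_rows:
    if r['Transcript ID'] not in features:
        features[r['Transcript ID']] = r                # unseen: take whole row
    else:
        features[r['Transcript ID']]['TSL'] = r['TSL']  # seen: copy TSL only

print(features['ENST1']['TSL'])  # tsl1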
Example #8
def find_bams(args):
    bam_by_sample = OrderedDict()
    bad_bam_fpaths = []

    good_args = []
    for arg in args:
        # /ngs/oncology/Analysis/bioscience/Bio_0038_KudosCellLinesExomes/Bio_0038_150521_D00443_0159_AHK2KTADXX/bcbio,Kudos159 /ngs/oncology/Analysis/bioscience/Bio_0038_KudosCellLinesExomes/Bio_0038_150521_D00443_0160_BHKWMNADXX/bcbio,Kudos160
        fpath = arg.split(',')[0]
        fname, ext = splitext(fpath)
        if ext == '.bam':
            bam_fpath = verify_bam(fpath)
            if not bam_fpath:
                bad_bam_fpaths.append(fpath)
            else:
                if len(arg.split(',')) > 1:
                    sname = arg.split(',')[1]
                else:
                    sname = basename(splitext(bam_fpath)[0])
                bam_by_sample[sname] = bam_fpath
                good_args.append(arg)
    if bad_bam_fpaths:
        critical('BAM files cannot be found, empty or not BAMs: ' +
                 ', '.join(bad_bam_fpaths))
    for arg in good_args:
        args.remove(arg)

    return bam_by_sample
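The comment inside the loop shows the accepted argument format: a BAM path optionally followed by a comma and a sample name. A self-contained sketch of just that parsing convention (parse_bam_arg is a hypothetical helper):

from os.path import basename, splitext

def parse_bam_arg(arg):
    # 'data/sample1.bam,Kudos159' -> ('Kudos159', 'data/sample1.bam')
    # 'data/sample1.bam'          -> ('sample1', 'data/sample1.bam')
    parts = arg.split(',')
    fpath = parts[0]
    sname = parts[1] if len(parts) > 1 else basename(splitext(fpath)[0])
    return sname, fpath

print(parse_bam_arg('data/sample1.bam,Kudos159'))  # ('Kudos159', 'data/sample1.bam')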
Example #9
File: sex.py Project: vladsaveliev/TargQC
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
             ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        debug('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + ' times higher than Y depth - it\'s male')
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
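Stripped of I/O, the sex call reduces to a decision rule on two depth values; a self-contained sketch where the threshold values are illustrative assumptions, not the project's actual constants:

def call_sex(avg_depth, chry_mean_coverage,
             min_depth=5.0, female_y_factor=10.0):
    # Illustrative thresholds only; the real constants live in the project.
    if avg_depth < min_depth:
        return None                 # too shallow to call anything
    if chry_mean_coverage == 0:
        return 'F'                  # no Y coverage at all
    if avg_depth / chry_mean_coverage > female_y_factor:
        return 'F'                  # Y region much shallower than the sample
    return 'M'

print(call_sex(30.0, 0))     # F
print(call_sex(30.0, 14.0))  # M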
Example #10
def load_yaml_config(fpath):
    verify_file(fpath, is_critical=True)
    try:
        with open(fpath) as f:
            dic = load_yaml(f)
    except Exception:
        err(format_exc())
        critical('Could not parse bcbio YAML ' + fpath)
    else:
        return dic
Example #11
def sort_bed(input_bed_fpath,
             output_bed_fpath=None,
             work_dir=None,
             fai_fpath=None,
             chr_order=None,
             genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    regions = []

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical(
                'One of chr_order, fai_fpath, or genome build name must be specified'
            )
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    with open(input_bed_fpath) as f:
        with file_transaction(work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for l in f:
                    if not l.strip():
                        continue
                    if l.strip().startswith('#'):
                        out.write(l)
                        continue

                    fs = l.strip().split('\t')
                    chrom = fs[0]
                    start = int(fs[1])
                    end = int(fs[2])
                    other_fields = fs[3:]
                    order = chr_order.get(chrom, -1)
                    regions.append(
                        Region(chrom, start, end, other_fields, order))

                for region in sorted(regions, key=lambda r: r.get_key()):
                    fs = [region.chrom, str(region.start), str(region.end)]
                    fs.extend(region.other_fields)
                    out.write('\t'.join(fs) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' +
          output_bed_fpath)
    return output_bed_fpath
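The sort hinges on a chromosome-rank lookup built from the .fai index; a minimal self-contained sketch of that ordering idea (sample data only; (rank, start, end) is a guess at what Region.get_key() returns above):

# Rank chromosomes by their position in the .fai, then sort regions by
# (rank, start, end); unknown chromosomes (rank -1) sort first here.
fai_chroms = ['chr1', 'chr2', 'chr10', 'chrX']
chr_order = {c: i for i, c in enumerate(fai_chroms)}

regions = [('chr10', 100, 200), ('chr1', 500, 600), ('chr1', 5, 50)]
regions.sort(key=lambda r: (chr_order.get(r[0], -1), r[1], r[2]))
print(regions)  # [('chr1', 5, 50), ('chr1', 500, 600), ('chr10', 100, 200)]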
Example #12
def run_qualimap(work_dir,
                 output_dir,
                 output_fpaths,
                 bam_fpath,
                 genome,
                 bed_fpath=None,
                 threads=1):
    info('Analysing ' + bam_fpath)

    safe_mkdir(dirname(output_dir))
    safe_mkdir(output_dir)

    mem_cmdl = ''
    mem_m = get_qualimap_max_mem(bam_fpath)
    mem = str(int(mem_m)) + 'M'
    mem_cmdl = '--java-mem-size=' + mem

    cmdline = (find_executable() +
               ' bamqc --skip-duplicated -nt {threads} {mem_cmdl} -nr 5000 '
               '-bam {bam_fpath} -outdir {output_dir}')

    if genome.startswith('hg') or genome.startswith('GRCh'):
        cmdline += ' -gd HUMAN'
    if genome.startswith('mm'):
        cmdline += ' -gd MOUSE'

    if bed_fpath:
        cmdline += ' -gff {bed_fpath}'
        debug('Using amplicons/capture panel ' + bed_fpath)

    cmdline = cmdline.format(**locals())
    if not all(
            can_reuse(fp, [bam_fpath, bed_fpath] if bed_fpath else [bam_fpath])
            for fp in output_fpaths):
        for fp in output_fpaths:
            if isfile(fp):
                os.remove(fp)
        try:
            run(cmdline, env_vars=dict(DISPLAY=None))
        except subprocess.CalledProcessError as e:
            if 'The alignment file is unsorted.' in e.output:
                info()
                info('BAM file is unsorted; trying to sort and rerun QualiMap')
                sorted_bam_fpath = sort_bam(bam_fpath)
                cmdline = cmdline.replace(bam_fpath, sorted_bam_fpath)
                run(cmdline, env_vars=dict(DISPLAY=None))

    if not all(
            verify_file(
                fp, cmp_f=[bam_fpath, bed_fpath] if bed_fpath else [bam_fpath])
            for fp in output_fpaths):
        critical('Some of the QualiMap results were not generated')

    return output_dir
Example #13
def find_fastq_pairs(fpaths):
    info('Finding FastQ pairs...')
    fastqs_by_sample_name = dict()
    for fpath in fpaths:
        fn, ext = splitext_plus(basename(fpath))
        if ext in ['.fq', '.fq.gz', '.fastq', '.fastq.gz']:
            sname, l_fpath, r_fpath = None, None, None
            if fn.endswith('_1'):
                sname = fn[:-2]
                l_fpath = fpath
            if fn.endswith('_R1'):
                sname = fn[:-3]
                l_fpath = fpath
            if fn.endswith('_2'):
                sname = fn[:-2]
                r_fpath = fpath
            if fn.endswith('_R2'):
                sname = fn[:-3]
                r_fpath = fpath

            if sname:
                m = re.match(r'(.*)_S\d+', sname)
                if m:
                    sname = m.group(1)
                sname = sname.replace('-', '_')
            else:
                sname = fn
                info('Cannot detect file for ' + sname)

            l, r = fastqs_by_sample_name.get(sname, (None, None))
            if l and l_fpath:
                critical('Duplicated left FastQ files for ' + sname + ': ' +
                         l + ' and ' + l_fpath)
            if r and r_fpath:
                critical('Duplicated right FastQ files for ' + sname + ': ' +
                         r + ' and ' + r_fpath)
            fastqs_by_sample_name[sname] = l or l_fpath, r or r_fpath

    fixed_fastqs_by_sample_name = dict()
    for sname, (l, r) in fastqs_by_sample_name.items():
        if not l:
            err('ERROR: for sample ' + sname + ', left reads not found')
        if not r:
            err('ERROR: for sample ' + sname + ', right reads not found')
        if l and r:
            fixed_fastqs_by_sample_name[sname] = l, r

    return fixed_fastqs_by_sample_name
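The pairing keys on the _1/_R1 and _2/_R2 suffixes and strips an Illumina-style _S<n> suffix from the sample name; a self-contained sketch of that naming logic (pair_key is a hypothetical helper):

import re

def pair_key(fname):
    # Returns (sample_name, side), side being 'l' or 'r', or None if unpaired.
    for suffix, side in (('_R1', 'l'), ('_1', 'l'), ('_R2', 'r'), ('_2', 'r')):
        if fname.endswith(suffix):
            sname = fname[:-len(suffix)]
            m = re.match(r'(.*)_S\d+', sname)  # strip Illumina _S<n> suffix
            if m:
                sname = m.group(1)
            return sname.replace('-', '_'), side
    return None

print(pair_key('TumorA_S12_R1'))  # ('TumorA', 'l')
print(pair_key('TumorA_S12_R2'))  # ('TumorA', 'r')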
Example #14
def annotate_target(work_dir, target_bed, genome_build):
    output_fpath = intermediate_fname(work_dir, target_bed, 'ann')
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    annotate_bed_py = which('annotate_bed.py')
    if not annotate_bed_py:
        critical(
            'Error: annotate_bed.py not found in PATH, please install TargQC.')

    cmdline = '{annotate_bed_py} {target_bed} -g {genome_build} -o {output_fpath}'.format(
        **locals())
    run(cmdline, output_fpath, stdout_to_outputfile=False)
    output_fpath = clean_bed(output_fpath, work_dir)
    return output_fpath
Example #15
def run_qualimap(work_dir, output_dir, output_fpaths, bam_fpath, genome, bed_fpath=None, threads=1):
    info('Analysing ' + bam_fpath)

    safe_mkdir(dirname(output_dir))
    safe_mkdir(output_dir)

    mem_cmdl = ''
    mem_m = get_qualimap_max_mem(bam_fpath)
    mem = str(int(mem_m)) + 'M'
    mem_cmdl = '--java-mem-size=' + mem

    cmdline = (find_executable() + ' bamqc --skip-duplicated -nt {threads} {mem_cmdl} -nr 5000 '
        '-bam {bam_fpath} -outdir {output_dir}')

    if genome.startswith('hg') or genome.startswith('GRCh'):
        cmdline += ' -gd HUMAN'
    if genome.startswith('mm'):
        cmdline += ' -gd MOUSE'

    if bed_fpath:
        cmdline += ' -gff {bed_fpath}'
        debug('Using amplicons/capture panel ' + bed_fpath)

    cmdline = cmdline.format(**locals())
    if not all(can_reuse(fp, [bam_fpath, bed_fpath] if bed_fpath else [bam_fpath]) for fp in output_fpaths):
        for fp in output_fpaths:
            if isfile(fp):
                os.remove(fp)
        try:
            run(cmdline, env_vars=dict(DISPLAY=None))
        except subprocess.CalledProcessError as e:
            if 'The alignment file is unsorted.' in e.output:
                info()
                info('BAM file is unsorted; trying to sort and rerun QualiMap')
                sorted_bam_fpath = sort_bam(bam_fpath)
                cmdline = cmdline.replace(bam_fpath, sorted_bam_fpath)
                run(cmdline, env_vars=dict(DISPLAY=None))

    if not all(verify_file(fp, cmp_f=[bam_fpath, bed_fpath] if bed_fpath else [bam_fpath]) for fp in output_fpaths):
        critical('Some of the QualiMap results were not generated')

    return output_dir
Example #16
def calc_bases_within_threshs(bases_by_depth, total_size, depth_thresholds):
    bases_within_threshs = OrderedDict(
        (depth, 0) for depth in depth_thresholds)
    rates_within_threshs = OrderedDict(
        (depth, None) for depth in depth_thresholds)

    for depth, bases in bases_by_depth.items():
        for t in depth_thresholds:
            if depth >= t:
                bases_within_threshs[t] += bases
    for t in depth_thresholds:
        bs = bases_within_threshs[t]
        if total_size > 0:
            rate = 1.0 * bases_within_threshs[t] / total_size
            if rate > 1:
                critical('Error: rate is > 1: rate = ' + str(rate) +
                         ', bases = ' + str(bs) + ', size = ' +
                         str(total_size))
            rates_within_threshs[t] = rate

    return bases_within_threshs, rates_within_threshs
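A tiny worked example of the cumulative threshold logic above: 6 bases at depth 1, 3 at depth 5, and 1 at depth 30 over a 10 bp target give >=1x/>=5x/>=30x counts of 10, 4 and 1:

bases_by_depth = {1: 6, 5: 3, 30: 1}  # depth -> number of bases at that depth
total_size = 10
depth_thresholds = [1, 5, 30]

bases_within = {t: sum(b for d, b in bases_by_depth.items() if d >= t)
                for t in depth_thresholds}
rates_within = {t: bases_within[t] / total_size for t in depth_thresholds}
print(bases_within)  # {1: 10, 5: 4, 30: 1}
print(rates_within)  # {1: 1.0, 5: 0.4, 30: 0.1}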
Example #17
def read_biomart(genome_name):
    features_by_ens_id = dict()
    bm_fpath = ebl.biomart_fpath(genome_name)
    if not verify_file(bm_fpath):
        warn('Warning: biomart file for genome ' + genome_name + ' not found, skip using the TSL values')
        return dict()
    
    with open(bm_fpath) as f:
        for r in csv.DictReader(f, delimiter='\t'):
            features_by_ens_id[r['Transcript ID']] = r
    
    # hg38 version has TSL, checking if we can populate some TSL from it
    if not genome_name.startswith('hg38'):
        bm_fpath = ebl.biomart_fpath('hg38')
        if not verify_file(bm_fpath):
            critical('Biomart file for hg38 not found, but it is needed for TSL values')
        with open(bm_fpath) as f:
            for r in csv.DictReader(f, delimiter='\t'):
                if r['Transcript ID'] not in features_by_ens_id:
                    features_by_ens_id[r['Transcript ID']] = r
                else:
                    features_by_ens_id[r['Transcript ID']]['Transcript Support Level (TSL)'] = r[
                        'Transcript Support Level (TSL)']
    return features_by_ens_id
Example #18
def safe_mkdir(dirpath, descriptive_name=''):
    if isdir(dirpath):
        return dirpath

    if not dirpath:
        critical(descriptive_name + ' path is empty.')

    if isfile(dirpath):
        critical(descriptive_name + ' ' + dirpath + ' is a file.')

    num_tries = 0
    max_tries = 10

    while not exists(dirpath):
        # we could get an error here if multiple processes are creating
        # the directory at the same time. Grr, concurrency.
        try:
            os.makedirs(dirpath)
        except OSError as e:
            if num_tries > max_tries:
                raise
            num_tries += 1
            time.sleep(2)
    return dirpath
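On Python 3, the retry loop can usually be replaced by the built-in race-safe form; a minimal sketch:

import os

# exist_ok=True makes concurrent creation of the same directory a no-op
# instead of raising, which is the race the retry loop above works around.
os.makedirs('/tmp/some/nested/dir', exist_ok=True)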
Example #19
def find_executable():
    executable = which('qualimap')
    if not executable:
        critical('Error: "qualimap" executable is not found in PATH')
    return executable
Example #20
def get_executable():
    sys_path = which('sambamba')
    if not sys_path:
        critical('Error: sambamba executable is not found')
    return sys_path
Example #21
def _log(msg, silent, is_critical):
    if is_critical:
        critical(msg)
    if not silent:
        warn(msg)
Example #22
def start_targqc(
    work_dir,
    output_dir,
    samples,
    target_bed_fpath,
    parallel_cfg,
    bwa_prefix,
    fai_fpath=None,
    genome=config.genome,
    depth_threshs=config.depth_thresholds,
    downsample_to=config.downsample_fraction,
    padding=config.padding,
    dedup=config.dedup,
    num_pairs_by_sample=None,
    reannotate=config.reannotate,
):
    d = get_description()
    info('*' * len(d))
    info(d)
    info('*' * len(d))
    info()

    fai_fpath = fai_fpath or ref.get_fai(genome)
    target = Target(work_dir,
                    output_dir,
                    fai_fpath,
                    padding=padding,
                    bed_fpath=target_bed_fpath,
                    reannotate=reannotate,
                    genome=genome,
                    is_debug=logger.is_debug)

    fastq_samples = [
        s for s in samples if not s.bam and s.l_fpath and s.r_fpath
    ]
    from targqc.utilz.parallel import parallel_view
    if fastq_samples:
        if not bwa_prefix:
            critical('--bwa-prefix is required when running from fastq')
        with parallel_view(len(fastq_samples), parallel_cfg,
                           join(work_dir, 'sge_fastq')) as view:
            num_pairs_by_sample = proc_fastq(fastq_samples,
                                             view,
                                             work_dir,
                                             bwa_prefix,
                                             downsample_to,
                                             num_pairs_by_sample,
                                             dedup=dedup)

    info()
    for s in samples:
        if s.bam:
            info(s.name + ': using alignment ' + s.bam)

    with parallel_view(len(samples), parallel_cfg, join(work_dir,
                                                        'sge_bam')) as view:
        info('Sorting BAMs...')
        sorted_bams = view.run(
            sort_bam,
            [[s.bam, safe_mkdir(join(work_dir, s.name))] for s in samples])
        for s, sorted_bam in zip(samples, sorted_bams):
            s.bam = sorted_bam

        if all(can_reuse(s.bam + '.bai', s.bam) for s in samples):
            debug('BAM indexes exist')
        else:
            info('Indexing BAMs...')
            view.run(index_bam, [[s.bam] for s in samples])

        info('Making general reports...')
        make_general_reports(view,
                             samples,
                             target,
                             genome,
                             depth_threshs,
                             padding,
                             num_pairs_by_sample,
                             is_debug=logger.is_debug,
                             reannotate=reannotate,
                             fai_fpath=fai_fpath)

    info()
    info('*' * 70)
    tsv_fpath, html_fpath = make_tarqc_html_report(output_dir,
                                                   work_dir,
                                                   samples,
                                                   bed_fpath=target_bed_fpath)
    info('TargQC summary saved in: ')
    info('  ' + html_fpath)
    info('  ' + tsv_fpath)

    info()
    with parallel_view(len(samples), parallel_cfg, join(work_dir,
                                                        'sge_bam')) as view:
        info('Making region-level reports...')
        make_region_reports(view, work_dir, samples, target, genome,
                            depth_threshs)

    info()
    info('*' * 70)
    tsv_region_rep_fpath = combined_regional_reports(work_dir, output_dir,
                                                     samples)

    info()
    info('*' * 70)
    info('TargQC summary saved in: ')
    info('  ' + html_fpath)
    info('  ' + tsv_fpath)
    info('Per-region coverage statistics saved into:')
    info('  ' + tsv_region_rep_fpath)

    return html_fpath
Example #23
def proc_fastq(samples, parall_view, work_dir, bwa_prefix, downsample_to, num_pairs_by_sample=None, dedup=True):
    num_pairs_by_sample = num_pairs_by_sample or dict()
    if downsample_to:
        # Read pairs counts
        debug()
        if all(s.name in num_pairs_by_sample for s in samples):
            debug('Using read pairs counts extracted from FastQC reports')
        elif all(can_reuse(make_pair_counts_fpath(join(work_dir, s.name)), s.l_fpath) for s in samples):
            debug('Reusing pairs counts, reading from files')
            num_pairs_by_sample = {s.name: int(open(make_pair_counts_fpath(join(work_dir, s.name))).read().strip()) for s in samples}
        else:
            info('Counting read pairs')
            num_pairs = parall_view.run(count_read_pairs, [[s.name, safe_mkdir(join(work_dir, s.name)), s.l_fpath] for s in samples])
            num_pairs_by_sample = {s.name: pairs_count for s, pairs_count in zip(samples, num_pairs)}

        # Downsampling
        debug()
        if all(can_reuse(make_downsampled_fpath(join(work_dir, s.name), s.l_fpath), s.l_fpath) and
               can_reuse(make_downsampled_fpath(join(work_dir, s.name), s.r_fpath), s.r_fpath) for s in samples):
            debug('Reusing downsampled FastQ')
            for s in samples:
                s.l_fpath = make_downsampled_fpath(join(work_dir, s.name), s.l_fpath)
                s.r_fpath = make_downsampled_fpath(join(work_dir, s.name), s.r_fpath)
        else:
            if isinstance(downsample_to, float):
                info('Downsampling FastQ to ' + str(float(downsample_to)) + ' fraction of reads')
            else:
                info('Downsampling FastQ to ' + str(int(downsample_to)) + ' read pairs')
            fastq_pairs = parall_view.run(downsample,
                [[join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, downsample_to, num_pairs_by_sample.get(s.name)]
                 for s in samples])
            for s, (l_r, r_r) in zip(samples, fastq_pairs):
                s.l_fpath = l_r
                s.r_fpath = r_r
    else:
        info('Skipping downsampling')

    debug()
    if all(can_reuse(make_bam_fpath(join(work_dir, s.name)), [s.l_fpath, s.r_fpath]) for s in samples):
        debug('All downsampled BAM exists, reusing')
        for s in samples:
            s.bam = make_bam_fpath(join(work_dir, s.name))
    else:
        bwa = which('bwa')
        if not bwa or not isfile(bwa):
            critical('BWA executable not found in PATH')
        smb = sambamba.get_executable()
        if not (bwa and smb):
            if not bwa:
                err('Error: bwa is required for the alignment pipeline')
            if not smb:
                err('Error: sambamba is required for the alignment pipeline')
            critical('Tools required for alignment not found')
        info('Aligning reads to the reference')
        bam_fpaths = parall_view.run(align,
            [[join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, bwa, smb, bwa_prefix, dedup, parall_view.cores_per_job]
             for s in samples])

        bam_fpaths = [verify_bam(b) for b in bam_fpaths]
        if len(bam_fpaths) < len(samples):
            critical('Some samples were not aligned successfully.')
        for bam, s in zip(bam_fpaths, samples):
            s.bam = bam

    return num_pairs_by_sample
Example #24
def _annotate(bed, ref_bed, chr_order, fai_fpath, work_dir, ori_col_num,
              high_confidence=False, reannotate=False, is_debug=False, **kwargs):
    # if genome:
        # genome_fpath = cut(fai_fpath, 2, output_fpath=intermediate_fname(work_dir, fai_fpath, 'cut2'))
        # intersection = bed.intersect(ref_bed, sorted=True, wao=True, g='<(cut -f1,2 ' + fai_fpath + ')')
        # intersection = bed.intersect(ref_bed, sorted=True, wao=True, genome=genome.split('-')[0])
    # else:

    intersection_bed = None
    intersection_fpath = None
    
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    if is_debug:
        intersection_fpath = join(work_dir, 'intersection.bed')
        if isfile(intersection_fpath):
            info('Loading from ' + intersection_fpath)
            intersection_bed = BedTool(intersection_fpath)
    if not intersection_bed:
        if count_bed_cols(fai_fpath) == 2:
            debug('Fai fields size is 2 ' + fai_fpath)
            intersection_bed = bed.intersect(ref_bed, wao=True, g=fai_fpath)
        else:
            debug('Fai fields is ' + str(count_bed_cols(fai_fpath)) + ', not 2')
            intersection_bed = bed.intersect(ref_bed, wao=True)
    if is_debug and not isfile(intersection_fpath):
        intersection_bed.saveas(intersection_fpath)
        debug('Saved intersection to ' + intersection_fpath)

    total_annotated = 0
    total_uniq_annotated = 0
    total_off_target = 0

    met = set()

    overlaps_by_tx_by_gene_by_loc = OrderedDefaultDict(lambda: OrderedDefaultDict(lambda: defaultdict(list)))
    # off_targets = list()

    expected_fields_num = ori_col_num + len(ebl.BedCols.cols[:-4]) + 1
    for i, intersection_fields in enumerate(intersection_bed):
        inters_fields_list = list(intersection_fields)
        if len(inters_fields_list) < expected_fields_num:
            critical(f'Cannot parse the reference BED file - unexpected number of fields '
                     f'({len(inters_fields_list)} in {inters_fields_list}, '
                     f'less than {expected_fields_num})')

        a_chr, a_start, a_end = intersection_fields[:3]
        a_extra_columns = intersection_fields[3:ori_col_num]

        overlap_fields = [None for _ in ebl.BedCols.cols]

        overlap_fields[:len(intersection_fields[ori_col_num:-1])] = intersection_fields[ori_col_num:-1]
        keep_gene_column = not reannotate
        a_gene = None
        if keep_gene_column:
            a_gene = a_extra_columns[0]

        e_chr = overlap_fields[0]
        overlap_size = int(intersection_fields[-1])
        assert e_chr == '.' or a_chr == e_chr, f'Error on line {i}: chromosomes don\'t match ({a_chr} vs {e_chr}). Line: {intersection_fields}'

        # fs = [None for _ in ebl.BedCols.cols]
        # fs[:3] = [a_chr, a_start, a_end]
        reg = (a_chr, int(a_start), int(a_end), tuple(a_extra_columns))

        if e_chr == '.':
            total_off_target += 1
            # off_targets.append(fs)
            overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)

        else:
            # fs[3:-1] = db_feature_fields[3:-1]
            total_annotated += 1
            if (a_chr, a_start, a_end) not in met:
                total_uniq_annotated += 1
                met.add((a_chr, a_start, a_end))

            e_gene = overlap_fields[ebl.BedCols.GENE]
            if keep_gene_column and e_gene != a_gene:
                overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
            else:
                transcript_id = overlap_fields[ebl.BedCols.ENSEMBL_ID]
                overlaps_by_tx_by_gene_by_loc[reg][e_gene][transcript_id].append((overlap_fields, overlap_size))

    info('  Total annotated regions: ' + str(total_annotated))
    info('  Total unique annotated regions: ' + str(total_uniq_annotated))
    info('  Total off target regions: ' + str(total_off_target))
    info('Resolving ambiguities...')
    annotated = _resolve_ambiguities(overlaps_by_tx_by_gene_by_loc, chr_order, **kwargs)

    return annotated
Example #25
def determine_sex(work_dir, bam_fpath, ave_depth, genome, target_bed=None):
    info()
    info('Determining sex')

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = male_bed.count()
    info('Male region total size: ' + str(male_area_size))

    if target_bed:
        male_bed = BedTool(target_bed).intersect(male_bed).merge()
        target_male_area_size = male_bed.count()
        if target_male_area_size < male_area_size * MALE_TARGET_REGIONS_FACTOR:
            info('Target male region total size is ' + str(target_male_area_size) + ', which is less than the ' +
                 'checked male regions size * ' + str(MALE_TARGET_REGIONS_FACTOR) +
                 ' (' + str(male_area_size * MALE_TARGET_REGIONS_FACTOR) + ') - cannot determine sex')
            return None
        else:
            info('Target male region total size is ' + str(target_male_area_size) + ', which is higher than the ' +
                 'checked male regions size * ' + str(MALE_TARGET_REGIONS_FACTOR) +
                 ' (' + str(male_area_size * MALE_TARGET_REGIONS_FACTOR) + '). ' +
                 'Determining sex based on coverage in those regions.')
    else:
        info('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_cov_output_fpath = sambamba_depth(work_dir, male_bed, bam_fpath, [])
    chry_mean_coverage = get_mean_cov(chry_cov_output_fpath)
    info('Y key regions average depth: ' + str(chry_mean_coverage))
    ave_depth = float(ave_depth)
    info('Sample average depth: ' + str(ave_depth))
    if ave_depth < AVE_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        info('Sample average depth is too low (less than ' + str(AVE_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
             ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        info('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = ave_depth / chry_mean_coverage
        info('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            info('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            info('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + ' times higher than Y depth - it\'s male')
            sex = 'M'
    info('Sex is ' + sex)
    info()
    return sex
Example #26
def proc_fastq(samples,
               parall_view,
               work_dir,
               bwa_prefix,
               downsample_to,
               num_pairs_by_sample=None,
               dedup=True):
    num_pairs_by_sample = num_pairs_by_sample or dict()
    if downsample_to:
        # Read pairs counts
        debug()
        if all(s.name in num_pairs_by_sample for s in samples):
            debug('Using read pairs counts extracted from FastQC reports')
        elif all(
                can_reuse(make_pair_counts_fpath(join(work_dir, s.name)),
                          s.l_fpath) for s in samples):
            debug('Reusing pairs counts, reading from files')
            num_pairs_by_sample = {
                s.name: int(
                    open(make_pair_counts_fpath(join(work_dir,
                                                     s.name))).read().strip())
                for s in samples
            }
        else:
            info('Counting read pairs')
            num_pairs = parall_view.run(
                count_read_pairs,
                [[s.name,
                  safe_mkdir(join(work_dir, s.name)), s.l_fpath]
                 for s in samples])
            num_pairs_by_sample = {
                s.name: pairs_count
                for s, pairs_count in zip(samples, num_pairs)
            }

        # Downsampling
        debug()
        if all(
                can_reuse(
                    make_downsampled_fpath(join(work_dir, s.name), s.l_fpath),
                    s.l_fpath) and can_reuse(
                        make_downsampled_fpath(join(work_dir, s.name),
                                               s.r_fpath), s.r_fpath)
                for s in samples):
            debug('Reusing downsampled FastQ')
            for s in samples:
                s.l_fpath = make_downsampled_fpath(join(work_dir, s.name),
                                                   s.l_fpath)
                s.r_fpath = make_downsampled_fpath(join(work_dir, s.name),
                                                   s.r_fpath)
        else:
            if isinstance(downsample_to, float):
                info('Downsampling FastQ to ' + str(float(downsample_to)) +
                     ' fraction of reads')
            else:
                info('Downsampling FastQ to ' + str(int(downsample_to)) +
                     ' read pairs')
            fastq_pairs = parall_view.run(downsample, [[
                join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath,
                downsample_to,
                num_pairs_by_sample.get(s.name)
            ] for s in samples])
            for s, (l_r, r_r) in zip(samples, fastq_pairs):
                s.l_fpath = l_r
                s.r_fpath = r_r
    else:
        info('Skipping downsampling')

    debug()
    if all(
            can_reuse(make_bam_fpath(join(work_dir, s.name)),
                      [s.l_fpath, s.r_fpath]) for s in samples):
        debug('All downsampled BAM exists, reusing')
        for s in samples:
            s.bam = make_bam_fpath(join(work_dir, s.name))
    else:
        bwa = which('bwa')
        if not bwa or not isfile(bwa):
            critical('BWA executable not found in PATH')
        smb = sambamba.get_executable()
        if not (bwa and smb):
            if not bwa:
                err('Error: bwa is required for the alignment pipeline')
            if not smb:
                err('Error: sambamba is required for the alignment pipeline')
            critical('Tools required for alignment not found')
        info('Aligning reads to the reference')
        bam_fpaths = parall_view.run(align, [[
            join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, bwa, smb,
            bwa_prefix, dedup, parall_view.cores_per_job
        ] for s in samples])

        bam_fpaths = [verify_bam(b) for b in bam_fpaths]
        if len(bam_fpaths) < len(samples):
            critical('Some samples were not aligned successfully.')
        for bam, s in zip(bam_fpaths, samples):
            s.bam = bam

    return num_pairs_by_sample
Example #27
def check_genome(genome):
    if genome not in SUPPORTED_GENOMES:
        critical('Genome ' + str(genome) +
                 ' is not supported. Supported genomes: ' +
                 ', '.join(SUPPORTED_GENOMES))
Example #28
def _annotate(bed,
              ref_bed,
              chr_order,
              fai_fpath,
              work_dir,
              ori_col_num,
              high_confidence=False,
              reannotate=False,
              is_debug=False,
              **kwargs):
    # if genome:
    # genome_fpath = cut(fai_fpath, 2, output_fpath=intermediate_fname(work_dir, fai_fpath, 'cut2'))
    # intersection = bed.intersect(ref_bed, sorted=True, wao=True, g='<(cut -f1,2 ' + fai_fpath + ')')
    # intersection = bed.intersect(ref_bed, sorted=True, wao=True, genome=genome.split('-')[0])
    # else:

    intersection_bed = None
    intersection_fpath = None

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    if is_debug:
        intersection_fpath = join(work_dir, 'intersection.bed')
        if isfile(intersection_fpath):
            info('Loading from ' + intersection_fpath)
            intersection_bed = BedTool(intersection_fpath)
    if not intersection_bed:
        if count_bed_cols(fai_fpath) == 2:
            debug('Fai fields size is 2 ' + fai_fpath)
            intersection_bed = bed.intersect(ref_bed, wao=True, g=fai_fpath)
        else:
            debug('Fai fields is ' + str(count_bed_cols(fai_fpath)) +
                  ', not 2')
            intersection_bed = bed.intersect(ref_bed, wao=True)
    if is_debug and not isfile(intersection_fpath):
        intersection_bed.saveas(intersection_fpath)
        debug('Saved intersection to ' + intersection_fpath)

    total_annotated = 0
    total_uniq_annotated = 0
    total_off_target = 0

    met = set()

    overlaps_by_tx_by_gene_by_loc = OrderedDefaultDict(
        lambda: OrderedDefaultDict(lambda: defaultdict(list)))
    # off_targets = list()

    expected_fields_num = ori_col_num + len(ebl.BedCols.cols[:-4]) + 1
    for i, intersection_fields in enumerate(intersection_bed):
        inters_fields_list = list(intersection_fields)
        if len(inters_fields_list) < expected_fields_num:
            critical(
                f'Cannot parse the reference BED file - unexpected number of fields '
                f'({len(inters_fields_list)} in {inters_fields_list}, '
                f'less than {expected_fields_num})')

        a_chr, a_start, a_end = intersection_fields[:3]
        a_extra_columns = intersection_fields[3:ori_col_num]

        overlap_fields = [None for _ in ebl.BedCols.cols]

        overlap_fields[:len(intersection_fields[ori_col_num:-1]
                            )] = intersection_fields[ori_col_num:-1]
        keep_gene_column = not reannotate
        a_gene = None
        if keep_gene_column:
            a_gene = a_extra_columns[0]

        e_chr = overlap_fields[0]
        overlap_size = int(intersection_fields[-1])
        assert e_chr == '.' or a_chr == e_chr, f'Error on line {i}: chromosomes don\'t match ({a_chr} vs {e_chr}). Line: {intersection_fields}'

        # fs = [None for _ in ebl.BedCols.cols]
        # fs[:3] = [a_chr, a_start, a_end]
        reg = (a_chr, int(a_start), int(a_end), tuple(a_extra_columns))

        if e_chr == '.':
            total_off_target += 1
            # off_targets.append(fs)
            overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(
                list)

        else:
            # fs[3:-1] = db_feature_fields[3:-1]
            total_annotated += 1
            if (a_chr, a_start, a_end) not in met:
                total_uniq_annotated += 1
                met.add((a_chr, a_start, a_end))

            e_gene = overlap_fields[ebl.BedCols.GENE]
            if keep_gene_column and e_gene != a_gene:
                overlaps_by_tx_by_gene_by_loc[reg][
                    a_gene] = OrderedDefaultDict(list)
            else:
                transcript_id = overlap_fields[ebl.BedCols.ENSEMBL_ID]
                overlaps_by_tx_by_gene_by_loc[reg][e_gene][
                    transcript_id].append((overlap_fields, overlap_size))

    info('  Total annotated regions: ' + str(total_annotated))
    info('  Total unique annotated regions: ' + str(total_uniq_annotated))
    info('  Total off target regions: ' + str(total_off_target))
    info('Resolving ambiguities...')
    annotated = _resolve_ambiguities(overlaps_by_tx_by_gene_by_loc, chr_order,
                                     **kwargs)

    return annotated
Example #29
def annotate(input_bed_fpath,
             output_fpath,
             work_dir,
             genome=None,
             reannotate=True,
             high_confidence=False,
             only_canonical=False,
             coding_only=False,
             short=False,
             extended=False,
             is_debug=False,
             **kwargs):

    debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' +
                 ', '.join(ebl.SUPPORTED_GENOMES))

    if genome:
        fai_fpath = reference_data.get_fai(genome)
        chr_order = reference_data.get_chrom_order(genome)
    else:
        fai_fpath = None
        chr_order = bed_chrom_order(input_bed_fpath)

    input_bed_fpath = sort_bed(input_bed_fpath,
                               work_dir=work_dir,
                               chr_order=chr_order,
                               genome=genome)

    ori_bed = BedTool(input_bed_fpath)
    ori_col_num = ori_bed.field_count()
    reannotate = reannotate or ori_col_num == 3
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    ori_bed = BedTool(input_bed_fpath)
    # if reannotate:
    #     bed = BedTool(input_bed_fpath).cut([0, 1, 2])
    #     keep_gene_column = False
    # else:
    #     if col_num > 4:
    #         bed = BedTool(input_bed_fpath).cut([0, 1, 2, 3])
    #     keep_gene_column = True

    # features_bed = features_bed.saveas()
    # cols = features_bed.field_count()
    # if cols < 12:
    #     features_bed = features_bed.each(lambda f: f + ['.']*(12-cols))
    if high_confidence:
        features_bed = features_bed.filter(ebl.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(
            ebl.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ebl.protein_coding_filter)
    # unique_tx_by_gene = find_best_tx_by_gene(features_bed)

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x: x[
        ebl.BedCols.FEATURE] in ['exon', 'CDS', 'stop_codon', 'transcript'])
    # x[ebl.BedCols.ENSEMBL_ID] == unique_tx_by_gene[x[ebl.BedCols.GENE]])

    info('Overlapping regions with Ensembl data')
    if is_debug:
        ori_bed = ori_bed.saveas(join(work_dir, 'bed.bed'))
        debug(f'Saved regions to {ori_bed.fn}')
        features_bed = features_bed.saveas(join(work_dir, 'features.bed'))
        debug(f'Saved features to {features_bed.fn}')
    annotated = _annotate(ori_bed,
                          features_bed,
                          chr_order,
                          fai_fpath,
                          work_dir,
                          ori_col_num,
                          high_confidence=False,
                          reannotate=reannotate,
                          is_debug=is_debug,
                          **kwargs)

    full_header = [ebl.BedCols.names[i] for i in ebl.BedCols.cols]
    add_ori_extra_fields = ori_col_num > 3
    if not reannotate and ori_col_num == 4:
        add_ori_extra_fields = False  # no need to report the original gene field if we are not re-annotating

    info('Saving annotated regions...')
    total = 0
    with file_transaction(work_dir, output_fpath) as tx:
        with open(tx, 'w') as out:
            header = full_header[:6]
            if short:
                header = full_header[:4]
            if extended:
                header = full_header[:-1]
            if add_ori_extra_fields:
                header.append(full_header[-1])

            if extended:
                out.write(
                    '## ' +
                    ebl.BedCols.names[ebl.BedCols.TX_OVERLAP_PERCENTAGE] +
                    ': part of region overlapping with transcripts\n')
                out.write(
                    '## ' +
                    ebl.BedCols.names[ebl.BedCols.EXON_OVERLAPS_PERCENTAGE] +
                    ': part of region overlapping with exons\n')
                out.write(
                    '## ' +
                    ebl.BedCols.names[ebl.BedCols.CDS_OVERLAPS_PERCENTAGE] +
                    ': part of region overlapping with protein coding regions\n'
                )
                out.write('\t'.join(header) + '\n')
            for full_fields in annotated:
                fields = full_fields[:6]
                if short:
                    fields = full_fields[:4]
                if extended:
                    fields = full_fields[:-1]
                if add_ori_extra_fields:
                    fields.append(full_fields[-1])

                out.write('\t'.join(map(_format_field, fields)) + '\n')
                total += 1

    debug('Saved ' + str(total) + ' total annotated regions')
    return output_fpath
Example #30
def annotate(input_bed_fpath, output_fpath, work_dir, genome=None,
             reannotate=True, high_confidence=False, only_canonical=False,
             coding_only=False, short=False, extended=False, is_debug=False, **kwargs):

    debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ebl.SUPPORTED_GENOMES))

    if genome:
        fai_fpath = reference_data.get_fai(genome)
        chr_order = reference_data.get_chrom_order(genome)
    else:
        fai_fpath = None
        chr_order = bed_chrom_order(input_bed_fpath)

    input_bed_fpath = sort_bed(input_bed_fpath, work_dir=work_dir, chr_order=chr_order, genome=genome)

    ori_bed = BedTool(input_bed_fpath)
    ori_col_num = ori_bed.field_count()
    reannotate = reannotate or ori_col_num == 3
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    ori_bed = BedTool(input_bed_fpath)
    # if reannotate:
    #     bed = BedTool(input_bed_fpath).cut([0, 1, 2])
    #     keep_gene_column = False
    # else:
    #     if col_num > 4:
    #         bed = BedTool(input_bed_fpath).cut([0, 1, 2, 3])
    #     keep_gene_column = True

    # features_bed = features_bed.saveas()
    # cols = features_bed.field_count()
    # if cols < 12:
    #     features_bed = features_bed.each(lambda f: f + ['.']*(12-cols))
    if high_confidence:
        features_bed = features_bed.filter(ebl.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ebl.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ebl.protein_coding_filter)
    # unique_tx_by_gene = find_best_tx_by_gene(features_bed)

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x:
        x[ebl.BedCols.FEATURE] in ['exon', 'CDS', 'stop_codon', 'transcript'])
        # x[ebl.BedCols.ENSEMBL_ID] == unique_tx_by_gene[x[ebl.BedCols.GENE]])

    info('Overlapping regions with Ensembl data')
    if is_debug:
        ori_bed = ori_bed.saveas(join(work_dir, 'bed.bed'))
        debug(f'Saved regions to {ori_bed.fn}')
        features_bed = features_bed.saveas(join(work_dir, 'features.bed'))
        debug(f'Saved features to {features_bed.fn}')
    annotated = _annotate(ori_bed, features_bed, chr_order, fai_fpath, work_dir, ori_col_num,
                          high_confidence=False, reannotate=reannotate, is_debug=is_debug, **kwargs)

    full_header = [ebl.BedCols.names[i] for i in ebl.BedCols.cols]
    add_ori_extra_fields = ori_col_num > 3
    if not reannotate and ori_col_num == 4:
        add_ori_extra_fields = False  # no need to report the original gene field if we are not re-annotating

    info('Saving annotated regions...')
    total = 0
    with file_transaction(work_dir, output_fpath) as tx:
        with open(tx, 'w') as out:
            header = full_header[:6]
            if short:
                header = full_header[:4]
            if extended:
                header = full_header[:-1]
            if add_ori_extra_fields:
                header.append(full_header[-1])

            if extended:
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.TX_OVERLAP_PERCENTAGE] +
                          ': part of region overlapping with transcripts\n')
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.EXON_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with exons\n')
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.CDS_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with protein coding regions\n')
                out.write('\t'.join(header) + '\n')
            for full_fields in annotated:
                fields = full_fields[:6]
                if short:
                    fields = full_fields[:4]
                if extended:
                    fields = full_fields[:-1]
                if add_ori_extra_fields:
                    fields.append(full_fields[-1])

                out.write('\t'.join(map(_format_field, fields)) + '\n')
                total += 1
    
    debug('Saved ' + str(total) + ' total annotated regions')
    return output_fpath
Example #31
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome +
             ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug(
                'The male non-PAR region does not overlap with the capture target - cannot determine sex.'
            )
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info(
        'Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.'
    )
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' +
              str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
              ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        debug('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' +
                  str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            debug('Sample depth is not more than ' +
                  str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s male')
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
Example #32
def parse_qualimap_results(sample):
    if not verify_file(sample.qualimap_html_fpath):
        critical('QualiMap report was not found')

    qualimap_value_by_metric = report_parser.parse_qualimap_sample_report(sample.qualimap_html_fpath)
    bases_by_depth, median_depth = parse_qualimap_coverage_hist(sample.qualimap_cov_hist_fpath)
    median_gc, median_human_gc = parse_qualimap_gc_content(sample.qualimap_gc_hist_fpath)
    median_ins_size = parse_qualimap_insert_size(sample.qualimap_ins_size_hist_fpath)

    def find_rec(name, percent=False, on_target=True):
        if on_target:
            name_on_target = name + ' (on target)'
            if percent:
                name_on_target += ' %'
            res = qualimap_value_by_metric.get(name_on_target)
            if res:
                return res
        if percent:
            name += ' %'
        return qualimap_value_by_metric.get(name)

    depth_stats = dict(
        ave_depth       = find_rec('Coverage Mean'),
        stddev_depth    = find_rec('Coverage Standard Deviation'),
        median_depth    = median_depth,
        bases_by_depth  = bases_by_depth
    )
    target_stats = dict(
        reference_size  = find_rec('Reference size'),
        target_size     = find_rec('Regions size/percentage of reference'),
        target_fraction = find_rec('Regions size/percentage of reference', percent=True),
    )
    reads_stats = dict(
        total                    = find_rec('Number of reads'),
        mapped                   = find_rec('Mapped reads', on_target=False),
        mapped_rate              = find_rec('Mapped reads', percent=True, on_target=False),
        unmapped                 = find_rec('Unmapped reads'),
        unmapped_rate            = find_rec('Unmapped reads', percent=True),
        mapped_on_target         = find_rec('Mapped reads (on target)'),
        mapped_rate_on_target    = find_rec('Mapped reads (on target)', percent=True),
        mapped_paired            = find_rec('Mapped paired reads', on_target=False),
        mapped_paired_rate       = find_rec('Mapped paired reads', percent=True, on_target=False),
        paired                   = find_rec('Paired reads'),
        paired_rate              = find_rec('Paired reads', percent=True),
        dup                      = find_rec('Duplicated reads (flagged)'),
        dup_rate                 = find_rec('Duplicated reads (flagged)', percent=True),
        min_len                  = find_rec('Read min length'),
        max_len                  = find_rec('Read max length'),
        ave_len                  = find_rec('Read mean length'),
        median_gc                = median_gc,
        median_human_gc          = median_human_gc,
        median_ins_size          = median_ins_size,
    )
    indels_stats = dict(
        mean_mq     = find_rec('Mean Mapping Quality'),
        mismatches  = find_rec('Mismatches'),
        insertions  = find_rec('Insertions'),
        deletions   = find_rec('Deletions'),
        homo_indels = find_rec('Homopolymer indels'),
    )
    return depth_stats, reads_stats, indels_stats, target_stats
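The nested find_rec helper prefers the '(on target)' variant of a metric and falls back to the plain name; a self-contained sketch of that lookup against a made-up metrics dict:

qualimap_value_by_metric = {
    'Mapped reads': 1000,
    'Mapped reads (on target)': 800,
    'Number of reads': 1200,
}

def find_rec(name, percent=False, on_target=True):
    if on_target:
        key = name + ' (on target)' + (' %' if percent else '')
        res = qualimap_value_by_metric.get(key)
        if res:
            return res
    return qualimap_value_by_metric.get(name + (' %' if percent else ''))

print(find_rec('Mapped reads'))     # 800 (on-target variant wins)
print(find_rec('Number of reads'))  # 1200 (falls back to the plain name)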