def get_chrom_lengths(genome=None, fai_fpath=None):
    """Return a list of (chromosome_name, length) tuples.

    Lengths are read either from a samtools .fai index, or — when the path
    ends with .fa — by parsing the genome sequence itself with Biopython.

    :param genome: genome build name (used to locate the .fai when no
        explicit path is given)
    :param fai_fpath: path to a .fai index or a .fa genome sequence
    """
    assert genome or fai_fpath, 'One of genome or fai_fpath should be not None: ' \
        'genome=' + str(genome) + ' fai_fpath=' + str(fai_fpath)

    if not fai_fpath:
        check_genome(genome)
        fai_fpath = get_fai(genome)
    else:
        fai_fpath = verify_file(fai_fpath, is_critical=True)
        if not fai_fpath.endswith(('.fai', '.fa')):
            critical('Error: .fai or .fa is accepted.')

    chr_lengths = []
    if fai_fpath.endswith('.fa'):
        debug('Reading genome sequence (.fa) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            from Bio import SeqIO
            for record in SeqIO.parse(handle, 'fasta'):
                chr_lengths.append((record.id, len(record.seq)))
    else:
        debug('Reading genome index file (.fai) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            for line in handle:
                fields = line.strip().split()
                if fields:
                    # .fai columns: name, length, offset, linebases, linewidth
                    chr_lengths.append((fields[0], int(fields[1])))
    return chr_lengths
def sort_bed_gsort(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, genome=None):
    """Sort a BED file with the external `gsort` tool, using the chromosome
    order defined by a .fai index (given directly or derived from a genome
    build name). Reuses an existing up-to-date output when possible.
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    # resolve the .fai that gives gsort its chromosome ordering
    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        critical('Either of fai_fpath or genome build name must be specified')

    with file_transaction(work_dir, output_bed_fpath) as tx:
        run('gsort {input_bed_fpath} {fai_fpath}'.format(**locals()), output_fpath=tx)
    return output_bed_fpath
def read_samples(args):
    """Split command-line inputs into FastQ pairs and BAMs, both keyed by sample name.

    Returns (fastqs_by_sample, bam_by_sample); aborts if no valid inputs, or
    if a sample has both BAM and FastQ inputs.
    """
    bam_by_sample = find_bams(args)
    if bam_by_sample:
        info('Found ' + str(len(bam_by_sample)) + ' BAM file' +
             ('s' if len(bam_by_sample) > 1 else ''))

    # everything not recognized as a BAM is a candidate FastQ input
    input_not_bam = []
    for fpath in args:
        if adjust_path(fpath) not in bam_by_sample:
            verified = verify_file(fpath)
            if verified:
                input_not_bam.append(verified)

    fastqs_by_sample = dict()
    if not input_not_bam and not bam_by_sample:
        critical('No correct input files')
    if input_not_bam:
        info('Input ' + str(len(input_not_bam)) + ' correct input non-BAM files')
        fastqs_by_sample = find_fastq_pairs(input_not_bam)
        if fastqs_by_sample:
            info('Found ' + str(len(fastqs_by_sample)) + ' FastQ pairs')

    intersection = set(fastqs_by_sample.keys()) & set(bam_by_sample.keys())
    if intersection:
        critical('The following samples both had input BAMs and FastQ: ' + ', '.join(list(intersection)))
    return fastqs_by_sample, bam_by_sample
def tx_tmpdir(base_dir, rollback_dirpath):
    """Context manager to create and remove a transactional temporary directory.

    A unique temporary directory is created under ``base_dir`` and yielded;
    on exit (normal or exceptional) it is renamed to ``rollback_dirpath``,
    so the final directory only appears once the work inside is finished.

    :param base_dir: directory under which the temporary directory is created
    :param rollback_dirpath: final destination path; must not already exist
    """
    # (removed a large slab of commented-out legacy implementation here)
    if exists(rollback_dirpath):
        critical(rollback_dirpath + ' already exists')

    tmp_dir = tempfile.mkdtemp(dir=base_dir)
    safe_mkdir(tmp_dir)  # no-op safety net: mkdtemp has already created it
    try:
        yield tmp_dir
    finally:
        # publish the result atomically under the rollback path
        if tmp_dir and exists(tmp_dir):
            os.rename(tmp_dir, rollback_dirpath)
def main():
    """CLI entry point: extract canonical CDS regions for a genome build
    from the stored Ensembl features and save them as a 6-column BED file.
    """
    options = [
        (['-g', '--genome'], dict(
            dest='genome',
            help='Genome build. Accepted values: ' + ', '.join(ebl.SUPPORTED_GENOMES),
        )),
    ]
    parser = OptionParser()
    for opt_args, opt_kwargs in options:
        parser.add_option(*opt_args, **opt_kwargs)
    opts, args = parser.parse_args()

    if not opts.genome:
        critical('Error: please, specify genome build name with -g (e.g. `-g hg19`)')
    genome = opts.genome

    debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ebl.SUPPORTED_GENOMES))

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x: x[ebl.BedCols.FEATURE] == 'CDS')
    features_bed = features_bed.filter(ebl.get_only_canonical_filter(genome))

    info('Saving CDS regions...')
    output_fpath = adjust_path(join(dirname(__file__), pardir, genome, 'bed', 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        features_bed.cut(range(6)).saveas(tx)
    info('Done, saved to ' + output_fpath)
def _get(relative_path, genome=None):
    """Resolve a repository-relative data file.

    :param relative_path: relative path of the file inside the repository;
        may contain a ``{genome}`` placeholder
    :param genome: genome name; may carry a chromosome suffix after a dash
        (e.g. hg19-chr20), in which case a returned BedTool is filtered to
        that chromosome
    :return: a BedTool for .bed/.bed.gz files, otherwise the file path
    """
    chrom = None
    if genome:
        if '-chr' in genome:
            genome, chrom = genome.split('-')
        check_genome(genome)
        relative_path = relative_path.format(genome=genome)

    path = abspath(join(dirname(__file__), relative_path))
    if not isfile(path) and isfile(path + '.gz'):
        path += '.gz'

    if not (path.endswith('.bed') or path.endswith('.bed.gz')):
        return path

    if path.endswith('.bed.gz'):
        # pybedtools needs bedtools on PATH to read gzipped BEDs
        bedtools = which('bedtools')
        if not bedtools:
            critical('bedtools not found in PATH: ' + str(os.environ['PATH']))
        debug('BED is compressed, creating BedTool')
        bed = BedTool(path)
    else:
        debug('BED is uncompressed, creating BedTool')
        bed = BedTool(path)

    if chrom:
        debug('Filtering BEDTool for chrom ' + chrom)
        bed = bed.filter(lambda r: r.chrom == chrom)
    return bed
def read_biomart(genome_name):
    """Load Biomart annotations keyed by transcript ID.

    Only the hg38 Biomart export carries TSL values, so for other genomes the
    hg38 file is read as well to borrow 'Transcript Support Level (TSL)'.
    Returns an empty dict when the genome's Biomart file is missing.
    """
    features_by_ens_id = dict()

    bm_fpath = ebl.biomart_fpath(genome_name)
    if not verify_file(bm_fpath):
        warn('Warning: biomart file for genome ' + genome_name + ' not found, skip using the TSL values')
        return dict()
    with open(bm_fpath) as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            features_by_ens_id[row['Transcript ID']] = row

    # hg38 version has TSL; pull those values in for other genomes
    if not genome_name.startswith('hg38'):
        bm_fpath = ebl.biomart_fpath('hg38')
        if not verify_file(bm_fpath):
            critical('Biomart for hg38 file not found, and needed for TSL values')
        with open(bm_fpath) as f:
            reader = csv.DictReader(f, delimiter='\t')
            for row in reader:
                known = features_by_ens_id.get(row['Transcript ID'])
                if known is None:
                    features_by_ens_id[row['Transcript ID']] = row
                else:
                    known['Transcript Support Level (TSL)'] = row['Transcript Support Level (TSL)']
    return features_by_ens_id
def find_bams(args):
    """Pull BAM inputs out of the argument list.

    Each arg has the form "<path>[,<sample_name>]"; recognized BAM args are
    removed from ``args`` in place. Returns an OrderedDict mapping sample
    name -> verified BAM path; aborts if any .bam argument fails verification.
    """
    bam_by_sample = OrderedDict()
    bad_bam_fpaths = []
    good_args = []
    for arg in args:
        parts = arg.split(',')
        fpath = parts[0]
        fname, ext = splitext(fpath)
        if ext != '.bam':
            continue
        bam_fpath = verify_bam(fpath)
        if not bam_fpath:
            bad_bam_fpaths.append(fpath)
            continue
        # explicit sample name after the comma, else derive from the file name
        sname = parts[1] if len(parts) > 1 else basename(splitext(bam_fpath)[0])
        bam_by_sample[sname] = bam_fpath
        good_args.append(arg)

    if bad_bam_fpaths:
        critical('BAM files cannot be found, empty or not BAMs: ' + ', '.join(bad_bam_fpaths))
    for arg in good_args:
        args.remove(arg)
    return bam_by_sample
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    """Guess sample sex ('M'/'F') from coverage of chrY key (non-PAR) regions.

    Compares mean coverage of the male key regions (optionally intersected
    with the capture target) against the sample's average depth; a ratio
    above FEMALE_Y_COVERAGE_FACTOR means female.

    Returns 'M', 'F', or None when sex cannot be determined (no key regions
    for the genome, no target overlap, or average depth too low).
    """
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        # restrict the key regions to what the capture target actually covers
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' +
              str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) + ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        # fixed malformed escape: 'it\s' -> "it's"
        debug("Y depth is 0 - it's female")
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:
            # mean target coverage much higher than chrY coverage
            debug("Sample depth is more than " + str(FEMALE_Y_COVERAGE_FACTOR) +
                  " times higher than Y depth - it's female")
            sex = 'F'
        else:
            debug("Sample depth is not more than " + str(FEMALE_Y_COVERAGE_FACTOR) +
                  " times higher than Y depth - it's male")
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
def load_yaml_config(fpath):
    """Parse a bcbio YAML config file and return it as a dict.

    Aborts via critical() when the file is missing or cannot be parsed.
    """
    verify_file(fpath, is_critical=True)
    try:
        config_dict = load_yaml(open(fpath))
    except Exception:
        err(format_exc())
        critical('Could not parse bcbio YAML ' + fpath)
    else:
        return config_dict
def sort_bed(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, chr_order=None, genome=None):
    """Sort a BED file by chromosome order (taken from chr_order, a .fai
    index, or a genome build name), then by start/end coordinates.
    Header lines starting with '#' are written through unchanged.
    Reuses an existing up-to-date output when possible.
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    regions = []

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical('Either of chr_order, fai_fpath, or genome build name must be specified')
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    with open(input_bed_fpath) as f:
        with file_transaction(work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for line in f:
                    stripped = line.strip()
                    if not stripped:
                        continue
                    if stripped.startswith('#'):
                        out.write(line)
                        continue
                    chrom, start, end, *other_fields = stripped.split('\t')
                    # unknown chromosomes sort first (order -1)
                    regions.append(Region(chrom, int(start), int(end),
                                          other_fields, chr_order.get(chrom, -1)))

                for region in sorted(regions, key=lambda r: r.get_key()):
                    row = [region.chrom, str(region.start), str(region.end)]
                    row.extend(region.other_fields)
                    out.write('\t'.join(row) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' + output_bed_fpath)
    return output_bed_fpath
def run_qualimap(work_dir, output_dir, output_fpaths, bam_fpath, genome, bed_fpath=None, threads=1):
    """Run QualiMap bamqc on a BAM; if QualiMap reports the BAM unsorted,
    sort it and rerun once. Reuses existing up-to-date outputs; aborts if
    any expected output is missing afterwards.
    """
    info('Analysing ' + bam_fpath)
    safe_mkdir(dirname(output_dir))
    safe_mkdir(output_dir)

    mem_m = get_qualimap_max_mem(bam_fpath)
    mem = str(int(mem_m)) + 'M'
    mem_cmdl = '--java-mem-size=' + mem

    cmdline = (find_executable() +
               ' bamqc --skip-duplicated -nt {threads} {mem_cmdl} -nr 5000 '
               '-bam {bam_fpath} -outdir {output_dir}')
    if genome.startswith('hg') or genome.startswith('GRCh'):
        cmdline += ' -gd HUMAN'
    if genome.startswith('mm'):
        cmdline += ' -gd MOUSE'
    if bed_fpath:
        cmdline += ' -gff {bed_fpath}'
        debug('Using amplicons/capture panel ' + bed_fpath)
    cmdline = cmdline.format(**locals())

    cmp_fpaths = [bam_fpath, bed_fpath] if bed_fpath else [bam_fpath]
    if not all(can_reuse(fp, cmp_fpaths) for fp in output_fpaths):
        # clear stale outputs so a failed run can't look like success
        for fp in output_fpaths:
            if isfile(fp):
                os.remove(fp)
        try:
            run(cmdline, env_vars=dict(DISPLAY=None))
        except subprocess.CalledProcessError as e:
            if 'The alignment file is unsorted.' in e.output:
                info()
                info('BAM file is unsorted; trying to sort and rerun QualiMap')
                sorted_bam_fpath = sort_bam(bam_fpath)
                cmdline = cmdline.replace(bam_fpath, sorted_bam_fpath)
                run(cmdline, env_vars=dict(DISPLAY=None))

    if not all(verify_file(fp, cmp_f=cmp_fpaths) for fp in output_fpaths):
        critical('Some of the QualiMap results were not generated')

    return output_dir
def find_fastq_pairs(fpaths):
    """Group FastQ file paths into (left, right) pairs keyed by sample name.

    Pair side is detected from the _1/_R1/_2/_R2 file-name suffix; an
    Illumina _S<d> lane suffix is stripped and dashes become underscores.
    Samples missing one mate are reported and dropped from the result.
    """
    info('Finding FastQ pairs...')
    fastqs_by_sample_name = dict()
    for fpath in fpaths:
        fn, ext = splitext_plus(basename(fpath))
        if ext not in ['.fq', '.fq.gz', '.fastq', '.fastq.gz']:
            continue
        sname, l_fpath, r_fpath = None, None, None
        if fn.endswith('_1'):
            sname, l_fpath = fn[:-2], fpath
        elif fn.endswith('_R1'):
            sname, l_fpath = fn[:-3], fpath
        elif fn.endswith('_2'):
            sname, r_fpath = fn[:-2], fpath
        elif fn.endswith('_R2'):
            sname, r_fpath = fn[:-3], fpath

        if sname:
            m = re.match(r'(.*)_S\d+', sname)
            if m:
                sname = m.group(1)
            sname = sname.replace('-', '_')
        else:
            sname = fn
            info('Cannot detect file for ' + sname)

        l, r = fastqs_by_sample_name.get(sname, (None, None))
        if l and l_fpath:
            critical('Duplicated left FastQ files for ' + sname + ': ' + l + ' and ' + l_fpath)
        if r and r_fpath:
            critical('Duplicated right FastQ files for ' + sname + ': ' + r + ' and ' + r_fpath)
        fastqs_by_sample_name[sname] = l or l_fpath, r or r_fpath

    fixed_fastqs_by_sample_name = dict()
    for sname, (l, r) in fastqs_by_sample_name.items():
        if not l:
            err('ERROR: for sample ' + sname + ', left reads not found')
        if not r:
            err('ERROR: for sample ' + sname + ', right reads not found')
        if l and r:
            fixed_fastqs_by_sample_name[sname] = l, r
    return fixed_fastqs_by_sample_name
def annotate_target(work_dir, target_bed, genome_build):
    """Annotate a target BED with gene information by invoking the external
    annotate_bed.py script, then clean the result. Reuses an up-to-date
    existing output when possible.
    """
    output_fpath = intermediate_fname(work_dir, target_bed, 'ann')
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    annotate_bed_py = which('annotate_bed.py')
    if not annotate_bed_py:
        critical('Error: annotate_bed.py not found in PATH, please install TargQC.')

    cmdline = '{annotate_bed_py} {target_bed} -g {genome_build} -o {output_fpath}'.format(**locals())
    run(cmdline, output_fpath, stdout_to_outputfile=False)
    return clean_bed(output_fpath, work_dir)
def run_qualimap(work_dir, output_dir, output_fpaths, bam_fpath, genome, bed_fpath=None, threads=1):
    """Run QualiMap bamqc for a BAM file, sorting and retrying once when
    QualiMap complains the alignment file is unsorted. Existing up-to-date
    outputs are reused; missing outputs afterwards are a fatal error.
    """
    info('Analysing ' + bam_fpath)
    safe_mkdir(dirname(output_dir))
    safe_mkdir(output_dir)

    mem_cmdl = '--java-mem-size=' + str(int(get_qualimap_max_mem(bam_fpath))) + 'M'

    cmdline = (find_executable() +
               ' bamqc --skip-duplicated -nt {threads} {mem_cmdl} -nr 5000 '
               '-bam {bam_fpath} -outdir {output_dir}')
    if genome.startswith(('hg', 'GRCh')):
        cmdline += ' -gd HUMAN'
    if genome.startswith('mm'):
        cmdline += ' -gd MOUSE'
    if bed_fpath:
        cmdline += ' -gff {bed_fpath}'
        debug('Using amplicons/capture panel ' + bed_fpath)
    cmdline = cmdline.format(**locals())

    dependencies = [bam_fpath, bed_fpath] if bed_fpath else [bam_fpath]
    if not all(can_reuse(fp, dependencies) for fp in output_fpaths):
        # remove any stale partial outputs before (re)running
        for fp in output_fpaths:
            if isfile(fp):
                os.remove(fp)
        try:
            run(cmdline, env_vars=dict(DISPLAY=None))
        except subprocess.CalledProcessError as e:
            if 'The alignment file is unsorted.' in e.output:
                info()
                info('BAM file is unsorted; trying to sort and rerun QualiMap')
                cmdline = cmdline.replace(bam_fpath, sort_bam(bam_fpath))
                run(cmdline, env_vars=dict(DISPLAY=None))

    if not all(verify_file(fp, cmp_f=dependencies) for fp in output_fpaths):
        critical('Some of the QualiMap results were not generated')
    return output_dir
def calc_bases_within_threshs(bases_by_depth, total_size, depth_thresholds):
    """For each depth threshold, count bases covered at or above it, and the
    corresponding fraction of ``total_size``.

    :param bases_by_depth: mapping depth -> number of bases at that depth
    :param total_size: total region size in bases; when 0, rates stay None
    :param depth_thresholds: iterable of depth cutoffs
    :return: (bases_within_threshs, rates_within_threshs), both OrderedDicts
        keyed by threshold
    """
    bases_within_threshs = OrderedDict((t, 0) for t in depth_thresholds)
    rates_within_threshs = OrderedDict((t, None) for t in depth_thresholds)

    for t in depth_thresholds:
        bases_within_threshs[t] = sum(
            bases for depth, bases in bases_by_depth.items() if depth >= t)

    for t in depth_thresholds:
        bs = bases_within_threshs[t]
        if total_size > 0:
            rate = 1.0 * bs / total_size
            if rate > 1:
                # a rate above 1 means inconsistent inputs — fail loudly
                critical('Error: rate is > 1: rate = ' + str(rate) +
                         ', bases = ' + str(bs) + ', size = ' + str(total_size))
            rates_within_threshs[t] = rate
    return bases_within_threshs, rates_within_threshs
def read_biomart(genome_name):
    """Read the Biomart TSV for a genome into a dict keyed by 'Transcript ID'.

    Since TSL values are only present in the hg38 export, non-hg38 genomes
    additionally read the hg38 file to backfill the
    'Transcript Support Level (TSL)' column. Missing genome file -> {}.
    """
    features_by_ens_id = dict()
    bm_fpath = ebl.biomart_fpath(genome_name)
    if not verify_file(bm_fpath):
        warn('Warning: biomart file for genome ' + genome_name + ' not found, skip using the TSL values')
        return dict()

    with open(bm_fpath) as f:
        for rec in csv.DictReader(f, delimiter='\t'):
            features_by_ens_id[rec['Transcript ID']] = rec

    # hg38 version has TSL, checking if we can populate some TSL from it
    if not genome_name.startswith('hg38'):
        bm_fpath = ebl.biomart_fpath('hg38')
        if not verify_file(bm_fpath):
            critical('Biomart for hg38 file not found, and needed for TSL values')
        with open(bm_fpath) as f:
            for rec in csv.DictReader(f, delimiter='\t'):
                tx_id = rec['Transcript ID']
                if tx_id not in features_by_ens_id:
                    features_by_ens_id[tx_id] = rec
                else:
                    features_by_ens_id[tx_id]['Transcript Support Level (TSL)'] = \
                        rec['Transcript Support Level (TSL)']
    return features_by_ens_id
def safe_mkdir(dirpath, descriptive_name=''):
    """Create ``dirpath`` (with parents) if it does not exist, and return it.

    Tolerates concurrent creation by other processes: on OSError it retries
    up to 10 times with a 2-second pause, re-raising once the budget is spent.
    Empty paths and paths pointing at files are fatal errors.
    """
    if isdir(dirpath):
        return dirpath
    if not dirpath:
        critical(descriptive_name + ' path is empty.')
    if isfile(dirpath):
        critical(descriptive_name + ' ' + dirpath + ' is a file.')

    num_tries, max_tries = 0, 10
    while not exists(dirpath):
        # multiple processes may race to create the same directory
        try:
            os.makedirs(dirpath)
        except OSError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            time.sleep(2)
    return dirpath
def find_executable():
    """Locate the `qualimap` binary on PATH, reporting a fatal error if absent."""
    exe_fpath = which('qualimap')
    if not exe_fpath:
        critical('Error: "qualimap" executable is not found in PATH')
    return exe_fpath
def get_executable():
    """Locate the `sambamba` binary on PATH, reporting a fatal error if absent."""
    exe_fpath = which('sambamba')
    if not exe_fpath:
        critical('Error: sambamba executable is not found')
    return exe_fpath
def _log(msg, silent, is_critical): if is_critical: critical(msg) if not silent: warn(msg)
def start_targqc(
        work_dir, output_dir, samples, target_bed_fpath, parallel_cfg, bwa_prefix,
        fai_fpath=None,
        genome=config.genome,
        depth_threshs=config.depth_thresholds,
        downsample_to=config.downsample_fraction,
        padding=config.padding,
        dedup=config.dedup,
        num_pairs_by_sample=None,
        reannotate=config.reannotate,
):
    """Main TargQC pipeline driver.

    Aligns FastQ samples (optionally downsampled), sorts and indexes BAMs,
    builds general and region-level coverage reports for the target, and
    writes combined TSV/HTML summaries into ``output_dir``.

    :param samples: sample objects carrying .name, .bam, .l_fpath, .r_fpath
    :param target_bed_fpath: capture target BED (None implies WGS handling
        downstream — TODO confirm against Target implementation)
    :param bwa_prefix: BWA index prefix; required when any sample starts
        from FastQ
    :return: path to the generated HTML summary report
    """
    d = get_description()
    info('*' * len(d))
    info(d)
    info('*' * len(d))
    info()
    # fall back to the genome build's .fai when no index path was given
    fai_fpath = fai_fpath or ref.get_fai(genome)
    target = Target(work_dir, output_dir, fai_fpath, padding=padding, bed_fpath=target_bed_fpath,
                    reannotate=reannotate, genome=genome, is_debug=logger.is_debug)
    # samples with no BAM but with both FastQ mates need alignment first
    fastq_samples = [s for s in samples if not s.bam and s.l_fpath and s.r_fpath]
    from targqc.utilz.parallel import parallel_view
    if fastq_samples:
        if not bwa_prefix:
            critical('--bwa-prefix is required when running from fastq')
        with parallel_view(len(fastq_samples), parallel_cfg, join(work_dir, 'sge_fastq')) as view:
            # proc_fastq downsamples (if requested), aligns, and sets s.bam
            num_pairs_by_sample = proc_fastq(fastq_samples, view, work_dir, bwa_prefix,
                                             downsample_to, num_pairs_by_sample, dedup=dedup)
    info()
    for s in samples:
        if s.bam:
            info(s.name + ': using alignment ' + s.bam)
    with parallel_view(len(samples), parallel_cfg, join(work_dir, 'sge_bam')) as view:
        info('Sorting BAMs...')
        sorted_bams = view.run(sort_bam, [[s.bam, safe_mkdir(join(work_dir, s.name))] for s in samples])
        for s, sorted_bam in zip(samples, sorted_bams):
            s.bam = sorted_bam
        if all(can_reuse(s.bam + '.bai', s.bam) for s in samples):
            debug('BAM indexes exists')
        else:
            info('Indexing BAMs...')
            view.run(index_bam, [[s.bam] for s in samples])
        info('Making general reports...')
        make_general_reports(view, samples, target, genome, depth_threshs, padding, num_pairs_by_sample,
                             is_debug=logger.is_debug, reannotate=reannotate, fai_fpath=fai_fpath)
    info()
    info('*' * 70)
    tsv_fpath, html_fpath = make_tarqc_html_report(output_dir, work_dir, samples, bed_fpath=target_bed_fpath)
    info('TargQC summary saved in: ')
    info(' ' + html_fpath)
    info(' ' + tsv_fpath)
    info()
    # second parallel pass: per-region coverage statistics
    with parallel_view(len(samples), parallel_cfg, join(work_dir, 'sge_bam')) as view:
        info('Making region-level reports...')
        make_region_reports(view, work_dir, samples, target, genome, depth_threshs)
    info()
    info('*' * 70)
    tsv_region_rep_fpath = combined_regional_reports(work_dir, output_dir, samples)
    info()
    info('*' * 70)
    info('TargQC summary saved in: ')
    info(' ' + html_fpath)
    info(' ' + tsv_fpath)
    info('Per-region coverage statistics saved into:')
    info(' ' + tsv_region_rep_fpath)
    return html_fpath
def proc_fastq(samples, parall_view, work_dir, bwa_prefix, downsample_to, num_pairs_by_sample=None, dedup=True):
    """Prepare FastQ samples for QC: optionally downsample, then align to BAM.

    When ``downsample_to`` is set, read-pair counts are obtained (from the
    provided dict, cached count files, or by counting), and FastQs are
    downsampled to a fraction (float) or absolute pair count (int).
    Each sample's .l_fpath/.r_fpath/.bam attributes are updated in place.

    :param parall_view: parallel executor with .run() and .cores_per_job
    :return: dict sample name -> read pair count (empty when not counted)
    """
    num_pairs_by_sample = num_pairs_by_sample or dict()
    if downsample_to:
        # Read pairs counts
        debug()
        if all(s.name in num_pairs_by_sample for s in samples):
            debug('Using read pairs counts extracted from FastQC reports')
        elif all(can_reuse(make_pair_counts_fpath(join(work_dir, s.name)), s.l_fpath) for s in samples):
            debug('Reusing pairs counts, reading from files')
            num_pairs_by_sample = {
                s.name: int(open(make_pair_counts_fpath(join(work_dir, s.name))).read().strip())
                for s in samples}
        else:
            info('Counting read pairs')
            num_pairs = parall_view.run(
                count_read_pairs,
                [[s.name, safe_mkdir(join(work_dir, s.name)), s.l_fpath] for s in samples])
            num_pairs_by_sample = {s.name: pairs_count for s, pairs_count in zip(samples, num_pairs)}

        # Downsampling
        debug()
        if all(can_reuse(make_downsampled_fpath(join(work_dir, s.name), s.l_fpath), s.l_fpath) and
               can_reuse(make_downsampled_fpath(join(work_dir, s.name), s.r_fpath), s.r_fpath)
               for s in samples):
            debug('Reusing downsampled FastQ')
            for s in samples:
                s.l_fpath = make_downsampled_fpath(join(work_dir, s.name), s.l_fpath)
                s.r_fpath = make_downsampled_fpath(join(work_dir, s.name), s.r_fpath)
        else:
            # float -> fraction of reads, int -> absolute number of pairs
            if isinstance(downsample_to, float):
                info('Downsampling FastQ to ' + str(float(downsample_to)) + ' fraction of reads')
            else:
                info('Downsampling FastQ to ' + str(int(downsample_to)) + ' read pairs')
            fastq_pairs = parall_view.run(
                downsample,
                [[join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, downsample_to,
                  num_pairs_by_sample.get(s.name)] for s in samples])
            for s, (l_r, r_r) in zip(samples, fastq_pairs):
                s.l_fpath = l_r
                s.r_fpath = r_r
    else:
        info('Skipping downsampling')

    debug()
    if all(can_reuse(make_bam_fpath(join(work_dir, s.name)), [s.l_fpath, s.r_fpath]) for s in samples):
        debug('All downsampled BAM exists, reusing')
        for s in samples:
            s.bam = make_bam_fpath(join(work_dir, s.name))
    else:
        bwa = which('bwa')
        # fix: `which` returns None when bwa is absent, and isfile(None)
        # raised TypeError instead of the intended critical() message
        if not bwa or not isfile(bwa):
            critical('BWA not found under ' + str(bwa))
        smb = sambamba.get_executable()
        if not (bwa and smb):
            if not bwa:
                err('Error: bwa is required for the alignment pipeline')
            if not smb:
                err('Error: sambamba is required for the alignment pipeline')
            critical('Tools required for alignment not found')
        info('Aligning reads to the reference')
        bam_fpaths = parall_view.run(
            align,
            [[join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, bwa, smb, bwa_prefix,
              dedup, parall_view.cores_per_job] for s in samples])

        bam_fpaths = [verify_bam(b) for b in bam_fpaths]
        if len(bam_fpaths) < len(samples):
            critical('Some samples were not aligned successfully.')
        for bam, s in zip(bam_fpaths, samples):
            s.bam = bam

    return num_pairs_by_sample
def _annotate(bed, ref_bed, chr_order, fai_fpath, work_dir, ori_col_num,
              high_confidence=False, reannotate=False, is_debug=False, **kwargs):
    """Intersect a target BED with a reference features BED and collect
    overlaps per (location, gene, transcript), then resolve ambiguities.

    :param ori_col_num: number of columns in the original (target) BED
    :param high_confidence: unused here; kept for interface compatibility
    :param reannotate: when True, ignore any gene names already present in
        the target BED
    :param is_debug: cache the raw intersection in work_dir for inspection
    :return: result of _resolve_ambiguities() over the collected overlaps
    """
    intersection_bed = None
    intersection_fpath = None

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    if is_debug:
        intersection_fpath = join(work_dir, 'intersection.bed')
        if isfile(intersection_fpath):
            info('Loading from ' + intersection_fpath)
            intersection_bed = BedTool(intersection_fpath)
    if not intersection_bed:
        # a 2-column fai can serve as the -g genome file for bedtools
        if count_bed_cols(fai_fpath) == 2:
            debug('Fai fields size is 2 ' + fai_fpath)
            intersection_bed = bed.intersect(ref_bed, wao=True, g=fai_fpath)
        else:
            debug('Fai fields is ' + str(count_bed_cols(fai_fpath)) + ', not 2')
            intersection_bed = bed.intersect(ref_bed, wao=True)
    if is_debug and not isfile(intersection_fpath):
        intersection_bed.saveas(intersection_fpath)
        debug('Saved intersection to ' + intersection_fpath)

    total_annotated = 0
    total_uniq_annotated = 0
    total_off_target = 0
    met = set()

    overlaps_by_tx_by_gene_by_loc = OrderedDefaultDict(
        lambda: OrderedDefaultDict(lambda: defaultdict(list)))

    expected_fields_num = ori_col_num + len(ebl.BedCols.cols[:-4]) + 1
    for i, intersection_fields in enumerate(intersection_bed):
        inters_fields_list = list(intersection_fields)
        if len(inters_fields_list) < expected_fields_num:
            # fixed: message was a half-f-string with unbalanced braces, so
            # the placeholders were never interpolated
            critical(f'Cannot parse the reference BED file - unexpected number of fields '
                     f'({len(inters_fields_list)} in {inters_fields_list}, '
                     f'less than {expected_fields_num})')

        a_chr, a_start, a_end = intersection_fields[:3]
        a_extra_columns = intersection_fields[3:ori_col_num]

        overlap_fields = [None for _ in ebl.BedCols.cols]
        overlap_fields[:len(intersection_fields[ori_col_num:-1])] = intersection_fields[ori_col_num:-1]
        keep_gene_column = not reannotate
        a_gene = None
        if keep_gene_column:
            a_gene = a_extra_columns[0]
        e_chr = overlap_fields[0]
        overlap_size = int(intersection_fields[-1])
        assert e_chr == '.' or a_chr == e_chr, \
            f'Error on line {i}: chromosomes don\'t match ({a_chr} vs {e_chr}). Line: {intersection_fields}'

        reg = (a_chr, int(a_start), int(a_end), tuple(a_extra_columns))
        if e_chr == '.':
            # '.' chromosome from -wao means no feature overlapped this region
            total_off_target += 1
            overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
        else:
            total_annotated += 1
            if (a_chr, a_start, a_end) not in met:
                total_uniq_annotated += 1
                met.add((a_chr, a_start, a_end))
            e_gene = overlap_fields[ebl.BedCols.GENE]
            if keep_gene_column and e_gene != a_gene:
                # reference gene disagrees with the pre-existing annotation
                overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
            else:
                transcript_id = overlap_fields[ebl.BedCols.ENSEMBL_ID]
                overlaps_by_tx_by_gene_by_loc[reg][e_gene][transcript_id].append(
                    (overlap_fields, overlap_size))

    info(' Total annotated regions: ' + str(total_annotated))
    info(' Total unique annotated regions: ' + str(total_uniq_annotated))
    info(' Total off target regions: ' + str(total_off_target))
    info('Resolving ambiguities...')
    annotated = _resolve_ambiguities(overlaps_by_tx_by_gene_by_loc, chr_order, **kwargs)
    return annotated
def determine_sex(work_dir, bam_fpath, ave_depth, genome, target_bed=None):
    """Guess sample sex ('M'/'F') by comparing chrY key-region coverage
    against the sample's average depth (via sambamba depth).

    When a capture target is given, the key regions are first intersected
    with it, and sex is only called when enough of them remain
    (MALE_TARGET_REGIONS_FACTOR of the full key-region count).

    Returns 'M', 'F', or None when sex cannot be determined.
    """
    info()
    info('Determining sex')

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = male_bed.count()
    info('Male region total size: ' + str(male_area_size))

    if target_bed:
        male_bed = BedTool(target_bed).intersect(male_bed).merge()
        target_male_area_size = male_bed.count()
        if target_male_area_size < male_area_size * MALE_TARGET_REGIONS_FACTOR:
            info('Target male region total size is ' + str(target_male_area_size) +
                 ', which is less than the checked male regions size * ' +
                 str(MALE_TARGET_REGIONS_FACTOR) +
                 ' (' + str(male_area_size * MALE_TARGET_REGIONS_FACTOR) +
                 ') - cannot determine sex')
            return None
        else:
            info('Target male region total size is ' + str(target_male_area_size) +
                 ', which is higher than the checked male regions size * ' +
                 str(MALE_TARGET_REGIONS_FACTOR) +
                 ' (' + str(male_area_size * MALE_TARGET_REGIONS_FACTOR) +
                 '). Determining sex based on coverage in those regions.')
    else:
        info('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_cov_output_fpath = sambamba_depth(work_dir, male_bed, bam_fpath, [])
    chry_mean_coverage = get_mean_cov(chry_cov_output_fpath)
    info('Y key regions average depth: ' + str(chry_mean_coverage))
    ave_depth = float(ave_depth)
    info('Sample average depth: ' + str(ave_depth))
    if ave_depth < AVE_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        # fixed typo: 'less then' -> 'less than'
        info('Sample average depth is too low (less than ' +
             str(AVE_DEPTH_THRESHOLD_TO_DETERMINE_SEX) + ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        # fixed malformed escape: 'it\s' -> "it's"
        info("Y depth is 0 - it's female")
        sex = 'F'
    else:
        factor = ave_depth / chry_mean_coverage
        info('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:
            # mean target coverage much higher than chrY coverage
            info("Sample depth is more than " + str(FEMALE_Y_COVERAGE_FACTOR) +
                 " times higher than Y depth - it's female")
            sex = 'F'
        else:
            info("Sample depth is not more than " + str(FEMALE_Y_COVERAGE_FACTOR) +
                 " times higher than Y depth - it's male")
            sex = 'M'
    info('Sex is ' + sex)
    info()
    return sex
def proc_fastq(samples, parall_view, work_dir, bwa_prefix, downsample_to, num_pairs_by_sample=None, dedup=True):
    """Prepare FastQ samples: optionally downsample, then align to BAM.

    Updates each sample's .l_fpath/.r_fpath/.bam in place; returns a dict
    sample name -> read pair count (empty when counting was not needed).
    ``downsample_to`` is a fraction (float) or an absolute pair count (int).
    """
    num_pairs_by_sample = num_pairs_by_sample or dict()
    if downsample_to:
        # Read pairs counts
        debug()
        if all(s.name in num_pairs_by_sample for s in samples):
            debug('Using read pairs counts extracted from FastQC reports')
        elif all(
                can_reuse(make_pair_counts_fpath(join(work_dir, s.name)),
                          s.l_fpath) for s in samples):
            debug('Reusing pairs counts, reading from files')
            num_pairs_by_sample = {
                s.name: int(
                    open(make_pair_counts_fpath(join(work_dir,
                                                     s.name))).read().strip())
                for s in samples
            }
        else:
            info('Counting read pairs')
            num_pairs = parall_view.run(
                count_read_pairs,
                [[s.name, safe_mkdir(join(work_dir, s.name)), s.l_fpath]
                 for s in samples])
            num_pairs_by_sample = {
                s.name: pairs_count
                for s, pairs_count in zip(samples, num_pairs)
            }

        # Downsampling
        debug()
        if all(
                can_reuse(
                    make_downsampled_fpath(join(work_dir, s.name), s.l_fpath),
                    s.l_fpath) and can_reuse(
                        make_downsampled_fpath(join(work_dir, s.name),
                                               s.r_fpath), s.r_fpath)
                for s in samples):
            debug('Reusing downsampled FastQ')
            for s in samples:
                s.l_fpath = make_downsampled_fpath(join(work_dir, s.name),
                                                   s.l_fpath)
                s.r_fpath = make_downsampled_fpath(join(work_dir, s.name),
                                                   s.r_fpath)
        else:
            # float means a fraction of reads, int an absolute pair count
            if isinstance(downsample_to, float):
                info('Downsampling FastQ to ' + str(float(downsample_to)) +
                     ' fraction of reads')
            else:
                info('Downsampling FastQ to ' + str(int(downsample_to)) +
                     ' read pairs')
            fastq_pairs = parall_view.run(downsample, [[
                join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath,
                downsample_to,
                num_pairs_by_sample.get(s.name)
            ] for s in samples])
            for s, (l_r, r_r) in zip(samples, fastq_pairs):
                s.l_fpath = l_r
                s.r_fpath = r_r
    else:
        info('Skipping downsampling')

    debug()
    if all(
            can_reuse(make_bam_fpath(join(work_dir, s.name)),
                      [s.l_fpath, s.r_fpath]) for s in samples):
        debug('All downsampled BAM exists, reusing')
        for s in samples:
            s.bam = make_bam_fpath(join(work_dir, s.name))
    else:
        bwa = which('bwa')
        # NOTE(review): if which('bwa') returns None, isfile(None) raises
        # TypeError before critical() can report the problem — verify
        if not isfile(bwa):
            critical('BWA not found under ' + bwa)
        smb = sambamba.get_executable()
        if not (bwa and smb):
            if not bwa:
                err('Error: bwa is required for the alignment pipeline')
            if not smb:
                err('Error: sambamba is required for the alignment pipeline')
            critical('Tools required for alignment not found')
        info('Aligning reads to the reference')
        bam_fpaths = parall_view.run(align, [[
            join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, bwa, smb,
            bwa_prefix, dedup, parall_view.cores_per_job
        ] for s in samples])

        bam_fpaths = [verify_bam(b) for b in bam_fpaths]
        if len(bam_fpaths) < len(samples):
            critical('Some samples were not aligned successfully.')
        for bam, s in zip(bam_fpaths, samples):
            s.bam = bam

    return num_pairs_by_sample
def check_genome(genome):
    """Abort via critical() when the genome build is not a supported one."""
    if genome in SUPPORTED_GENOMES:
        return
    critical('Genome ' + str(genome) + ' is not supported. Supported genomes: ' +
             ', '.join(SUPPORTED_GENOMES))
def _annotate(bed, ref_bed, chr_order, fai_fpath, work_dir, ori_col_num,
              high_confidence=False, reannotate=False, is_debug=False, **kwargs):
    """Intersect *bed* with the reference features *ref_bed* and group the
    overlaps by region, gene and transcript, then resolve ambiguities.

    :param bed: BedTool with the regions to annotate
    :param ref_bed: BedTool with Ensembl feature records
    :param chr_order: chromosome ordering, passed through to ambiguity resolution
    :param fai_fpath: .fai path; used as the bedtools genome file when it has
        exactly 2 columns
    :param work_dir: working directory (bedtools temp files, debug dumps)
    :param ori_col_num: number of columns in the original input BED
    :param high_confidence: accepted for interface compatibility; not used here
    :param reannotate: when False, the 4th input column is kept as the gene name
        and overlaps with a different gene are discarded
    :param is_debug: cache/reuse the raw intersection in work_dir
    :return: result of _resolve_ambiguities() over the collected overlaps
    """
    intersection_bed = None
    intersection_fpath = None

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    if is_debug:
        # Reuse a previously saved intersection to speed up repeated debug runs.
        intersection_fpath = join(work_dir, 'intersection.bed')
        if isfile(intersection_fpath):
            info('Loading from ' + intersection_fpath)
            intersection_bed = BedTool(intersection_fpath)
    if not intersection_bed:
        if count_bed_cols(fai_fpath) == 2:
            debug('Fai fields size is 2 ' + fai_fpath)
            intersection_bed = bed.intersect(ref_bed, wao=True, g=fai_fpath)
        else:
            debug('Fai fields is ' + str(count_bed_cols(fai_fpath)) + ', not 2')
            intersection_bed = bed.intersect(ref_bed, wao=True)
    if is_debug and not isfile(intersection_fpath):
        intersection_bed.saveas(intersection_fpath)
        debug('Saved intersection to ' + intersection_fpath)

    total_annotated = 0
    total_uniq_annotated = 0
    total_off_target = 0
    met = set()  # (chrom, start, end) triples already counted as annotated

    overlaps_by_tx_by_gene_by_loc = OrderedDefaultDict(
        lambda: OrderedDefaultDict(lambda: defaultdict(list)))

    # Original columns + feature columns (minus the 4 trailing computed ones)
    # + the -wao overlap-size column.
    expected_fields_num = ori_col_num + len(ebl.BedCols.cols[:-4]) + 1
    for i, intersection_fields in enumerate(intersection_bed):
        inters_fields_list = list(intersection_fields)
        if len(inters_fields_list) < expected_fields_num:
            # BUGFIX: two of the three fragments of this message lacked the
            # f-prefix and had unbalanced braces, so the placeholders were
            # emitted literally instead of being interpolated.
            critical(
                f'Cannot parse the reference BED file - unexpected number of fields '
                f'({len(inters_fields_list)} in {inters_fields_list}, '
                f'less than {expected_fields_num})')

        a_chr, a_start, a_end = intersection_fields[:3]
        a_extra_columns = intersection_fields[3:ori_col_num]

        # Feature fields from the -wao output, padded to the full column set.
        overlap_fields = [None for _ in ebl.BedCols.cols]
        overlap_fields[:len(intersection_fields[ori_col_num:-1])] = \
            intersection_fields[ori_col_num:-1]

        keep_gene_column = not reannotate
        a_gene = None
        if keep_gene_column:
            a_gene = a_extra_columns[0]

        e_chr = overlap_fields[0]
        overlap_size = int(intersection_fields[-1])
        assert e_chr == '.' or a_chr == e_chr, \
            f'Error on line {i}: chromosomes don\'t match ({a_chr} vs {e_chr}). Line: {intersection_fields}'

        reg = (a_chr, int(a_start), int(a_end), tuple(a_extra_columns))
        if e_chr == '.':
            # -wao reports '.' when the region overlaps no feature.
            total_off_target += 1
            overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
        else:
            total_annotated += 1
            if (a_chr, a_start, a_end) not in met:
                total_uniq_annotated += 1
                met.add((a_chr, a_start, a_end))

            e_gene = overlap_fields[ebl.BedCols.GENE]
            if keep_gene_column and e_gene != a_gene:
                # Keeping the original gene name: ignore overlaps with others.
                overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
            else:
                transcript_id = overlap_fields[ebl.BedCols.ENSEMBL_ID]
                overlaps_by_tx_by_gene_by_loc[reg][e_gene][transcript_id].append(
                    (overlap_fields, overlap_size))

    info(' Total annotated regions: ' + str(total_annotated))
    info(' Total unique annotated regions: ' + str(total_uniq_annotated))
    info(' Total off target regions: ' + str(total_off_target))
    info('Resolving ambiguities...')
    annotated = _resolve_ambiguities(overlaps_by_tx_by_gene_by_loc, chr_order, **kwargs)

    return annotated
def annotate(input_bed_fpath, output_fpath, work_dir, genome=None,
             reannotate=True, high_confidence=False, only_canonical=False,
             coding_only=False, short=False, extended=False, is_debug=False,
             **kwargs):
    """Annotate regions in a BED file with Ensembl gene/transcript features
    and write the result to *output_fpath*.

    NOTE(review): an identical second definition of annotate() appears later
    in this file and shadows this one at import time; the duplicate should be
    removed.

    :param input_bed_fpath: BED file to annotate (sorted internally)
    :param output_fpath: destination path (written via file_transaction)
    :param work_dir: directory for intermediate files
    :param genome: genome build name; when None, chromosome order is inferred
        from the input BED itself
    :param reannotate: re-annotate even if a gene column is present (forced on
        for 3-column input)
    :param high_confidence, only_canonical, coding_only: feature filters
    :param short, extended: control how many columns are written
    :param is_debug: dump intermediate BED files into work_dir
    :return: output_fpath
    """
    debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        # BUGFIX: genome may be None (its default); the original concatenated
        # it directly and raised TypeError instead of the intended message.
        critical('Genome ' + str(genome) + ' is not supported. Supported: ' +
                 ', '.join(ebl.SUPPORTED_GENOMES))

    if genome:
        fai_fpath = reference_data.get_fai(genome)
        chr_order = reference_data.get_chrom_order(genome)
    else:
        fai_fpath = None
        chr_order = bed_chrom_order(input_bed_fpath)

    input_bed_fpath = sort_bed(input_bed_fpath, work_dir=work_dir,
                               chr_order=chr_order, genome=genome)

    ori_bed = BedTool(input_bed_fpath)
    ori_col_num = ori_bed.field_count()
    # A bare 3-column BED has no gene column to preserve, so always re-annotate.
    reannotate = reannotate or ori_col_num == 3
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    ori_bed = BedTool(input_bed_fpath)

    if high_confidence:
        features_bed = features_bed.filter(ebl.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ebl.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ebl.protein_coding_filter)

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(
        lambda x: x[ebl.BedCols.FEATURE] in ['exon', 'CDS', 'stop_codon', 'transcript'])

    info('Overlapping regions with Ensembl data')
    if is_debug:
        ori_bed = ori_bed.saveas(join(work_dir, 'bed.bed'))
        debug(f'Saved regions to {ori_bed.fn}')
        features_bed = features_bed.saveas(join(work_dir, 'features.bed'))
        debug(f'Saved features to {features_bed.fn}')
    annotated = _annotate(ori_bed, features_bed, chr_order, fai_fpath, work_dir,
                          ori_col_num, high_confidence=False,
                          reannotate=reannotate, is_debug=is_debug, **kwargs)

    full_header = [ebl.BedCols.names[i] for i in ebl.BedCols.cols]
    add_ori_extra_fields = ori_col_num > 3
    if not reannotate and ori_col_num == 4:
        add_ori_extra_fields = False  # no need to report the original gene field if we are not re-annotating

    info('Saving annotated regions...')
    total = 0
    with file_transaction(work_dir, output_fpath) as tx:
        with open(tx, 'w') as out:
            header = full_header[:6]
            if short:
                header = full_header[:4]
            if extended:
                header = full_header[:-1]
            if add_ori_extra_fields:
                header.append(full_header[-1])
            if extended:
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.TX_OVERLAP_PERCENTAGE] +
                          ': part of region overlapping with transcripts\n')
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.EXON_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with exons\n')
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.CDS_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with protein coding regions\n')
            out.write('\t'.join(header) + '\n')
            for full_fields in annotated:
                fields = full_fields[:6]
                if short:
                    fields = full_fields[:4]
                if extended:
                    fields = full_fields[:-1]
                if add_ori_extra_fields:
                    fields.append(full_fields[-1])
                out.write('\t'.join(map(_format_field, fields)) + '\n')
                total += 1
    debug('Saved ' + str(total) + ' total annotated regions')
    return output_fpath
def annotate(input_bed_fpath, output_fpath, work_dir, genome=None,
             reannotate=True, high_confidence=False, only_canonical=False,
             coding_only=False, short=False, extended=False, is_debug=False,
             **kwargs):
    """Annotate regions in a BED file with Ensembl gene/transcript features
    and write the result to *output_fpath*.

    NOTE(review): this is a byte-for-byte duplicate of the annotate()
    defined earlier in this file; being defined later, it is the one that
    takes effect. The duplicate should be removed.

    :param input_bed_fpath: BED file to annotate (sorted internally)
    :param output_fpath: destination path (written via file_transaction)
    :param work_dir: directory for intermediate files
    :param genome: genome build name; when None, chromosome order is inferred
        from the input BED itself
    :param reannotate: re-annotate even if a gene column is present (forced on
        for 3-column input)
    :param high_confidence, only_canonical, coding_only: feature filters
    :param short, extended: control how many columns are written
    :param is_debug: dump intermediate BED files into work_dir
    :return: output_fpath
    """
    debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        # BUGFIX: genome may be None (its default); the original concatenated
        # it directly and raised TypeError instead of the intended message.
        critical('Genome ' + str(genome) + ' is not supported. Supported: ' +
                 ', '.join(ebl.SUPPORTED_GENOMES))

    if genome:
        fai_fpath = reference_data.get_fai(genome)
        chr_order = reference_data.get_chrom_order(genome)
    else:
        fai_fpath = None
        chr_order = bed_chrom_order(input_bed_fpath)

    input_bed_fpath = sort_bed(input_bed_fpath, work_dir=work_dir,
                               chr_order=chr_order, genome=genome)

    ori_bed = BedTool(input_bed_fpath)
    ori_col_num = ori_bed.field_count()
    # A bare 3-column BED has no gene column to preserve, so always re-annotate.
    reannotate = reannotate or ori_col_num == 3
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    ori_bed = BedTool(input_bed_fpath)

    if high_confidence:
        features_bed = features_bed.filter(ebl.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ebl.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ebl.protein_coding_filter)

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(
        lambda x: x[ebl.BedCols.FEATURE] in ['exon', 'CDS', 'stop_codon', 'transcript'])

    info('Overlapping regions with Ensembl data')
    if is_debug:
        ori_bed = ori_bed.saveas(join(work_dir, 'bed.bed'))
        debug(f'Saved regions to {ori_bed.fn}')
        features_bed = features_bed.saveas(join(work_dir, 'features.bed'))
        debug(f'Saved features to {features_bed.fn}')
    annotated = _annotate(ori_bed, features_bed, chr_order, fai_fpath, work_dir,
                          ori_col_num, high_confidence=False,
                          reannotate=reannotate, is_debug=is_debug, **kwargs)

    full_header = [ebl.BedCols.names[i] for i in ebl.BedCols.cols]
    add_ori_extra_fields = ori_col_num > 3
    if not reannotate and ori_col_num == 4:
        add_ori_extra_fields = False  # no need to report the original gene field if we are not re-annotating

    info('Saving annotated regions...')
    total = 0
    with file_transaction(work_dir, output_fpath) as tx:
        with open(tx, 'w') as out:
            header = full_header[:6]
            if short:
                header = full_header[:4]
            if extended:
                header = full_header[:-1]
            if add_ori_extra_fields:
                header.append(full_header[-1])
            if extended:
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.TX_OVERLAP_PERCENTAGE] +
                          ': part of region overlapping with transcripts\n')
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.EXON_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with exons\n')
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.CDS_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with protein coding regions\n')
            out.write('\t'.join(header) + '\n')
            for full_fields in annotated:
                fields = full_fields[:6]
                if short:
                    fields = full_fields[:4]
                if extended:
                    fields = full_fields[:-1]
                if add_ori_extra_fields:
                    fields.append(full_fields[-1])
                out.write('\t'.join(map(_format_field, fields)) + '\n')
                total += 1
    debug('Saved ' + str(total) + ' total annotated regions')
    return output_fpath
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    """Guess the sample sex by comparing chrY key-region coverage with the
    sample's average coverage depth.

    :param work_dir: directory for intermediate files
    :param bam_fpath: aligned BAM (required; indexed here)
    :param avg_depth: sample-wide average coverage depth
    :param genome: genome build name, matched against chry_key_regions_by_genome
    :param target_bed: capture target BED; when given, the chrY key regions
        are restricted to the target
    :return: 'F', 'M', or None when sex cannot be determined
    """
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    # Pick the chrY key-regions BED matching this genome build.
    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        # Restrict the male key regions to the capture target.
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' +
              str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) + ') - cannot determine sex')
        return None

    # BUGFIX: the original messages contained "it\s" (a literal backslash-s,
    # as \s is not a recognized escape); they were meant to read "it's".
    if chry_mean_coverage == 0:
        debug('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s male')
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
def parse_qualimap_results(sample):
    """Collect QualiMap metrics for *sample* into four stat dicts.

    Reads the QualiMap HTML report plus the coverage/GC/insert-size
    histograms referenced by the sample object.

    :param sample: object exposing qualimap_html_fpath, qualimap_cov_hist_fpath,
        qualimap_gc_hist_fpath and qualimap_ins_size_hist_fpath
    :return: (depth_stats, reads_stats, indels_stats, target_stats) dicts
    """
    if not verify_file(sample.qualimap_html_fpath):
        critical('QualiMap report was not found')

    # Metric name -> value, parsed from the main QualiMap HTML report.
    qualimap_value_by_metric = report_parser.parse_qualimap_sample_report(sample.qualimap_html_fpath)
    bases_by_depth, median_depth = parse_qualimap_coverage_hist(sample.qualimap_cov_hist_fpath)
    median_gc, median_human_gc = parse_qualimap_gc_content(sample.qualimap_gc_hist_fpath)
    median_ins_size = parse_qualimap_insert_size(sample.qualimap_ins_size_hist_fpath)

    def find_rec(name, percent=False, on_target=True):
        # Look up a metric, preferring the "(on target)" variant when asked;
        # `percent` selects the " %" flavour of the metric name.
        # NOTE(review): `if res:` falls back to the plain metric when the
        # on-target value is falsy (e.g. 0) — presumably intended as a
        # missing-value check; confirm 0 should not be returned as-is.
        if on_target:
            name_on_target = name + ' (on target)'
            if percent:
                name_on_target += ' %'
            res = qualimap_value_by_metric.get(name_on_target)
            if res:
                return res
        if percent:
            name += ' %'
        return qualimap_value_by_metric.get(name)

    depth_stats = dict(
        ave_depth = find_rec('Coverage Mean'),
        stddev_depth = find_rec('Coverage Standard Deviation'),
        median_depth = median_depth,
        bases_by_depth = bases_by_depth
    )
    target_stats = dict(
        reference_size = find_rec('Reference size'),
        target_size = find_rec('Regions size/percentage of reference'),
        target_fraction = find_rec('Regions size/percentage of reference', percent=True),
    )
    reads_stats = dict(
        total = find_rec('Number of reads'),
        mapped = find_rec('Mapped reads', on_target=False),
        mapped_rate = find_rec('Mapped reads', percent=True, on_target=False),
        unmapped = find_rec('Unmapped reads'),
        unmapped_rate = find_rec('Unmapped reads', percent=True),
        mapped_on_target = find_rec('Mapped reads (on target)'),
        mapped_rate_on_target = find_rec('Mapped reads (on target)', percent=True),
        mapped_paired = find_rec('Mapped paired reads', on_target=False),
        mapped_paired_rate = find_rec('Mapped paired reads', percent=True, on_target=False),
        paired = find_rec('Paired reads'),
        paired_rate = find_rec('Paired reads', percent=True),
        dup = find_rec('Duplicated reads (flagged)'),
        dup_rate = find_rec('Duplicated reads (flagged)', percent=True),
        min_len = find_rec('Read min length'),
        max_len = find_rec('Read max length'),
        ave_len = find_rec('Read mean length'),
        median_gc = median_gc,
        median_human_gc = median_human_gc,
        median_ins_size = median_ins_size,
    )
    indels_stats = dict(
        mean_mq = find_rec('Mean Mapping Quality'),
        mismatches = find_rec('Mismatches'),
        insertions = find_rec('Insertions'),
        deletions = find_rec('Deletions'),
        homo_indels = find_rec('Homopolymer indels'),
    )
    return depth_stats, reads_stats, indels_stats, target_stats