def _merge_target_information(samples, metrics_dir): out_file = os.path.abspath(os.path.join(metrics_dir, "target_info.yaml")) if utils.file_exists(out_file): return samples genomes = set(dd.get_genome_build(data) for data in samples) coverage_beds = set(dd.get_coverage(data) for data in samples) original_variant_regions = set(dd.get_variant_regions_orig(data) for data in samples) data = samples[0] info = {} # Reporting in MultiQC only if the genome is the same across all samples if len(genomes) == 1: info["genome_info"] = { "name": dd.get_genome_build(data), "size": sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])]), } # Reporting in MultiQC only if the target is the same across all samples vcr_orig = None if len(original_variant_regions) == 1 and list(original_variant_regions)[0] is not None: vcr_orig = list(original_variant_regions)[0] vcr_clean = bedutils.clean_file(vcr_orig, data) info["variants_regions_info"] = { "bed": vcr_orig, "size": sum(len(x) for x in pybedtools.BedTool(dd.get_variant_regions_merged(data))), "regions": pybedtools.BedTool(vcr_clean).count(), } gene_num = annotate.count_genes(vcr_clean, data) if gene_num is not None: info["variants_regions_info"]["genes"] = gene_num else: info["variants_regions_info"] = { "bed": "callable regions", } # Reporting in MultiQC only if the target is the same across samples if len(coverage_beds) == 1: cov_bed = list(coverage_beds)[0] if cov_bed not in [None, "None"]: if vcr_orig and vcr_orig == cov_bed: info["coverage_bed_info"] = info["variants_regions_info"] else: clean_bed = bedutils.clean_file(cov_bed, data, prefix="cov-", simple=True) info["coverage_bed_info"] = { "bed": cov_bed, "size": pybedtools.BedTool(cov_bed).total_coverage(), "regions": pybedtools.BedTool(clean_bed).count(), } gene_num = annotate.count_genes(clean_bed, data) if gene_num is not None: info["coverage_bed_info"]["genes"] = gene_num else: info["coverage_bed_info"] = info["variants_regions_info"] coverage_intervals = set(data["config"]["algorithm"]["coverage_interval"] for data in samples) if len(coverage_intervals) == 1: info["coverage_interval"] = list(coverage_intervals)[0] if info: with open(out_file, "w") as out_handle: yaml.safe_dump(info, out_handle) return samples
def _merge_target_information(samples, metrics_dir): out_file = os.path.abspath(os.path.join(metrics_dir, "target_info.yaml")) if utils.file_exists(out_file): return samples genomes = set(dd.get_genome_build(data) for data in samples) coverage_beds = set(dd.get_coverage(data) for data in samples) original_variant_regions = set( dd.get_variant_regions_orig(data) for data in samples) data = samples[0] info = {} # Reporting in MultiQC only if the genome is the same across all samples if len(genomes) == 1: info["genome_info"] = { "name": dd.get_genome_build(data), "size": sum([ c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"]) ]), } # Reporting in MultiQC only if the target is the same across all samples vcr_orig = None if len(original_variant_regions) == 1 and list( original_variant_regions)[0] is not None: vcr_orig = list(original_variant_regions)[0] vcr_clean = bedutils.clean_file(vcr_orig, data) info["variants_regions_info"] = { "bed": vcr_orig, "size": sum( len(x) for x in pybedtools.BedTool( dd.get_variant_regions_merged(data))), "regions": pybedtools.BedTool(vcr_clean).count(), } gene_num = annotate.count_genes(vcr_clean, data) if gene_num is not None: info["variants_regions_info"]["genes"] = gene_num else: info["variants_regions_info"] = { "bed": "callable regions", } # Reporting in MultiQC only if the target is the same across samples if len(coverage_beds) == 1: cov_bed = list(coverage_beds)[0] if cov_bed not in [None, "None"]: if vcr_orig and vcr_orig == cov_bed: info["coverage_bed_info"] = info["variants_regions_info"] else: clean_bed = bedutils.clean_file(cov_bed, data, prefix="cov-", simple=True) info["coverage_bed_info"] = { "bed": cov_bed, "size": pybedtools.BedTool(cov_bed).total_coverage(), "regions": pybedtools.BedTool(clean_bed).count(), } gene_num = annotate.count_genes(clean_bed, data) if gene_num is not None: info["coverage_bed_info"]["genes"] = gene_num else: info["coverage_bed_info"] = info["variants_regions_info"] coverage_intervals = set(data["config"]["algorithm"]["coverage_interval"] for data in samples) if len(coverage_intervals) == 1: info["coverage_interval"] = list(coverage_intervals)[0] if info: with open(out_file, "w") as out_handle: yaml.safe_dump(info, out_handle) return samples
def _merge_target_information(samples): out_file = os.path.join("metrics", "target_info.yaml") if utils.file_exists(out_file): return samples genomes = set(dd.get_genome_build(data) for data in samples) coverage_beds = set(dd.get_coverage(data) for data in samples) variant_regions = set(dd.get_variant_regions(data) for data in samples) data = samples[0] info = {} # Reporting in MultiQC only if the genome is the sample across samples if len(genomes) == 1: info["genome_info"] = { "name": dd.get_genome_build(data), "size": sum([ c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"]) ]), } # Reporting in MultiQC only if the target is the sample across samples vcr = None if len(variant_regions) == 1: vcr = dd.get_variant_regions_orig(data) vcr_merged = dd.get_variant_regions_merged(data) vcr_ann = annotate.add_genes(vcr, data) info["variants_regions_info"] = { "bed": variant_regions, "size": sum(len(x) for x in pybedtools.BedTool(vcr_merged)), "regions": pybedtools.BedTool(vcr).count(), "genes": len( list( set(r.name for r in pybedtools.BedTool(vcr_ann) if r.name and r.name != "."))), } elif len(variant_regions) == 0: info["variants_regions_info"] = {"bed": None} # Reporting in MultiQC only if the target is the sample across samples if len(coverage_beds) == 1: bed = dd.get_coverage(data) if vcr and vcr == bed: info["coverage_bed_info"] = info["variants_regions_info"] elif bed: ann_bed = annotate.add_genes(bed, data) info["coverage_bed_info"] = { "bed": bed, "size": pybedtools.BedTool(bed).total_coverage(), "regions": pybedtools.BedTool(bed).count(), "genes": len( list( set(r.name for r in pybedtools.BedTool(ann_bed) if r.name and r.name != "."))), } if info: with open(out_file, "w") as out_handle: yaml.safe_dump(info, out_handle) return samples