def get_sample_to_work_on(vcf_readers: List[VcfReader], requested_sample: Optional[str]):
    """
    Determine the single sample that an operation on several VCFs should use.

    vcf_readers -- readers whose ``samples`` attributes are intersected; must not be empty
    requested_sample -- sample name requested on the command line, or a falsy value
        to pick the sample automatically

    Return the requested sample if it is present in every VCF. If no sample was
    requested, return the one sample that all VCFs have in common.

    Raise CommandLineError if the requested sample is missing from at least one VCF,
    or if no sample was requested and the VCFs share no sample or more than one sample.
    """
    sample_sets = [set(vcf_reader.samples) for vcf_reader in vcf_readers]
    # Without any reader there is nothing to intersect
    assert sample_sets
    sample_intersection = set.intersection(*sample_sets)

    if requested_sample:
        if requested_sample not in sample_intersection:
            raise CommandLineError(
                "Sample {!r} requested on command-line not found in all VCFs".format(
                    requested_sample
                )
            )
        return requested_sample

    if len(sample_intersection) == 0:
        raise CommandLineError("None of the samples is present in all VCFs")
    if len(sample_intersection) > 1:
        raise CommandLineError(
            "More than one sample is present in all VCFs, please use"
            " --sample to specify which sample to work on."
        )
    # Exactly one shared sample is left
    return next(iter(sample_intersection))
def raise_if_any_sample_not_in_vcf(vcf_reader, samples):
    """
    Check that every requested sample occurs in the VCF.

    vcf_reader -- object whose ``samples`` attribute lists the samples of the VCF
    samples -- iterable of requested sample names

    Raise CommandLineError for the first requested sample that is not in the VCF.
    """
    known_samples = frozenset(vcf_reader.samples)
    for name in samples:
        if name in known_samples:
            continue
        message = "Sample {!r} requested on command-line not found in VCF".format(name)
        raise CommandLineError(message)
def run_hapcut2vcf(hapcut, vcf, output=sys.stdout):
    """
    Write the phasing found in a hapCUT output file into a phased VCF.

    hapcut -- path to the hapCUT output file
    vcf -- input VCF, passed on to PhasedVcfWriter
    output -- path of the output file or an already opened file-like object

    Raise CommandLineError if the VCF contains more than one sample.
    """
    arguments = " ".join(sys.argv[1:])
    command_line = "(whatshap {}) {}".format(__version__, arguments)
    with ExitStack() as stack:
        # A string is treated as a path; the stack closes the file again
        if isinstance(output, str):
            output = stack.enter_context(open(output, "w"))
        writer = PhasedVcfWriter(vcf, command_line, out_file=output)
        if len(writer.samples) > 1:
            # This would be easy to support with a --sample command-line parameter,
            # but hapCUT does not seem to support multi-sample VCFs, so something
            # must be wrong anyway.
            raise CommandLineError("There is more than one sample in this VCF")
        sample = writer.samples[0]

        hapcut_file = stack.enter_context(open(hapcut))
        for chromosome, blocks in HapCutParser(hapcut_file):
            logger.info("Read %d phased blocks for chromosome %s", len(blocks), chromosome)

            # Build one read for each haplotype and the connected components
            first_haplotype = Read("1")
            second_haplotype = Read("2")
            components = {}
            for block in blocks:
                for variant in block:
                    position = variant.position
                    first_haplotype.add_variant(position, variant.haplotype1, 0)
                    second_haplotype.add_variant(position, variant.haplotype2, 0)
                    components[position] = variant.component_id

            writer.write(
                chromosome,
                {sample: [first_haplotype, second_haplotype]},
                {sample: components},
            )
def open_haplotag_writer(path):
    """
    Open the haplotag list output and write its header line.

    path -- output path; None sends the output to os.devnull

    Return the opened writer. The caller is responsible for closing it.

    Raise CommandLineError if opening the output fails with an OSError.
    """
    target = os.devnull if path is None else path
    try:
        writer = xopen(target, "wt")
    except OSError as err:
        raise CommandLineError(
            "Error while initializing haplotag list output at path: {}\n{}".format(target, err)
        )
    logger.debug("Writing header line to haplotag list output file")
    columns = ("#readname", "haplotype", "phaseset", "chromosome")
    print(*columns, sep="\t", file=writer)
    return writer
def get_variant_tables(
    vcf_readers: List[VcfReader], vcf_filenames: List[str]
) -> List[Dict[str, VariantTable]]:
    """
    Read all variant tables from each VCF reader.

    vcf_readers -- readers to consume; iterating one yields variant tables
    vcf_filenames -- file names belonging to the readers, used for logging only

    Return one dict per reader that maps chromosome names to VariantTables.

    Raise CommandLineError if reading fails with a PloidyError.
    """
    tables_per_file = []
    for reader, filename in zip(vcf_readers, vcf_filenames):
        logger.info("Reading phasing from %r", filename)
        try:
            # create dict mapping chromosome names to VariantTables
            chromosome_to_table = {table.chromosome: table for table in reader}
        except PloidyError as e:
            raise CommandLineError("Provided ploidy is invalid: {}. Aborting.".format(e))
        tables_per_file.append(chromosome_to_table)
    return tables_per_file
def make_recombination_cost_computer(
    ped: Optional[str], genmap: Optional[str], recombrate: float
) -> RecombinationCostComputer:
    """
    Create the object that computes recombination costs.

    ped -- PED file; the genetic map is only used when this is given as well
    genmap -- genetic map file with region-specific recombination rates
    recombrate -- uniform recombination rate in cM/Mb, used when no genetic map applies

    Return a GeneticMapRecombinationCostComputer if both ped and genmap are given,
    otherwise a UniformRecombinationCostComputer.

    Raise CommandLineError if the genetic map cannot be parsed.
    """
    if not (ped and genmap):
        # The uniform rate is only worth mentioning in pedigree mode
        if ped:
            logger.info("Using uniform recombination rate of %g cM/Mb.", recombrate)
        return UniformRecombinationCostComputer(recombrate)

    logger.info(
        "Using region-specific recombination rates from genetic map %s.", genmap)
    try:
        computer = GeneticMapRecombinationCostComputer(genmap)
    except ParseError as e:
        raise CommandLineError(e)
    return computer
def open_output_alignment_file(aln_output, reference, vcf_md5, bam_header):
    """
    Open the alignment output file with a WhatsHap @PG entry in its header.

    :param aln_output: output path; None selects "-" as the output
    :param reference: reference FASTA, required if the output path ends in ".cram"
    :param vcf_md5: value stored in the "m5" field of the added PG entry
    :param bam_header: header dict; the PG entry is appended to it in place
    :return: the opened pysam.AlignmentFile
    :raises ValueError: if CRAM output is requested without a reference
    :raises CommandLineError: if opening the output fails with an OSError
    """
    # Prepare header
    # TODO: convince pysam to allow @HS header line
    program_record = {
        "ID": "whatshap",
        "PN": "whatshap",
        "VN": __version__,
        "CL": " ".join(["whatshap"] + sys.argv[1:]),
        "m5": vcf_md5,
    }
    bam_header.setdefault("PG", []).append(program_record)

    if aln_output is None:
        # NOTE(review): no mode is passed in this branch, so pysam's default
        # mode applies -- confirm that it opens "-" for writing
        aln_output = "-"
        open_options = {}
    elif str(aln_output).endswith(".cram"):  # FIXME hard-coded value
        if reference is None:
            raise ValueError(
                'Writing CRAM output requires FASTA reference file given via "--reference"'
            )
        open_options = {"mode": "wc", "reference_filename": reference}
    else:
        # Write BAM
        open_options = {"mode": "wb"}

    try:
        header = pysam.AlignmentHeader.from_dict(bam_header)
        bam_writer = pysam.AlignmentFile(aln_output, header=header, **open_options)
    except OSError as err:
        raise CommandLineError(
            "Error while initializing alignment output file at path: {}\n{}".format(
                aln_output, err))
    return bam_writer
def run_whatshap(
    phase_input_files: List[str],
    variant_file: str,
    reference: Union[None, bool, str] = False,
    output: TextIO = sys.stdout,
    samples: Optional[List[str]] = None,
    chromosomes: Optional[List[str]] = None,
    ignore_read_groups: bool = False,
    indels: bool = True,
    mapping_quality: int = 20,
    read_merging: bool = False,
    read_merging_error_rate: float = 0.15,
    read_merging_max_error_rate: float = 0.25,
    read_merging_positive_threshold: int = 1000000,
    read_merging_negative_threshold: int = 1000,
    max_coverage: int = 15,
    distrust_genotypes: bool = False,
    include_homozygous: bool = False,
    ped: Optional[str] = None,
    recombrate: float = 1.26,
    genmap: Optional[str] = None,
    genetic_haplotyping: bool = True,
    recombination_list_filename: Optional[str] = None,
    tag: str = "PS",
    read_list_filename: Optional[str] = None,
    gl_regularizer: Optional[float] = None,
    gtchange_list_filename: Optional[str] = None,
    default_gq: int = 30,
    write_command_line_header: bool = True,
    use_ped_samples: bool = False,
    algorithm: str = "whatshap",
):
    """
    Run WhatsHap.

    The input VCF is processed chromosome by chromosome. For each requested
    chromosome, every family (or single individual) is phased separately and the
    results of all samples are written to the output VCF together. Chromosomes
    that were not requested are written with empty phasing results.

    phase_input_files -- list of paths to BAM/CRAM/VCF files
    variant_file -- path to input VCF
    reference -- path to reference FASTA. If False: skip realignment.
        If None: complain if reference needed.
    output -- path to output VCF or a file-like object
    samples -- names of samples to phase. an empty list means: phase all samples
    chromosomes -- names of chromosomes to phase. an empty list means: phase all chromosomes
    ignore_read_groups -- passed on to the phased input reader; on a multi-sample VCF
        it requires samples to be given
    indels -- passed on to the VCF writer, the VCF reader and the phased input reader
    mapping_quality -- discard reads below this mapping quality
    read_merging -- whether or not to merge reads
    read_merging_error_rate -- probability that a nucleotide is wrong
    read_merging_max_error_rate -- max error rate on edge of merge graph considered
    read_merging_positive_threshold -- threshold on the ratio of the two probabilities
    read_merging_negative_threshold -- threshold on the opposite ratio of positive threshold
    max_coverage -- coverage budget; it is divided evenly among the members of a family,
        with at least 1 per sample
    distrust_genotypes -- genotype likelihoods are only read from the VCF when this is set;
        also passed on to pedigree creation and the phasing algorithm
    include_homozygous -- passed on to find_phaseable_variants
    ped -- PED file; enables pedigree handling (not supported by algorithm 'hapchat')
    recombrate -- uniform recombination rate in cM/Mb, used unless ped and genmap are given
    genmap -- genetic map file, only used together with ped
    genetic_haplotyping -- in ped mode, merge disconnected blocks based on genotype status
    recombination_list_filename -- filename to write putative recombination events to
    tag -- How to store phasing info in the VCF, can be 'PS' or 'HP'
    read_list_filename -- name of file to write list of used reads to
    algorithm -- algorithm to use, can be 'whatshap' or 'hapchat'
    gl_regularizer -- float to be passed as regularization constant to
        GenotypeLikelihoods.as_phred
    gtchange_list_filename -- filename to write list of changed genotypes to
    default_gq -- genotype likelihood to be used when GL or PL not available
    write_command_line_header -- whether to add a ##commandline header to the output VCF
    use_ped_samples -- together with ped: use only the samples from the PED file

    Raise CommandLineError for invalid option combinations (hapchat with ped, missing
    reference, --ignore-read-groups on a multi-sample VCF without samples), for samples
    that are not in the VCF and when the output VCF cannot be set up.
    """
    if algorithm == "hapchat" and ped is not None:
        raise CommandLineError(
            "The hapchat algorithm cannot do pedigree phasing")

    timers = StageTimer()
    logger.info(
        f"This is WhatsHap {__version__} running under Python {platform.python_version()}"
    )
    numeric_sample_ids = NumericSampleIds()
    command_line: Optional[str]
    if write_command_line_header:
        command_line = "(whatshap {}) {}".format(__version__, " ".join(sys.argv[1:]))
    else:
        command_line = None

    # Pick the merge strategy once; both variants are used through .merge() below
    read_merger: ReadMergerBase
    if read_merging:
        read_merger = ReadMerger(
            read_merging_error_rate,
            read_merging_max_error_rate,
            read_merging_positive_threshold,
            read_merging_negative_threshold,
        )
    else:
        read_merger = DoNothingReadMerger()

    # All readers and writers are registered on the stack so that they are
    # closed together when the block is left
    with ExitStack() as stack:
        try:
            vcf_writer = stack.enter_context(
                PhasedVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=output,
                    tag=tag,
                    indels=indels,
                ))
        except (OSError, VcfError) as e:
            raise CommandLineError(e)

        # reference=False means "skip realignment"; the reader receives None then
        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                None if reference is False else reference,
                numeric_sample_ids,
                ignore_read_groups,
                mapq_threshold=mapping_quality,
                indels=indels,
            ))
        show_phase_vcfs = phased_input_reader.has_vcfs

        if phased_input_reader.has_alignments and reference is None:
            raise CommandLineError(
                "A reference FASTA needs to be provided with -r/--reference; "
                "or use --no-reference at the expense of phasing quality.")

        # Only read genotype likelihoods from VCFs when distrusting genotypes
        vcf_reader = stack.enter_context(
            VcfReader(variant_file, indels=indels, genotype_likelihoods=distrust_genotypes))

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used.")
        if not samples:
            samples = vcf_reader.samples

        # if --use-ped-samples is set, use only samples from PED file
        if ped and use_ped_samples:
            samples = PedReader(ped).samples()

        raise_if_any_sample_not_in_vcf(vcf_reader, samples)

        recombination_cost_computer = make_recombination_cost_computer(
            ped, genmap, recombrate)

        families, family_trios = setup_families(samples, ped, max_coverage)
        # The remaining code works on families/family_trios only
        del samples
        for trios in family_trios.values():
            for trio in trios:
                # Ensure that all mentioned individuals have a numeric id
                _ = numeric_sample_ids[trio.child]

        read_list = None
        if read_list_filename:
            read_list = stack.enter_context(ReadList(read_list_filename))
            if algorithm == "hapchat":
                logger.warning(
                    "On which haplotype a read occurs in the inferred solution is not yet "
                    "implemented in hapchat, and so the corresponding column in the "
                    "read list file contains no information about this")

        with timers("parse_phasing_vcfs"):
            # TODO should this be done in PhasedInputReader.__init__?
            phased_input_reader.read_vcfs()

        superreads: Dict[str, ReadSet]
        components: Dict
        for variant_table in timers.iterate("parse_vcf", vcf_reader):
            chromosome = variant_table.chromosome
            if (not chromosomes) or (chromosome in chromosomes):
                logger.info("======== Working on chromosome %r", chromosome)
            else:
                logger.info(
                    "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                    chromosome,
                )
                # The chromosome is still written, just with empty phasing results
                with timers("write_vcf"):
                    superreads, components = dict(), dict()
                    vcf_writer.write(chromosome, superreads, components)
                continue

            # These two variables hold the phasing results for all samples
            superreads, components = dict(), dict()

            # Iterate over all families to process, i.e. a separate DP table is created
            # for each family.
            # TODO: Can the body of this loop be factored out into a phase_family function?
            for representative_sample, family in sorted(families.items()):
                if len(family) == 1:
                    logger.info("---- Processing individual %s", representative_sample)
                else:
                    logger.info("---- Processing family with individuals: %s", ",".join(family))
                # Split the coverage budget among the family members, at least 1 each
                max_coverage_per_sample = max(1, max_coverage // len(family))
                logger.info("Using maximum coverage per sample of %dX", max_coverage_per_sample)
                trios = family_trios[representative_sample]
                assert len(family) == 1 or len(trios) > 0

                homozygous_positions, phasable_variant_table = find_phaseable_variants(
                    family, include_homozygous, trios, variant_table)

                # Get the reads belonging to each sample
                readsets = dict()  # TODO this could become a list
                for sample in family:
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome, phasable_variant_table.variants, sample)

                    # TODO: Read selection done w.r.t. all variants, where using heterozygous
                    # variants only would probably give better results.
                    with timers("select"):
                        readset = readset.subset([
                            i for i, read in enumerate(readset) if len(read) >= 2
                        ])
                        logger.info(
                            "Kept %d reads that cover at least two variants each", len(readset))
                        merged_reads = read_merger.merge(readset)
                        selected_reads = select_reads(
                            merged_reads,
                            max_coverage_per_sample,
                            preferred_source_ids=vcf_source_ids,
                        )

                    readsets[sample] = selected_reads
                    if len(family) == 1 and not distrust_genotypes:
                        # When having a pedigree (len(family) > 1), blocks are also merged after
                        # phasing based on the pedigree information and these statistics are not
                        # so useful. When distrust_genotypes, genotypes can change during phasing
                        # and so can the block structure. So don't print these stats in those cases
                        log_best_case_phasing_info(readset, selected_reads)

                all_reads = merge_readsets(readsets)

                # Determine which variants can (in principle) be phased
                accessible_positions = sorted(all_reads.get_positions())
                logger.info(
                    "Variants covered by at least one phase-informative "
                    "read in at least one individual after read selection: %d",
                    len(accessible_positions),
                )
                if len(family) > 1 and genetic_haplotyping:
                    # In case of genetic haplotyping, also retain all positions homozygous
                    # in at least one individual (because they might be phased based on genotypes)
                    accessible_positions = sorted(
                        set(accessible_positions).union(homozygous_positions))
                    logger.info(
                        "Variants either covered by phase-informative read or homozygous "
                        "in at least one individual: %d",
                        len(accessible_positions),
                    )

                # Keep only accessible positions
                phasable_variant_table.subset_rows_by_position(
                    accessible_positions)
                assert len(phasable_variant_table.variants) == len(
                    accessible_positions)

                pedigree = create_pedigree(
                    default_gq,
                    distrust_genotypes,
                    family,
                    gl_regularizer,
                    numeric_sample_ids,
                    phasable_variant_table,
                    trios,
                )
                recombination_costs = recombination_cost_computer.compute(
                    accessible_positions)

                # Finally, run phasing algorithm
                with timers("phase"):
                    problem_name = "MEC" if len(family) == 1 else "PedMEC"
                    logger.info(
                        "Phasing %d sample%s by solving the %s problem ...",
                        len(family),
                        plural_s(len(family)),
                        problem_name,
                    )

                    dp_table: Union[HapChatCore, PedigreeDPTable]
                    if algorithm == "hapchat":
                        dp_table = HapChatCore(all_reads)
                    else:
                        dp_table = PedigreeDPTable(
                            all_reads,
                            recombination_costs,
                            pedigree,
                            distrust_genotypes,
                            accessible_positions,
                        )

                    superreads_list, transmission_vector = dp_table.get_super_reads(
                    )
                    logger.info("%s cost: %d", problem_name, dp_table.get_optimal_cost())

                with timers("components"):
                    overall_components = compute_overall_components(
                        accessible_positions,
                        all_reads,
                        distrust_genotypes,
                        family,
                        genetic_haplotyping,
                        homozygous_positions,
                        numeric_sample_ids,
                        superreads_list,
                    )
                    log_component_stats(overall_components, len(accessible_positions))

                if recombination_list_filename:
                    n_recombinations = write_recombination_list(
                        recombination_list_filename,
                        chromosome,
                        accessible_positions,
                        overall_components,
                        recombination_costs,
                        transmission_vector,
                        trios,
                    )
                    logger.info(
                        "Total no. of detected recombination events: %d", n_recombinations)

                # Superreads in superreads_list are in the same order as individuals were added to the pedigree
                for sample, sample_superreads in zip(family, superreads_list):
                    superreads[sample] = sample_superreads
                    assert len(sample_superreads) == 2
                    assert (sample_superreads[0].sample_id ==
                            sample_superreads[1].sample_id ==
                            numeric_sample_ids[sample])
                    # identical for all samples
                    components[sample] = overall_components

                if read_list:
                    read_list.write(
                        all_reads,
                        dp_table.get_optimal_partitioning(),
                        components,
                        numeric_sample_ids,
                    )

            # All families are done: write the results for this chromosome
            with timers("write_vcf"):
                logger.info("======== Writing VCF")
                changed_genotypes = vcf_writer.write(chromosome, superreads, components)
                logger.info("Done writing VCF")
            if changed_genotypes:
                assert distrust_genotypes
                logger.info("Changed %d genotypes while writing VCF", len(changed_genotypes))

            if gtchange_list_filename:
                logger.info("Writing list of changed genotypes to %r", gtchange_list_filename)
                write_changed_genotypes(gtchange_list_filename, changed_genotypes)

            logger.debug("Chromosome %r finished", chromosome)

    log_time_and_memory_usage(timers, show_phase_vcfs=show_phase_vcfs)
def run_polyphase(
    phase_input_files,
    variant_file,
    ploidy,
    reference=None,
    output=sys.stdout,
    samples=None,
    chromosomes=None,
    verify_genotypes=False,
    ignore_read_groups=False,
    indels=True,
    mapping_quality=20,
    tag="PS",
    include_haploid_sets=False,
    write_command_line_header=True,
    read_list_filename=None,
    ce_bundle_edges=False,
    min_overlap=2,
    plot_clusters=False,
    plot_threading=False,
    ce_refinements=5,
    block_cut_sensitivity=4,
):
    """
    Run Polyploid Phasing.

    The input VCF is processed chromosome by chromosome and each sample is phased
    on its own. Chromosomes that were not requested are written with empty
    phasing results.

    phase_input_files -- list of paths to BAM/CRAM/VCF files; must not be empty and
        must not contain VCFs
    variant_file -- path to input VCF
    ploidy -- ploidy passed to the VCF reader/writer and the phasing parameters
    reference -- path to reference FASTA
    output -- path to output VCF or a file like object
    samples -- names of samples to phase. An empty list means: phase all samples
    chromosomes -- names of chromosomes to phase. An empty list means: phase all chromosomes
    verify_genotypes -- recompute genotypes from the reads and drop variants whose
        computed genotype differs from the given one
    ignore_read_groups -- passed on to the phased input reader; on a multi-sample VCF
        it requires samples to be given
    indels -- passed on to the VCF reader and the phased input reader
    mapping_quality -- discard reads below this mapping quality
    tag -- How to store phasing info in the VCF, can be 'PS' or 'HP'
    include_haploid_sets -- whether haploid components are handed to the VCF writer
    write_command_line_header -- whether to add a ##commandline header to the output VCF
    read_list_filename -- not implemented; any truthy value raises NotImplementedError
    min_overlap -- reads covering fewer than max(2, min_overlap) variants are removed
    block_cut_sensitivity -- clamped to the range 0..5 (with a warning)
    ce_bundle_edges, ce_refinements, plot_clusters, plot_threading -- stored in the
        PhasingParameter that is handed to phase_single_individual

    Raise CommandLineError for samples missing from the VCF, for --ignore-read-groups
    on a multi-sample VCF without samples, when the output VCF cannot be opened and
    when reading the VCF fails with a PloidyError.
    """
    timers = StageTimer()
    logger.info(
        "This is WhatsHap (polyploid) %s running under Python %s",
        __version__,
        platform.python_version(),
    )
    numeric_sample_ids = NumericSampleIds()
    with ExitStack() as stack:
        assert phase_input_files
        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                reference,
                numeric_sample_ids,
                ignore_read_groups,
                indels=indels,
                mapq_threshold=mapping_quality,
            ))
        assert not phased_input_reader.has_vcfs

        if write_command_line_header:
            command_line = "(whatshap {}) {}".format(__version__, " ".join(sys.argv[1:]))
        else:
            command_line = None
        try:
            vcf_writer = stack.enter_context(
                PhasedVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=output,
                    tag=tag,
                    ploidy=ploidy,
                    include_haploid_sets=include_haploid_sets,
                ))
        except OSError as e:
            raise CommandLineError(e)

        vcf_reader = stack.enter_context(
            VcfReader(
                variant_file,
                indels=indels,
                phases=True,
                genotype_likelihoods=False,
                ploidy=ploidy,
            ))

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used.")

        if not samples:
            samples = vcf_reader.samples

        # Same check (and same error message) as in the diploid code path
        raise_if_any_sample_not_in_vcf(vcf_reader, samples)

        if block_cut_sensitivity < 0:
            logger.warning(
                "Block cut sensitivity was set to negative value. Lowest value (0) is assumed instead."
            )
            block_cut_sensitivity = 0
        elif block_cut_sensitivity > 5:
            logger.warning(
                "Block cut sensitivity level too large. Assuming highest valid value (5) instead."
            )
            block_cut_sensitivity = 5

        samples = frozenset(samples)

        read_list_file = None
        if read_list_filename:
            raise NotImplementedError("create_read_list_file not implemented")
            # read_list_file = create_read_list_file(read_list_filename)

        # Store phasing parameters in tuple to keep function signatures cleaner
        phasing_param = PhasingParameter(
            ploidy=ploidy,
            verify_genotypes=verify_genotypes,
            ce_bundle_edges=ce_bundle_edges,
            min_overlap=min_overlap,
            ce_refinements=ce_refinements,
            block_cut_sensitivity=block_cut_sensitivity,
            plot_clusters=plot_clusters,
            plot_threading=plot_threading,
        )

        # The "parse_vcf" timer runs while the reader produces the next table: it is
        # stopped at the top of each iteration and restarted before the next one
        timers.start("parse_vcf")
        try:
            for variant_table in vcf_reader:
                chromosome = variant_table.chromosome
                timers.stop("parse_vcf")
                if (not chromosomes) or (chromosome in chromosomes):
                    logger.info("======== Working on chromosome %r", chromosome)
                else:
                    logger.info(
                        "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                        chromosome,
                    )
                    with timers("write_vcf"):
                        superreads, components = dict(), dict()
                        vcf_writer.write(chromosome, superreads, components)
                    # Restart the parse timer as the regular path does at the end of an
                    # iteration, so that the stop() above is always paired with a start()
                    timers.start("parse_vcf")
                    continue

                # These variables hold the phasing results for all samples
                superreads, components, haploid_components = dict(), dict(), dict()

                # Iterate over all samples to process
                for sample in samples:
                    logger.info("---- Processing individual %s", sample)

                    # Process inputs for this sample
                    missing_genotypes = set()
                    heterozygous = set()

                    genotypes = variant_table.genotypes_of(sample)
                    for index, gt in enumerate(genotypes):
                        if gt.is_none():
                            missing_genotypes.add(index)
                        elif not gt.is_homozygous():
                            heterozygous.add(index)
                        else:
                            assert gt.is_homozygous()

                    # Everything that is not heterozygous is dropped from a private copy
                    # of the table; the original table is needed for the next sample
                    to_discard = set(range(len(variant_table))).difference(heterozygous)
                    phasable_variant_table = deepcopy(variant_table)

                    # Remove calls to be discarded from variant table
                    phasable_variant_table.remove_rows_by_index(to_discard)

                    logger.info(
                        "Number of variants skipped due to missing genotypes: %d",
                        len(missing_genotypes),
                    )
                    logger.info(
                        "Number of remaining heterozygous variants: %d",
                        len(phasable_variant_table))

                    # Get the reads belonging to this sample
                    timers.start("read_bam")
                    readset, vcf_source_ids = phased_input_reader.read(
                        chromosome, phasable_variant_table.variants, sample)
                    readset.sort()
                    timers.stop("read_bam")

                    # Verify genotypes
                    if verify_genotypes:
                        timers.start("verify_genotypes")
                        logger.info("Verify genotyping of %s", sample)
                        positions = [v.position for v in phasable_variant_table.variants]
                        computed_genotypes = [
                            Genotype(gt)
                            for gt in compute_polyploid_genotypes(readset, ploidy, positions)
                        ]
                        # skip all positions at which genotypes do not match
                        given_genotypes = phasable_variant_table.genotypes_of(sample)
                        matching_genotypes = []
                        missing_genotypes = set()
                        # Diagnostics go to the logger: stdout is the default VCF output
                        logger.debug(
                            "Computed genotypes (%d): %s",
                            len(computed_genotypes),
                            computed_genotypes,
                        )
                        logger.debug(
                            "Given genotypes (%d): %s", len(given_genotypes), given_genotypes
                        )
                        logger.debug("Number of positions: %d", len(positions))
                        for i, g in enumerate(given_genotypes):
                            c_g = computed_genotypes[i]
                            if (g == c_g) or (c_g is None):
                                matching_genotypes.append(g)
                            else:
                                matching_genotypes.append(Genotype([]))
                                missing_genotypes.add(i)
                        phasable_variant_table.set_genotypes_of(sample, matching_genotypes)

                        # Remove variants with deleted genotype
                        phasable_variant_table.remove_rows_by_index(missing_genotypes)
                        logger.info(
                            "Number of variants removed due to inconsistent genotypes: %d",
                            len(missing_genotypes),
                        )
                        logger.info(
                            "Number of remaining heterozygous variants: %d",
                            len(phasable_variant_table),
                        )

                        # Re-read the readset to remove discarded variants
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome, phasable_variant_table.variants, sample)
                        readset.sort()
                        timers.stop("verify_genotypes")

                    # Remove reads with insufficient variants
                    readset = readset.subset([
                        i for i, read in enumerate(readset)
                        if len(read) >= max(2, min_overlap)
                    ])
                    logger.info(
                        "Kept %d reads that cover at least two variants each", len(readset))

                    # Adapt the variant table to the subset of reads
                    phasable_variant_table.subset_rows_by_position(readset.get_positions())

                    # Run the actual phasing
                    (
                        sample_components,
                        sample_haploid_components,
                        sample_superreads,
                    ) = phase_single_individual(
                        readset, phasable_variant_table, sample, phasing_param, output, timers)

                    # Collect results
                    components[sample] = sample_components
                    haploid_components[sample] = sample_haploid_components
                    superreads[sample] = sample_superreads

                with timers("write_vcf"):
                    logger.info("======== Writing VCF")
                    vcf_writer.write(
                        chromosome,
                        superreads,
                        components,
                        haploid_components if include_haploid_sets else None,
                    )
                    # TODO: Use genotype information to polish results
                    # assert len(changed_genotypes) == 0
                    logger.info("Done writing VCF")
                logger.debug("Chromosome %r finished", chromosome)
                timers.start("parse_vcf")
            timers.stop("parse_vcf")
        except PloidyError as e:
            raise CommandLineError(e)

    if read_list_file:
        read_list_file.close()

    logger.info("\n== SUMMARY ==")
    log_memory_usage()
    logger.info("Time spent reading BAM/CRAM: %6.1f s", timers.elapsed("read_bam"))
    logger.info("Time spent parsing VCF: %6.1f s", timers.elapsed("parse_vcf"))
    if verify_genotypes:
        logger.info(
            "Time spent verifying genotypes: %6.1f s",
            timers.elapsed("verify_genotypes"),
        )
    logger.info("Time spent detecting blocks: %6.1f s", timers.elapsed("detecting_blocks"))
    logger.info("Time spent scoring reads: %6.1f s", timers.elapsed("read_scoring"))
    logger.info(
        "Time spent solving cluster editing: %6.1f s",
        timers.elapsed("solve_clusterediting"),
    )
    logger.info("Time spent threading haplotypes: %6.1f s", timers.elapsed("threading"))
    if plot_clusters or plot_threading:
        logger.info("Time spent creating plots: %6.1f s", timers.elapsed("create_plots"))
    logger.info("Time spent writing VCF: %6.1f s", timers.elapsed("write_vcf"))
    logger.info("Time spent on rest: %6.1f s", timers.total() - timers.sum())
    logger.info("Total elapsed time: %6.1f s", timers.total())
def run_compare(
    vcf,
    ploidy,
    names=None,
    sample=None,
    tsv_pairwise=None,
    tsv_multiway=None,
    only_snvs=False,
    switch_error_bed=None,
    plot_blocksizes=None,
    plot_sum_of_blocksizes=None,
    longest_block_tsv=None,
):
    """
    Compare the phasings of one sample in two or more VCFs and print a report.

    vcf -- list of paths to the VCFs to compare
    ploidy -- ploidy passed to the VCF readers and to compare(); the multiway
        comparison, the switch error BED and the longest-block TSV assert ploidy == 2
    names -- comma-separated dataset names, one per VCF; default is file0, file1, ...
    sample -- sample to work on; if not given, the VCFs must share exactly one sample
    tsv_pairwise -- path of a TSV file receiving the pairwise comparison results
    tsv_multiway -- path of a TSV file receiving the multiway comparison results
    only_snvs -- if set, indels are not read from the VCFs
    switch_error_bed -- path of a BED file receiving the switch errors
    plot_blocksizes -- path of a PDF with block size histograms
    plot_sum_of_blocksizes -- path of a PDF with block size histograms that are
        weighted by block size
    longest_block_tsv -- path of a TSV file receiving the per-position phase agreement
        of the longest block

    Raise CommandLineError if the number of names does not match the number of VCFs,
    if no suitable sample is found or if no chromosome is contained in all VCFs.
    """
    with ExitStack() as stack:
        # Register the readers on the stack so that they are closed when the
        # comparison is finished or fails
        vcf_readers = [
            stack.enter_context(
                VcfReader(f, indels=not only_snvs, phases=True, ploidy=ploidy)
            )
            for f in vcf
        ]
        if names:
            dataset_names = names.split(",")
            if len(dataset_names) != len(vcf):
                raise CommandLineError(
                    "Number of names given with --names does not equal number of VCFs."
                )
        else:
            dataset_names = ["file{}".format(i) for i in range(len(vcf))]
        longest_name = max(len(n) for n in dataset_names)

        sample = get_sample_to_work_on(vcf_readers, requested_sample=sample)

        tsv_pairwise_file = tsv_multiway_file = longest_block_tsv_file = switch_error_bedfile = None
        if tsv_pairwise:
            tsv_pairwise_file = stack.enter_context(open(tsv_pairwise, "w"))

        if tsv_multiway:
            tsv_multiway_file = stack.enter_context(open(tsv_multiway, "w"))
            print(
                "#sample",
                "chromosome",
                "dataset_list0",
                "dataset_list1",
                "count",
                sep="\t",
                file=tsv_multiway_file,
            )

        if longest_block_tsv:
            longest_block_tsv_file = stack.enter_context(open(longest_block_tsv, "w"))
            print(
                "#dataset_name0",
                "dataset_name1",
                "#sample",
                "chromosome",
                "position",
                "phase_agreeing",
                sep="\t",
                file=longest_block_tsv_file,
            )

        print("Comparing phasings for sample", sample)

        vcfs = get_variant_tables(vcf_readers, vcf)
        chromosomes = get_common_chromosomes(vcfs)
        if len(chromosomes) == 0:
            raise CommandLineError("No chromosome is contained in all VCFs. Aborting.")

        logger.info("Chromosomes present in all VCFs: %s", ", ".join(chromosomes))

        if tsv_pairwise_file:
            fields = [
                "#sample",
                "chromosome",
                "dataset_name0",
                "dataset_name1",
                "file_name0",
                "file_name1",
            ]
            field_names = [f.name for f in dataclasses.fields(PairwiseComparisonResults)]
            fields.extend(field_names)
            fields.extend(["het_variants0", "only_snvs"])
            print(*fields, sep="\t", file=tsv_pairwise_file)

        if switch_error_bed:
            switch_error_bedfile = stack.enter_context(open(switch_error_bed, "w"))

        print("FILENAMES")
        for name, filename in zip(dataset_names, vcf):
            print(name.rjust(longest_name + 2), "=", filename)

        width = max(longest_name, 15) + 5

        # One list of block statistics per VCF, accumulated over all chromosomes
        all_block_stats = [[] for _ in vcfs]

        def add_block_stats(block_stats):
            assert len(block_stats) == len(all_block_stats)
            for big_list, new_list in zip(all_block_stats, block_stats):
                big_list.extend(new_list)

        for chromosome in sorted(chromosomes):
            print("---------------- Chromosome {} ----------------".format(chromosome))
            all_bed_records = []
            variant_tables = [tables[chromosome] for tables in vcfs]
            all_variants_union = set()
            all_variants_intersection = None
            het_variants_union = set()
            het_variants_intersection = None
            # Number of heterozygous variants in the first dataset
            het_variants0 = None
            print("VARIANT COUNTS (heterozygous / all): ")
            # NOTE(review): count_width is not defined in this function; it is
            # presumably a module-level constant
            for variant_table, name in zip(variant_tables, dataset_names):
                all_variants_union.update(variant_table.variants)
                het_variants = [
                    v
                    for v, gt in zip(variant_table.variants, variant_table.genotypes_of(sample))
                    if not gt.is_homozygous()
                ]
                if het_variants0 is None:
                    het_variants0 = len(het_variants)
                het_variants_union.update(het_variants)
                if all_variants_intersection is None:
                    all_variants_intersection = set(variant_table.variants)
                    het_variants_intersection = set(het_variants)
                else:
                    all_variants_intersection.intersection_update(variant_table.variants)
                    het_variants_intersection.intersection_update(het_variants)

                print(
                    "{}:".format(name).rjust(width),
                    str(len(het_variants)).rjust(count_width),
                    "/",
                    str(len(variant_table.variants)).rjust(count_width),
                )
            print(
                "UNION:".rjust(width),
                str(len(het_variants_union)).rjust(count_width),
                "/",
                str(len(all_variants_union)).rjust(count_width),
            )
            print(
                "INTERSECTION:".rjust(width),
                str(len(het_variants_intersection)).rjust(count_width),
                "/",
                str(len(all_variants_intersection)).rjust(count_width),
            )

            for i in range(len(vcfs)):
                for j in range(i + 1, len(vcfs)):
                    print(
                        "PAIRWISE COMPARISON: {} <--> {}:".format(
                            dataset_names[i], dataset_names[j]
                        )
                    )
                    (
                        results,
                        bed_records,
                        block_stats,
                        longest_block_positions,
                        longest_block_agreement,
                        multiway_results,
                    ) = compare(
                        [variant_tables[i], variant_tables[j]],
                        sample,
                        [dataset_names[i], dataset_names[j]],
                        ploidy,
                    )
                    # With more than two VCFs the block statistics are taken from
                    # the multiway comparison below instead
                    if len(vcfs) == 2:
                        add_block_stats(block_stats)
                    all_bed_records.extend(bed_records)
                    if tsv_pairwise_file:
                        fields = [
                            sample,
                            chromosome,
                            dataset_names[i],
                            dataset_names[j],
                            vcf[i],
                            vcf[j],
                        ]
                        fields.extend(dataclasses.astuple(results))
                        fields.extend([het_variants0, int(only_snvs)])
                        print(*fields, sep="\t", file=tsv_pairwise_file)
                    if longest_block_tsv_file:
                        assert ploidy == 2
                        assert len(longest_block_positions) == len(longest_block_agreement)
                        for position, phase_agreeing in zip(
                            longest_block_positions, longest_block_agreement
                        ):
                            print(
                                dataset_names[i],
                                dataset_names[j],
                                sample,
                                chromosome,
                                position,
                                phase_agreeing,
                                sep="\t",
                                file=longest_block_tsv_file,
                            )

            # if requested, write all switch errors found in the current chromosome to the bed file
            if switch_error_bedfile:
                assert ploidy == 2
                all_bed_records.sort()
                for record in all_bed_records:
                    print(*record, sep="\t", file=switch_error_bedfile)

            if len(vcfs) > 2:
                assert ploidy == 2
                print("MULTIWAY COMPARISON OF ALL PHASINGS:")
                (
                    results,
                    bed_records,
                    block_stats,
                    longest_block_positions,
                    longest_block_agreement,
                    multiway_results,
                ) = compare(variant_tables, sample, dataset_names, ploidy)
                add_block_stats(block_stats)
                if tsv_multiway_file:
                    for (dataset_list0, dataset_list1), count in multiway_results.items():
                        print(
                            sample,
                            chromosome,
                            "{" + dataset_list0 + "}",
                            "{" + dataset_list1 + "}",
                            count,
                            sep="\t",
                            file=tsv_multiway_file,
                        )

        if plot_blocksizes:
            create_blocksize_histogram(plot_blocksizes, all_block_stats, dataset_names)
        if plot_sum_of_blocksizes:
            create_blocksize_histogram(
                plot_sum_of_blocksizes, all_block_stats, dataset_names, use_weights=True
            )
def create_blocksize_histogram(filename, block_stats, names, use_weights=False):
    """
    Write block size histograms of all datasets to a PDF file.

    For both the variant count and the span of the blocks, two pages are created:
    one with the histograms of all datasets drawn on top of each other (50 bins)
    and one with a single histogram call for all datasets (25 bins). All axes are
    logarithmic.

    filename -- path of the PDF file to write
    block_stats -- one list per dataset of objects with the attributes
        variant_count and span
    names -- dataset names used as labels, one per entry of block_stats
    use_weights -- if set, every block is weighted by the plotted value itself

    Raise CommandLineError if numpy or matplotlib cannot be imported.
    """
    try:
        import matplotlib
        import numpy

        matplotlib.use("pdf")
        from matplotlib import pyplot
        from matplotlib.backends.backend_pdf import PdfPages
    except ImportError:
        raise CommandLineError(
            "To use option --plot-blocksizes, you need to have numpy and matplotlib installed."
        )

    assert len(block_stats) == len(names)

    dataset_count = len(block_stats)
    palette = ["#ffa347", "#0064c8", "#b42222", "#22a5b4", "#b47c22", "#6db6ff"]
    if len(palette) < dataset_count:
        # Not enough hand-picked colors: sample the Set1 color map instead
        palette = pyplot.cm.Set1([n / dataset_count for n in range(dataset_count)])
    colors = palette[:dataset_count]

    def finish_page(pdf, xlabel):
        # Label the current figure, switch to log-log axes and add it to the PDF
        pyplot.xlabel(xlabel)
        pyplot.ylabel("Number of blocks")
        pyplot.gca().set_xscale("log")
        pyplot.gca().set_yscale("log")
        pyplot.grid(True)
        pyplot.legend()
        pdf.savefig()
        pyplot.close()

    quantities = [
        (lambda stats: stats.variant_count, "variant count"),
        (lambda stats: stats.span, "span [bp]"),
    ]
    with PdfPages(filename) as pdf:
        for what, xlabel in quantities:
            # First page: one overlaid histogram per dataset
            pyplot.figure(figsize=(10, 8))
            values_per_dataset = [[what(stats) for stats in dataset] for dataset in block_stats]
            max_value = max(chain(*values_per_dataset))
            upper_exponent = math.ceil(math.log10(max_value))
            fine_bins = numpy.logspace(0, upper_exponent, 50)
            for values, name, color in zip(values_per_dataset, names, colors):
                pyplot.hist(
                    values,
                    bins=fine_bins,
                    alpha=0.6,
                    color=color,
                    label=name,
                    weights=values if use_weights else None,
                )
            finish_page(pdf, xlabel)

            # Second page: all datasets in a single histogram call
            pyplot.figure(figsize=(10, 8))
            coarse_bins = numpy.logspace(0, upper_exponent, 25)
            pyplot.hist(
                values_per_dataset,
                bins=coarse_bins,
                alpha=0.6,
                color=colors,
                label=names,
                weights=values_per_dataset if use_weights else None,
            )
            finish_page(pdf, xlabel)
def run_genotype(
    phase_input_files,
    variant_file,
    reference=None,
    output=sys.stdout,
    samples=None,
    chromosomes=None,
    ignore_read_groups=False,
    indels=True,
    mapping_quality=20,
    max_coverage=15,
    nopriors=False,
    ped=None,
    recombrate=1.26,
    genmap=None,
    gt_qual_threshold=0,
    prioroutput=None,
    constant=0.0,
    overhang=10,
    affine_gap=False,
    gap_start=10,
    gap_extend=7,
    mismatch=15,
    write_command_line_header=True,
    use_ped_samples=False,
):
    """
    For now: this function only runs the genotyping algorithm. Genotype likelihoods for
    all variants are computed using the forward backward algorithm

    phase_input_files -- input files handed to PhasedInputReader
    variant_file -- path to input VCF; also the template for the output VCF(s)
    reference -- reference handed to PhasedInputReader
    output -- path or file-like object the genotyped VCF is written to
    samples -- names of samples to genotype. empty/None means: all samples in the VCF
    chromosomes -- names of chromosomes to genotype. empty/None means: all chromosomes.
        Chromosomes not listed are written to the output unchanged
    ignore_read_groups -- handed to PhasedInputReader; on a multi-sample VCF
        this requires samples to be given
    indels -- handed to the VCF reader, the read reader and the VCF writers
    mapping_quality -- handed to PhasedInputReader as mapq_threshold
    max_coverage -- coverage budget; divided evenly among the members of a family
    nopriors -- if true, start from uniform priors (1/3 each) instead of
        priors computed from the reads
    ped -- path to PED file
    recombrate -- uniform recombination rate in cM/Mb (used unless both ped and genmap are given)
    genmap -- genetic map; only used together with ped
    gt_qual_threshold -- phred-scaled threshold; converted to the probability
        1 - 10^(-threshold/10) that is passed to determine_genotype
    prioroutput -- if not None, path of a VCF that receives the prior genotype likelihoods
    constant -- value added to each of the three prior likelihoods before renormalizing
    overhang, affine_gap, gap_start, gap_extend, mismatch -- handed to PhasedInputReader
    write_command_line_header -- whether to pass the command line on to the VCF writers
    use_ped_samples -- if true (and ped is given), genotype only the
        samples that occur in complete trios of the PED file

    Raises CommandLineError on inconsistent sample options.
    """
    timers = StageTimer()
    logger.info(
        "This is WhatsHap (genotyping) %s running under Python %s",
        __version__,
        platform.python_version(),
    )
    if write_command_line_header:
        command_line = "(whatshap {}) {}".format(__version__, " ".join(sys.argv[1:]))
    else:
        command_line = None
    with ExitStack() as stack:
        # read the given input files (BAMs, VCFs, ref...)
        numeric_sample_ids = NumericSampleIds()
        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                reference,
                numeric_sample_ids,
                ignore_read_groups,
                indels=indels,
                mapq_threshold=mapping_quality,
                overhang=overhang,
                affine=affine_gap,
                gap_start=gap_start,
                gap_extend=gap_extend,
                default_mismatch=mismatch,
            )
        )
        show_phase_vcfs = phased_input_reader.has_vcfs

        # vcf writer for final genotype likelihoods
        vcf_writer = stack.enter_context(
            GenotypeVcfWriter(command_line=command_line, in_path=variant_file, out_file=output)
        )
        # vcf writer for only the prior likelihoods (if output is desired)
        prior_vcf_writer = None
        if prioroutput is not None:
            prior_vcf_writer = stack.enter_context(
                GenotypeVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=stack.enter_context(open(prioroutput, "w")),
                )
            )

        # parse vcf with input variants
        # remove all likelihoods that may already be present
        vcf_reader = stack.enter_context(
            VcfReader(
                variant_file,
                indels=indels,
                genotype_likelihoods=False,
                ignore_genotypes=True,
            )
        )

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used."
            )
        if not samples:
            samples = vcf_reader.samples

        # if --use-ped-samples is set, use only samples from PED file
        if ped and use_ped_samples:
            samples = set()
            for trio in PedReader(ped):
                # Only complete trios contribute samples
                if trio.child is None or trio.mother is None or trio.father is None:
                    continue
                samples.add(trio.mother)
                samples.add(trio.father)
                samples.add(trio.child)

        # Same check (and same error message) as used by run_whatshap
        raise_if_any_sample_not_in_vcf(vcf_reader, samples)

        if ped and genmap:
            logger.info(
                "Using region-specific recombination rates from genetic map %s.",
                genmap,
            )
            recombination_cost_computer = GeneticMapRecombinationCostComputer(genmap)
        else:
            if ped:
                logger.info("Using uniform recombination rate of %g cM/Mb.", recombrate)
            recombination_cost_computer = UniformRecombinationCostComputer(recombrate)

        samples = frozenset(samples)
        families, family_trios = setup_families(samples, ped, numeric_sample_ids, max_coverage)

        # Read phase information provided as VCF files, if provided.
        with timers("parse_phasing_vcfs"):
            phased_input_reader.read_vcfs()

        # compute genotype likelihood threshold
        gt_prob = 1.0 - (10 ** (-gt_qual_threshold / 10.0))

        for variant_table in timers.iterate("parse_vcf", vcf_reader):
            # create a mapping of genome positions to indices
            var_to_pos = {
                variant.position: i for i, variant in enumerate(variant_table.variants)
            }

            chromosome = variant_table.chromosome
            if (not chromosomes) or (chromosome in chromosomes):
                logger.info("======== Working on chromosome %r", chromosome)
            else:
                logger.info(
                    "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                    chromosome,
                )
                vcf_writer.write_genotypes(chromosome, variant_table, indels, leave_unchanged=True)
                if prioroutput is not None:
                    prior_vcf_writer.write_genotypes(
                        chromosome, variant_table, indels, leave_unchanged=True
                    )
                continue

            positions = [v.position for v in variant_table.variants]
            if not nopriors:
                # compute prior genotype likelihoods based on all reads
                for sample in samples:
                    logger.info("---- Initial genotyping of %s", sample)
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            variant_table.variants,
                            sample,
                            read_vcf=False,
                        )
                        readset.sort()
                        genotypes, genotype_likelihoods = compute_genotypes(readset, positions)
                        # recompute genotypes based on given threshold
                        reg_genotype_likelihoods = []
                        for index, gl in enumerate(genotype_likelihoods):
                            # Add the regularization constant to each of the three
                            # likelihoods and renormalize so that they sum to one
                            norm_sum = gl[0] + gl[1] + gl[2] + 3 * constant
                            regularized = PhredGenotypeLikelihoods(
                                [
                                    (gl[0] + constant) / norm_sum,
                                    (gl[1] + constant) / norm_sum,
                                    (gl[2] + constant) / norm_sum,
                                ]
                            )
                            genotypes[index] = determine_genotype(regularized, gt_prob)
                            assert isinstance(genotypes[index], Genotype)
                            reg_genotype_likelihoods.append(regularized)
                        variant_table.set_genotype_likelihoods_of(
                            sample,
                            [PhredGenotypeLikelihoods(list(gl)) for gl in reg_genotype_likelihoods],
                        )
                        variant_table.set_genotypes_of(sample, genotypes)
            else:
                # use uniform genotype likelihoods for all individuals
                for sample in samples:
                    variant_table.set_genotype_likelihoods_of(
                        sample,
                        [PhredGenotypeLikelihoods([1 / 3, 1 / 3, 1 / 3])] * len(positions),
                    )

            # if desired, output the priors in separate vcf
            if prioroutput is not None:
                prior_vcf_writer.write_genotypes(chromosome, variant_table, indels)

            # Iterate over all families to process, i.e. a separate DP table is created
            # for each family.
            for representative_sample, family in sorted(families.items()):
                if len(family) == 1:
                    logger.info("---- Processing individual %s", representative_sample)
                else:
                    logger.info("---- Processing family with individuals: %s", ",".join(family))
                max_coverage_per_sample = max(1, max_coverage // len(family))
                logger.info("Using maximum coverage per sample of %dX", max_coverage_per_sample)
                trios = family_trios[representative_sample]
                assert (len(family) == 1) or (len(trios) > 0)

                # Get the reads belonging to each sample
                readsets = dict()
                for sample in family:
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            variant_table.variants,
                            sample,
                        )

                    with timers("select"):
                        readset = readset.subset(
                            [i for i, read in enumerate(readset) if len(read) >= 2]
                        )
                        logger.info(
                            "Kept %d reads that cover at least two variants each",
                            len(readset),
                        )
                        selected_reads = select_reads(
                            readset,
                            max_coverage_per_sample,
                            preferred_source_ids=vcf_source_ids,
                        )
                    readsets[sample] = selected_reads

                # Merge reads into one ReadSet (note that each Read object
                # knows the sample it originated from).
                all_reads = ReadSet()
                for sample, readset in readsets.items():
                    for read in readset:
                        assert read.is_sorted(), "Add a read.sort() here"
                        all_reads.add(read)

                all_reads.sort()

                # Determine which variants can (in principle) be phased
                accessible_positions = sorted(all_reads.get_positions())
                logger.info(
                    "Variants covered by at least one phase-informative "
                    "read in at least one individual after read selection: %d",
                    len(accessible_positions),
                )

                # Create Pedigree
                pedigree = Pedigree(numeric_sample_ids)
                for sample in family:
                    # genotypes are assumed to be unknown, so ignore information that
                    # might already be present in the input vcf
                    all_genotype_likelihoods = variant_table.genotype_likelihoods_of(sample)
                    genotype_l = [
                        all_genotype_likelihoods[var_to_pos[a_p]] for a_p in accessible_positions
                    ]
                    pedigree.add_individual(
                        sample,
                        [Genotype([]) for _ in accessible_positions],
                        genotype_l,
                    )
                for trio in trios:
                    pedigree.add_relationship(
                        father_id=trio.father,
                        mother_id=trio.mother,
                        child_id=trio.child,
                    )

                recombination_costs = recombination_cost_computer.compute(accessible_positions)

                # Finally, run genotyping algorithm
                with timers("genotyping"):
                    problem_name = "genotyping"
                    logger.info(
                        "Genotype %d sample%s by solving the %s problem ...",
                        len(family),
                        "s" if len(family) > 1 else "",
                        problem_name,
                    )
                    forward_backward_table = GenotypeDPTable(
                        numeric_sample_ids,
                        all_reads,
                        recombination_costs,
                        pedigree,
                        accessible_positions,
                    )
                    # store results
                    for s in family:
                        likelihood_list = variant_table.genotype_likelihoods_of(s)
                        genotypes_list = variant_table.genotypes_of(s)

                        for pos, position in enumerate(accessible_positions):
                            likelihoods = forward_backward_table.get_genotype_likelihoods(s, pos)

                            # compute genotypes from likelihoods and store information
                            geno = determine_genotype(likelihoods, gt_prob)
                            assert isinstance(geno, Genotype)
                            # The DP table is indexed by accessible position,
                            # the variant table by the index of the variant
                            table_index = var_to_pos[position]
                            genotypes_list[table_index] = geno
                            likelihood_list[table_index] = likelihoods

                        variant_table.set_genotypes_of(s, genotypes_list)
                        variant_table.set_genotype_likelihoods_of(s, likelihood_list)

            with timers("write_vcf"):
                logger.info("======== Writing VCF")
                vcf_writer.write_genotypes(chromosome, variant_table, indels)
                logger.info("Done writing VCF")

            logger.debug("Chromosome %r finished", chromosome)

    logger.info("\n== SUMMARY ==")
    total_time = timers.total()
    log_memory_usage()
    logger.info(
        "Time spent reading BAM: %6.1f s",
        timers.elapsed("read_bam"),
    )
    logger.info(
        "Time spent parsing VCF: %6.1f s",
        timers.elapsed("parse_vcf"),
    )
    if show_phase_vcfs:
        logger.info(
            "Time spent parsing input phasings from VCFs: %6.1f s",
            timers.elapsed("parse_phasing_vcfs"),
        )
    logger.info("Time spent selecting reads: %6.1f s", timers.elapsed("select"))
    logger.info(
        "Time spent genotyping: %6.1f s",
        timers.elapsed("genotyping"),
    )
    logger.info(
        "Time spent writing VCF: %6.1f s",
        timers.elapsed("write_vcf"),
    )
    logger.info(
        "Time spent on rest: %6.1f s",
        total_time - timers.sum(),
    )
    logger.info("Total elapsed time: %6.1f s", total_time)
def run_haplotag(
    variant_file,
    alignment_file,
    output=None,
    reference=None,
    regions=None,
    ignore_linked_read=False,
    given_samples=None,
    linked_read_distance_cutoff=50000,
    ignore_read_groups=False,
    haplotag_list=None,
    tag_supplementary=False,
):
    """
    Copy the alignments of alignment_file to output, tagging them by haplotype.

    The HP, PC and PS tags of every alignment are reset first. Alignments on
    chromosomes that have variants and that are not ignored by ignore_read()
    are then passed to attempt_add_phase_information(). In addition, one
    tab-separated line (read name, haplotype name, phase set, chromosome) is
    written to the haplotag list for each alignment that is neither secondary
    nor supplementary.

    variant_file -- path to the phased input VCF
    alignment_file -- path to the alignment file (opened with require_index=True)
    output -- destination handed to open_output_alignment_file
    reference -- reference handed to PhasedInputReader and the output writer
    regions -- user-specified regions, normalized with normalize_user_regions
    ignore_linked_read, linked_read_distance_cutoff -- handed to the helpers
        that prepare and assign the haplotype information
    given_samples, ignore_read_groups -- determine which samples are used
    haplotag_list -- destination handed to open_haplotag_writer
    tag_supplementary -- handed to ignore_read

    Raises CommandLineError if one of the input files cannot be loaded.
    """
    stopwatch = StageTimer()
    stopwatch.start("haplotag-run")

    with ExitStack() as stack:
        stopwatch.start("haplotag-init")

        try:
            variant_source = VcfReader(variant_file, indels=True, phases=True)
            vcf_reader = stack.enter_context(variant_source)
        except OSError as err:
            message = "Error while loading variant file {}: {}".format(variant_file, err)
            raise CommandLineError(message)

        use_vcf_samples = compute_variant_file_samples_to_use(
            vcf_reader.samples, given_samples, ignore_read_groups
        )

        try:
            alignment_source = pysam.AlignmentFile(alignment_file, "rb", require_index=True)
            bam_reader = stack.enter_context(alignment_source)
        except OSError as err:
            message = "Error while loading alignment file {}: {}".format(alignment_file, err)
            raise CommandLineError(message)

        # Sample compatibility with the VCF is verified here as well
        shared_samples = compute_shared_samples(bam_reader, ignore_read_groups, use_vcf_samples)

        # The user may have restricted processing to parts of the chromosomes
        user_regions = normalize_user_regions(regions, bam_reader.references)

        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                [alignment_file],
                reference,
                NumericSampleIds(),
                ignore_read_groups,
                indels=False,
            )
        )
        output_header = bam_reader.header.to_dict()
        bam_writer = stack.enter_context(
            open_output_alignment_file(output, reference, md5_of(variant_file), output_header)
        )
        haplotag_writer = stack.enter_context(open_haplotag_writer(haplotag_list))

        stopwatch.stop("haplotag-init")
        logger.debug(
            "All input/output files initialized (time: {})".format(
                stopwatch.elapsed("haplotag-init")
            )
        )

        stopwatch.start("haplotag-process")
        n_alignments = n_tagged = n_multiple_phase_sets = 0

        for chrom, chrom_regions in user_regions.items():
            logger.debug("Processing chromosome {}".format(chrom))

            # Chromosomes without any alignment are skipped. That way the BAM may
            # contain chromosomes that are missing from the VCF, provided that
            # they are not actually used.
            if not any(True for _ in bam_reader.fetch(contig=chrom)):
                continue

            try:
                variant_table = load_chromosome_variants(vcf_reader, chrom, chrom_regions)
            except VcfError as e:
                raise CommandLineError(str(e))

            # Stay None when the chromosome has no variants
            bx_tag_to_haplotype = None
            read_to_haplotype = None
            if variant_table is not None:
                logger.debug("Preparing haplotype information")
                haplotag_information = prepare_haplotag_information(
                    variant_table,
                    shared_samples,
                    phased_input_reader,
                    chrom_regions,
                    ignore_linked_read,
                    linked_read_distance_cutoff,
                )
                bx_tag_to_haplotype, read_to_haplotype, n_mult = haplotag_information
                n_multiple_phase_sets += n_mult

            for start, end in chrom_regions:
                logger.debug("Iterating chromosome regions")
                for alignment in bam_reader.fetch(contig=chrom, start=start, stop=end):
                    n_alignments += 1
                    haplotype_name, phaseset = "none", "none"
                    for tag_name in ("HP", "PC", "PS"):
                        alignment.set_tag(tag_name, value=None)

                    # Alignments on chromosomes without variants and ignored
                    # reads are written to the output without further changes
                    if variant_table is not None and not ignore_read(
                        alignment, tag_supplementary
                    ):
                        is_tagged, haplotype_name, phaseset = attempt_add_phase_information(
                            alignment,
                            read_to_haplotype,
                            bx_tag_to_haplotype,
                            linked_read_distance_cutoff,
                            ignore_linked_read,
                        )
                        n_tagged += is_tagged

                    bam_writer.write(alignment)

                    is_primary = not (alignment.is_secondary or alignment.is_supplementary)
                    if is_primary:
                        row = (alignment.query_name, haplotype_name, phaseset, chrom)
                        print(*row, sep="\t", file=haplotag_writer)

                    if n_alignments % 100000 == 0:
                        logger.debug("Processed {} alignment records.".format(n_alignments))

        stopwatch.stop("haplotag-process")
        logger.debug(
            "Processing complete (time: {})".format(stopwatch.elapsed("haplotag-process"))
        )

    stopwatch.stop("haplotag-run")

    logger.info("\n== SUMMARY ==")
    summary_lines = (
        ("Total alignments processed: %12d", n_alignments),
        ("Alignments that could be tagged: %12d", n_tagged),
        ("Alignments spanning multiple phase sets: %12d", n_multiple_phase_sets),
    )
    for template, count in summary_lines:
        logger.info(template, count)
    logger.info(
        "haplotag - total processing time: {}".format(stopwatch.elapsed("haplotag-run"))
    )
def run_whatshap(
    phase_input_files,
    variant_file,
    reference=None,
    output=sys.stdout,
    samples=None,
    chromosomes=None,
    ignore_read_groups=False,
    indels=True,
    mapping_quality=20,
    read_merging=False,
    read_merging_error_rate=0.15,
    read_merging_max_error_rate=0.25,
    read_merging_positive_threshold=1000000,
    read_merging_negative_threshold=1000,
    max_coverage=15,
    full_genotyping=False,
    distrust_genotypes=False,
    include_homozygous=False,
    ped=None,
    recombrate=1.26,
    genmap=None,
    genetic_haplotyping=True,
    recombination_list_filename=None,
    tag="PS",
    read_list_filename=None,
    gl_regularizer=None,
    gtchange_list_filename=None,
    default_gq=30,
    write_command_line_header=True,
    use_ped_samples=False,
    algorithm="whatshap",
):
    """
    Run WhatsHap.

    phase_input_files -- list of paths to BAM/CRAM/VCF files
    variant_file -- path to input VCF
    reference -- path to reference FASTA
    output -- path to output VCF or a file-like object
    samples -- names of samples to phase. an empty list means: phase all samples
    chromosomes -- names of chromosomes to phase. an empty list means: phase all chromosomes
    ignore_read_groups -- handed to PhasedInputReader; on a multi-sample VCF
        this requires samples to be given
    indels -- handed to the VCF reader and the read reader
    mapping_quality -- discard reads below this mapping quality
    read_merging -- whether or not to merge reads
    read_merging_error_rate -- probability that a nucleotide is wrong
    read_merging_max_error_rate -- max error rate on edge of merge graph considered
    read_merging_positive_threshold -- threshold on the ratio of the two probabilities
    read_merging_negative_threshold -- threshold on the opposite ratio of positive threshold
    max_coverage -- coverage budget; divided evenly among the members of a family
    full_genotyping -- genotype all samples from the reads before phasing;
        switches on distrust_genotypes and include_homozygous
    distrust_genotypes -- read genotype likelihoods from the VCF and allow
        genotypes to change during phasing
    include_homozygous -- handed to find_phaseable_variants
    ped -- path to PED file; not supported by the hapchat algorithm
    recombrate -- uniform recombination rate in cM/Mb (used unless both ped and genmap are given)
    genmap -- genetic map; only used together with ped
    genetic_haplotyping -- in ped mode, merge disconnected blocks based on genotype status
    recombination_list_filename -- filename to write putative recombination events to
    tag -- How to store phasing info in the VCF, can be 'PS' or 'HP'
    read_list_filename -- name of file to write list of used reads to
    algorithm -- algorithm to use, can be 'whatshap' or 'hapchat'
    gl_regularizer -- float to be passed as regularization constant to GenotypeLikelihoods.as_phred
    gtchange_list_filename -- filename to write list of changed genotypes to
    default_gq -- genotype likelihood to be used when GL or PL not available
    write_command_line_header -- whether to add a ##commandline header to the output VCF
    use_ped_samples -- if true (and ped is given), phase only the samples of the PED file

    Raises CommandLineError on invalid option combinations or unreadable input.
    """

    if algorithm == "hapchat" and ped is not None:
        raise CommandLineError("The hapchat algorithm cannot do pedigree phasing")

    timers = StageTimer()
    logger.info(
        "This is WhatsHap %s running under Python %s",
        __version__,
        platform.python_version(),
    )
    if full_genotyping:
        distrust_genotypes = True
        include_homozygous = True
    numeric_sample_ids = NumericSampleIds()
    if write_command_line_header:
        command_line = "(whatshap {}) {}".format(__version__, " ".join(sys.argv[1:]))
    else:
        command_line = None

    if read_merging:
        read_merger = ReadMerger(
            read_merging_error_rate,
            read_merging_max_error_rate,
            read_merging_positive_threshold,
            read_merging_negative_threshold,
        )
    else:
        read_merger = DoNothingReadMerger()

    with ExitStack() as stack:
        try:
            vcf_writer = stack.enter_context(
                PhasedVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=output,
                    tag=tag,
                )
            )
        except (OSError, VcfError) as e:
            raise CommandLineError(e) from e

        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                reference,
                numeric_sample_ids,
                ignore_read_groups,
                mapq_threshold=mapping_quality,
                indels=indels,
            )
        )
        show_phase_vcfs = phased_input_reader.has_vcfs

        # Only read genotype likelihoods from VCFs when distrusting genotypes
        vcf_reader = stack.enter_context(
            VcfReader(variant_file, indels=indels, genotype_likelihoods=distrust_genotypes)
        )

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used."
            )
        if not samples:
            samples = vcf_reader.samples

        # if --use-ped-samples is set, use only samples from PED file
        if ped and use_ped_samples:
            samples = PedReader(ped).samples()

        raise_if_any_sample_not_in_vcf(vcf_reader, samples)

        if ped and genmap:
            logger.info(
                "Using region-specific recombination rates from genetic map %s.",
                genmap,
            )
            try:
                recombination_cost_computer = GeneticMapRecombinationCostComputer(genmap)
            except ParseError as e:
                raise CommandLineError(e) from e
        else:
            if ped:
                logger.info("Using uniform recombination rate of %g cM/Mb.", recombrate)
            recombination_cost_computer = UniformRecombinationCostComputer(recombrate)

        samples = frozenset(samples)
        families, family_trios = setup_families(samples, ped, numeric_sample_ids, max_coverage)

        read_list = None
        if read_list_filename:
            read_list = stack.enter_context(ReadList(read_list_filename))
            if algorithm == "hapchat":
                logger.warning(
                    "On which haplotype a read occurs in the inferred solution is not yet "
                    "implemented in hapchat, and so the corresponding column in the "
                    "read list file contains no information about this"
                )

        with timers("parse_phasing_vcfs"):
            # TODO should this be done in PhasedInputReader.__init__?
            phased_input_reader.read_vcfs()

        for variant_table in timers.iterate("parse_vcf", vcf_reader):
            chromosome = variant_table.chromosome
            if (not chromosomes) or (chromosome in chromosomes):
                logger.info("======== Working on chromosome %r", chromosome)
            else:
                logger.info(
                    "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                    chromosome,
                )
                with timers("write_vcf"):
                    superreads, components = dict(), dict()
                    vcf_writer.write(chromosome, superreads, components)
                continue

            if full_genotyping:
                positions = [v.position for v in variant_table.variants]
                for sample in samples:
                    logger.info("---- Initial genotyping of %s", sample)
                    with timers("read_bam"):
                        bam_sample = None if ignore_read_groups else sample
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            variant_table.variants,
                            bam_sample,
                            read_vcf=False,
                        )
                        readset.sort()  # TODO can be removed
                        genotypes, genotype_likelihoods = compute_genotypes(readset, positions)
                        variant_table.set_genotypes_of(sample, genotypes)
                        variant_table.set_genotype_likelihoods_of(
                            sample,
                            [GenotypeLikelihoods(gl) for gl in genotype_likelihoods],
                        )

            # These two variables hold the phasing results for all samples
            superreads, components = dict(), dict()

            # Iterate over all families to process, i.e. a separate DP table is created
            # for each family.
            # TODO: Can the body of this loop be factored out into a phase_family function?
            for representative_sample, family in sorted(families.items()):
                if len(family) == 1:
                    logger.info("---- Processing individual %s", representative_sample)
                else:
                    logger.info("---- Processing family with individuals: %s", ",".join(family))
                max_coverage_per_sample = max(1, max_coverage // len(family))
                logger.info("Using maximum coverage per sample of %dX", max_coverage_per_sample)
                trios = family_trios[representative_sample]
                assert len(family) == 1 or len(trios) > 0

                homozygous_positions, phasable_variant_table = find_phaseable_variants(
                    family, include_homozygous, trios, variant_table
                )

                # Get the reads belonging to each sample
                readsets = dict()  # TODO this could become a list
                for sample in family:
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            phasable_variant_table.variants,
                            sample,
                        )

                    # TODO: Read selection done w.r.t. all variants, where using heterozygous
                    # variants only would probably give better results.
                    with timers("select"):
                        readset = readset.subset(
                            [i for i, read in enumerate(readset) if len(read) >= 2]
                        )
                        logger.info(
                            "Kept %d reads that cover at least two variants each",
                            len(readset),
                        )
                        merged_reads = read_merger.merge(readset)
                        selected_reads = select_reads(
                            merged_reads,
                            max_coverage_per_sample,
                            preferred_source_ids=vcf_source_ids,
                        )

                    readsets[sample] = selected_reads
                    if len(family) == 1 and not distrust_genotypes:
                        # When having a pedigree (len(family) > 1), blocks are also merged after
                        # phasing based on the pedigree information and these statistics are not
                        # so useful. When distrust_genotypes, genotypes can change during phasing
                        # and so can the block structure. So don't print these stats in those cases
                        log_best_case_phasing_info(readset, selected_reads)

                all_reads = merge_readsets(readsets)

                # Determine which variants can (in principle) be phased
                accessible_positions = sorted(all_reads.get_positions())
                logger.info(
                    "Variants covered by at least one phase-informative "
                    "read in at least one individual after read selection: %d",
                    len(accessible_positions),
                )
                if len(family) > 1 and genetic_haplotyping:
                    # In case of genetic haplotyping, also retain all positions homozygous
                    # in at least one individual (because they might be phased based on genotypes)
                    accessible_positions = sorted(
                        set(accessible_positions).union(homozygous_positions)
                    )
                    logger.info(
                        "Variants either covered by phase-informative read or homozygous "
                        "in at least one individual: %d",
                        len(accessible_positions),
                    )

                # Keep only accessible positions
                phasable_variant_table.subset_rows_by_position(accessible_positions)
                assert len(phasable_variant_table.variants) == len(accessible_positions)

                pedigree = create_pedigree(
                    default_gq,
                    distrust_genotypes,
                    family,
                    gl_regularizer,
                    numeric_sample_ids,
                    phasable_variant_table,
                    trios,
                )
                recombination_costs = recombination_cost_computer.compute(accessible_positions)

                # Finally, run phasing algorithm
                with timers("phase"):
                    problem_name = "MEC" if len(family) == 1 else "PedMEC"
                    logger.info(
                        "Phasing %d sample%s by solving the %s problem ...",
                        len(family),
                        plural_s(len(family)),
                        problem_name,
                    )
                    if algorithm == "hapchat":
                        dp_table = HapChatCore(all_reads)
                    else:
                        dp_table = PedigreeDPTable(
                            all_reads,
                            recombination_costs,
                            pedigree,
                            distrust_genotypes,
                            accessible_positions,
                        )

                    superreads_list, transmission_vector = dp_table.get_super_reads()
                    optimal_cost = dp_table.get_optimal_cost()
                    logger.info("%s cost: %d", problem_name, optimal_cost)

                with timers("components"):
                    master_block = None
                    heterozygous_positions_by_sample = None
                    # If we distrusted genotypes, we need to re-determine which sites are
                    # homo-/heterozygous after phasing
                    if distrust_genotypes:
                        hom_in_any_sample = set()
                        heterozygous_positions_by_sample = {}
                        heterozygous_gts = frozenset({(0, 1), (1, 0)})
                        homozygous_gts = frozenset({(0, 0), (1, 1)})
                        # accessible_positions is a sorted list; build a set once so
                        # that the membership test below does not scan the list for
                        # every superread variant
                        accessible_position_set = set(accessible_positions)
                        for sample, sample_superreads in zip(family, superreads_list):
                            hets = set()
                            for v1, v2 in zip(*sample_superreads):
                                assert v1.position == v2.position
                                if v1.position not in accessible_position_set:
                                    continue
                                gt = (v1.allele, v2.allele)
                                if gt in heterozygous_gts:
                                    hets.add(v1.position)
                                elif gt in homozygous_gts:
                                    hom_in_any_sample.add(v1.position)
                            heterozygous_positions_by_sample[numeric_sample_ids[sample]] = hets
                        if len(family) > 1 and genetic_haplotyping:
                            master_block = sorted(hom_in_any_sample)
                    else:
                        if len(family) > 1 and genetic_haplotyping:
                            master_block = sorted(
                                set(homozygous_positions).intersection(set(accessible_positions))
                            )
                    overall_components = find_components(
                        accessible_positions,
                        all_reads,
                        master_block,
                        heterozygous_positions_by_sample,
                    )
                    n_phased_blocks = len(set(overall_components.values()))
                    logger.info("No. of phased blocks: %d", n_phased_blocks)
                    largest_component = find_largest_component(overall_components)
                    if len(largest_component) > 0:
                        logger.info(
                            "Largest component contains %d variants (%.1f%% of accessible variants) between position %d and %d",
                            len(largest_component),
                            len(largest_component) * 100.0 / len(accessible_positions),
                            largest_component[0] + 1,
                            largest_component[-1] + 1,
                        )

                if recombination_list_filename:
                    n_recombinations = write_recombination_list(
                        recombination_list_filename,
                        chromosome,
                        accessible_positions,
                        overall_components,
                        recombination_costs,
                        transmission_vector,
                        trios,
                    )
                    logger.info(
                        "Total no. of detected recombination events: %d",
                        n_recombinations,
                    )

                # Superreads in superreads_list are in the same order as individuals were added to the pedigree
                for sample, sample_superreads in zip(family, superreads_list):
                    superreads[sample] = sample_superreads
                    assert len(sample_superreads) == 2
                    assert (
                        sample_superreads[0].sample_id
                        == sample_superreads[1].sample_id
                        == numeric_sample_ids[sample]
                    )
                    # identical for all samples
                    components[sample] = overall_components

                if read_list:
                    read_list.write(
                        all_reads,
                        dp_table.get_optimal_partitioning(),
                        components,
                        numeric_sample_ids,
                    )

            with timers("write_vcf"):
                logger.info("======== Writing VCF")
                changed_genotypes = vcf_writer.write(chromosome, superreads, components)
                logger.info("Done writing VCF")
            if changed_genotypes:
                assert distrust_genotypes
                logger.info("Changed %d genotypes while writing VCF", len(changed_genotypes))

            if gtchange_list_filename:
                logger.info("Writing list of changed genotypes to %r", gtchange_list_filename)
                write_changed_genotypes(gtchange_list_filename, changed_genotypes)

            logger.debug("Chromosome %r finished", chromosome)

    log_time_and_memory_usage(timers, show_phase_vcfs=show_phase_vcfs)