Python PedigreeDPTable примеры использования

Язык программирования: Python

Пространство имен/Пакет: whatshap.core

Класс/Тип: PedigreeDPTable

Примеров на hotexamples.com: 11

Python PedigreeDPTable - 11 примеров найдено. Это лучшие примеры Python кода для whatshap.core.PedigreeDPTable, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

PedigreeDPTable(11)

get_super_reads(4)

get_optimal_cost(3)

get_optimal_partitioning(2)

Пример #1

Показать файл

def bipartition(reads):
    """Phase the reads of one individual and return phasing plus read partition.

    Every variant position reported by ``reads.get_positions()`` is given a
    heterozygous genotype and no genotype likelihoods. A ``PedigreeDPTable``
    is built for a pedigree holding that single individual, and the MEC
    score and the mean of the partition vector are reported via ``eprint``.

    Returns a ``(phasing, partition)`` tuple: the super reads and the optimal
    read partitioning obtained from the DP table.

    NOTE(review): ``readset_length`` is not assigned anywhere in this
    function; presumably it is a module-level name -- confirm before reuse.
    """
    n_variants = len(reads.get_positions())

    # One individual, heterozygous (=1) everywhere, without likelihoods.
    het_genotypes = canonic_index_list_to_biallelic_gt_list([1] * n_variants)
    no_likelihoods = [None] * n_variants
    pedigree = Pedigree(NumericSampleIds())
    pedigree.add_individual('individual0', het_genotypes, no_likelihoods)

    # The DP table requires recombination costs; with a single individual
    # their values do not matter.
    unit_recomb_costs = [1] * n_variants

    # Core phasing step: build the DP table and read off the solution.
    dp_table = PedigreeDPTable(
        reads,
        unit_recomb_costs,
        pedigree,
        distrust_genotypes=False,
    )
    phasing, transmission_vector = dp_table.get_super_reads()

    mec_score = dp_table.get_optimal_cost()
    eprint("MEC Score:", mec_score)
    normalised_score = float(mec_score) / float(readset_length)
    eprint("MEC Score / readset length:", normalised_score)

    # The bipartition of the reads is returned as well.
    partition = dp_table.get_optimal_partitioning()
    partition_fraction = sum(partition) / float(len(partition))
    eprint("partition fraction:", partition_fraction)

    return phasing, partition

Пример #2

Показать файл

def test_phase_empty_trio():
	"""Smoke test: phasing a trio without reads or variants must not fail.

	Three individuals with empty genotype lists are linked by one
	relationship; the test only checks that ``get_super_reads`` returns one
	entry per individual plus a transmission vector.
	"""
	empty_reads = ReadSet()
	no_recomb_costs = []
	pedigree = Pedigree(NumericSampleIds())
	for sample_name in ('individual0', 'individual1', 'individual2'):
		pedigree.add_individual(sample_name, [])
	pedigree.add_relationship('individual0', 'individual1', 'individual2')
	dp_table = PedigreeDPTable(empty_reads, no_recomb_costs, pedigree)
	superreads_per_sample, transmission_vector = dp_table.get_super_reads()
	# Unpacking fails unless exactly three super-read entries come back.
	superreadsm, superreadsf, superreadsc = superreads_per_sample

Пример #3

Показать файл

def phase_MAV(reads, n_alleles, all_het, genos, genotypes, weights=None):
    """Phase a single individual once per entry of ``all_het``.

    ``reads`` and ``n_alleles`` are turned into a read set with
    ``string_to_readset``. For each flag in ``all_het`` a fresh one-person
    pedigree is built with the given ``genotypes``; genotype likelihoods are
    ``None`` when the flag is true and ``PhredGenotypeLikelihoods(genos)``
    otherwise, and genotypes are distrusted exactly when the flag is false.

    Returns ``(superreads_list, transmission_vector, cost)`` of the LAST
    iteration only. These names are bound inside the loop, so an empty
    ``all_het`` makes the return statement fail. ``weights`` is accepted but
    not used.
    """
    readset = string_to_readset(reads, n_alleles)
    n_positions = len(readset.get_positions())

    for all_heterozygous in all_het:
        # Unit recombination costs; recombination should not occur.
        unit_costs = [1] * n_positions
        pedigree = Pedigree(NumericSampleIds())

        if all_heterozygous:
            likelihood = None
        else:
            likelihood = PhredGenotypeLikelihoods(genos)
        # The same likelihood entry is repeated for every position.
        genotype_likelihoods = [likelihood] * n_positions

        pedigree.add_individual('individual0', genotypes, genotype_likelihoods)

        dp_table = PedigreeDPTable(
            readset,
            unit_costs,
            pedigree,
            distrust_genotypes=not all_heterozygous,
        )
        superreads_list, transmission_vector = dp_table.get_super_reads()
        cost = dp_table.get_optimal_cost()

    return superreads_list, transmission_vector, cost

Пример #4

Показать файл

Файл: test_phasing.py Проект: pontushojer/whatshap

def test_phase_empty_readset(algorithm):
    """Smoke test: requesting super reads for an empty read set must not fail.

    ``algorithm`` selects the solver: ``"hapchat"`` uses ``HapChatCore``,
    anything else uses ``PedigreeDPTable`` with a one-person pedigree that
    has two variants.
    """
    empty_reads = ReadSet()
    unit_costs = [1, 1]
    het_genotypes = canonic_index_list_to_biallelic_gt_list([1, 1])
    pedigree = Pedigree(NumericSampleIds())
    no_likelihoods = [None, None]
    pedigree.add_individual("individual0", het_genotypes, no_likelihoods)

    use_hapchat = algorithm == "hapchat"
    dp_table = (
        HapChatCore(empty_reads)
        if use_hapchat
        else PedigreeDPTable(empty_reads, unit_costs, pedigree)
    )

    # Only the absence of an exception matters; the result is discarded.
    dp_table.get_super_reads()

Пример #5

Показать файл

Файл: testverification.py Проект: adamnovak/gwhatshap

def verify(rs, all_heterozygous=False):
    """Phase read set ``rs`` as one individual and verify the DP table.

    All genotypes are set to 1. When ``all_heterozygous`` is true no genotype
    likelihoods are given and genotypes are trusted; otherwise every position
    gets ``PhredGenotypeLikelihoods(0, 0, 0)`` and genotypes are distrusted.
    The resulting table is handed to ``verify_mec_score_and_partitioning``.
    """
    n_positions = len(rs.get_positions())

    # Unit recombination costs; recombination should not occur.
    unit_costs = [1] * n_positions
    pedigree = Pedigree(NumericSampleIds())

    if all_heterozygous:
        likelihood = None
    else:
        likelihood = PhredGenotypeLikelihoods(0, 0, 0)
    genotype_likelihoods = [likelihood] * n_positions

    # Every genotype is heterozygous.
    het_genotypes = [1] * n_positions
    pedigree.add_individual('individual0', het_genotypes, genotype_likelihoods)

    dp_table = PedigreeDPTable(
        rs,
        unit_costs,
        pedigree,
        distrust_genotypes=not all_heterozygous,
    )
    verify_mec_score_and_partitioning(dp_table, rs)

Пример #6

Показать файл

Файл: test_phasing.py Проект: sarangian/WHdenovo

def test_phase_empty_readset(algorithm):
    """Smoke test: requesting super reads for an empty read set must not fail.

    ``algorithm`` equal to ``'hapchat'`` selects ``HapChatCore``; any other
    value selects ``PedigreeDPTable`` with a one-person pedigree holding two
    genotypes of value 1.
    """
    empty_reads = ReadSet()
    unit_costs = [1, 1]
    het_genotypes = [1, 1]
    pedigree = Pedigree(NumericSampleIds())
    no_likelihoods = [None, None]
    pedigree.add_individual('individual0', het_genotypes, no_likelihoods)

    if algorithm == 'hapchat':
        solver = HapChatCore(empty_reads)
    else:
        solver = PedigreeDPTable(empty_reads, unit_costs, pedigree)

    # The call itself is the test; its result is not inspected.
    solver.get_super_reads()

Пример #7

Показать файл

Файл: test_verification.py Проект: pontushojer/whatshap

def verify(rs, all_heterozygous=False):
    """Phase read set ``rs`` as one individual and verify the DP table.

    Each position gets the genotype ``canonic_index_to_biallelic_gt(1)``.
    When ``all_heterozygous`` is true no genotype likelihoods are given and
    genotypes are trusted; otherwise every position gets
    ``PhredGenotypeLikelihoods([0, 0, 0])`` and genotypes are distrusted.
    The resulting table is handed to ``verify_mec_score_and_partitioning``.
    """
    n_positions = len(rs.get_positions())

    # Unit recombination costs; recombination should not occur.
    unit_costs = [1] * n_positions
    pedigree = Pedigree(NumericSampleIds())

    if all_heterozygous:
        likelihood = None
    else:
        likelihood = PhredGenotypeLikelihoods([0, 0, 0])
    genotype_likelihoods = [likelihood] * n_positions

    # Every genotype is heterozygous.
    het_genotypes = [canonic_index_to_biallelic_gt(1) for _ in range(n_positions)]
    pedigree.add_individual("individual0", het_genotypes, genotype_likelihoods)

    dp_table = PedigreeDPTable(
        rs,
        unit_costs,
        pedigree,
        distrust_genotypes=not all_heterozygous,
    )
    verify_mec_score_and_partitioning(dp_table, rs)

Пример #8

Показать файл

def phase_pedigree(reads, recombcost, pedigree, distrust_genotypes=False, positions=None):
	"""Phase ``reads`` for ``pedigree`` and print the solution.

	``reads`` is converted with ``string_to_readset_pedigree``; the remaining
	arguments are passed unchanged to ``PedigreeDPTable``. Every super read,
	the optimal cost, the transmission vector and the optimal partitioning
	are printed to stdout.

	Returns ``(superreads_list, transmission_vector, cost)``.
	"""
	readset = string_to_readset_pedigree(reads)
	dp_table = PedigreeDPTable(
		readset, recombcost, pedigree, distrust_genotypes, positions
	)
	superreads_list, transmission_vector = dp_table.get_super_reads()
	cost = dp_table.get_optimal_cost()

	# Dump the super reads, one entry of superreads_list after another.
	for sample_superreads in superreads_list:
		for superread in sample_superreads:
			print(superread)

	# The cost is queried from the table a second time for the printout.
	printed_cost = dp_table.get_optimal_cost()
	print('Cost:', printed_cost)
	print('Transmission vector:', transmission_vector)
	partition = dp_table.get_optimal_partitioning()
	print('Partition:', partition)

	return superreads_list, transmission_vector, cost

Пример #9

Показать файл

def run_whatshap(
    phase_input_files: List[str],
    variant_file: str,
    reference: Union[None, bool, str] = False,
    output: TextIO = sys.stdout,
    samples: Optional[List[str]] = None,
    chromosomes: Optional[List[str]] = None,
    ignore_read_groups: bool = False,
    indels: bool = True,
    mapping_quality: int = 20,
    read_merging: bool = False,
    read_merging_error_rate: float = 0.15,
    read_merging_max_error_rate: float = 0.25,
    read_merging_positive_threshold: int = 1000000,
    read_merging_negative_threshold: int = 1000,
    max_coverage: int = 15,
    distrust_genotypes: bool = False,
    include_homozygous: bool = False,
    ped: Optional[str] = None,
    recombrate: float = 1.26,
    genmap: Optional[str] = None,
    genetic_haplotyping: bool = True,
    recombination_list_filename: Optional[str] = None,
    tag: str = "PS",
    read_list_filename: Optional[str] = None,
    gl_regularizer: Optional[float] = None,
    gtchange_list_filename: Optional[str] = None,
    default_gq: int = 30,
    write_command_line_header: bool = True,
    use_ped_samples: bool = False,
    algorithm: str = "whatshap",
) -> None:
    """
    Run WhatsHap.

    For every chromosome yielded by the VCF reader and every family, the
    reads are loaded and selected, a pedigree and recombination costs are
    built, the phasing algorithm is run, and the resulting super reads and
    components are written through a PhasedVcfWriter.

    phase_input_files -- list of paths to BAM/CRAM/VCF files
    variant_file -- path to input VCF
    reference -- path to reference FASTA. If False: skip realignment. If None: complain if reference needed.
    output -- path to output VCF or a file-like object
    samples -- names of samples to phase. an empty list means: phase all samples
    chromosomes -- names of chromosomes to phase. an empty list means: phase all chromosomes
    ignore_read_groups -- passed to PhasedInputReader; on a VCF with more than
        one sample this requires samples to be given
    indels -- passed to the VCF writer, the VCF reader and the PhasedInputReader
    mapping_quality -- discard reads below this mapping quality
    read_merging -- whether or not to merge reads
    read_merging_error_rate -- probability that a nucleotide is wrong
    read_merging_max_error_rate -- max error rate on edge of merge graph considered
    read_merging_positive_threshold -- threshold on the ratio of the two probabilities
    read_merging_negative_threshold -- threshold on the opposite ratio of positive threshold
    max_coverage -- coverage budget; divided by the family size (minimum 1)
        to obtain the per-sample coverage used for read selection
    distrust_genotypes -- if set, genotype likelihoods are read from the VCF
        and the flag is passed on to pedigree creation and the DP table
    include_homozygous -- passed to find_phaseable_variants
    ped -- PED file; used for the sample list (with use_ped_samples), the
        family setup and the recombination cost computer
    recombrate -- passed to make_recombination_cost_computer
    genmap -- passed to make_recombination_cost_computer
    genetic_haplotyping -- in ped mode, merge disconnected blocks based on genotype status
    recombination_list_filename -- filename to write putative recombination events to
    tag -- How to store phasing info in the VCF, can be 'PS' or 'HP'
    read_list_filename -- name of file to write list of used reads to
    algorithm -- algorithm to use, can be 'whatshap' or 'hapchat'
    gl_regularizer -- float to be passed as regularization constant to GenotypeLikelihoods.as_phred
    gtchange_list_filename -- filename to write list of changed genotypes to
    default_gq -- genotype likelihood to be used when GL or PL not available
    write_command_line_header -- whether to add a ##commandline header to the output VCF
    use_ped_samples -- together with ped: use only the samples from the PED file

    Raises CommandLineError if hapchat is combined with a PED file, if the
    VCF writer cannot be set up (OSError/VcfError), if alignments are given
    but reference is None, or if ignore_read_groups is used on a multi-sample
    VCF without samples.
    """

    # hapchat has no pedigree support, so reject the combination up front.
    if algorithm == "hapchat" and ped is not None:
        raise CommandLineError(
            "The hapchat algorithm cannot do pedigree phasing")

    timers = StageTimer()
    logger.info(
        f"This is WhatsHap {__version__} running under Python {platform.python_version()}"
    )
    numeric_sample_ids = NumericSampleIds()
    # Command line handed to the VCF writer; None when the header is disabled.
    command_line: Optional[str]
    if write_command_line_header:
        command_line = "(whatshap {}) {}".format(__version__,
                                                 " ".join(sys.argv[1:]))
    else:
        command_line = None

    # Pick the read merging strategy once; both variants are used via .merge().
    read_merger: ReadMergerBase
    if read_merging:
        read_merger = ReadMerger(
            read_merging_error_rate,
            read_merging_max_error_rate,
            read_merging_positive_threshold,
            read_merging_negative_threshold,
        )
    else:
        read_merger = DoNothingReadMerger()

    # The ExitStack owns the writer, the readers and the optional read list.
    with ExitStack() as stack:
        try:
            vcf_writer = stack.enter_context(
                PhasedVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=output,
                    tag=tag,
                    indels=indels,
                ))
        except (OSError, VcfError) as e:
            raise CommandLineError(e)

        # reference=False (skip realignment) is passed on as None.
        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                None if reference is False else reference,
                numeric_sample_ids,
                ignore_read_groups,
                mapq_threshold=mapping_quality,
                indels=indels,
            ))
        show_phase_vcfs = phased_input_reader.has_vcfs

        if phased_input_reader.has_alignments and reference is None:
            raise CommandLineError(
                "A reference FASTA needs to be provided with -r/--reference; "
                "or use --no-reference at the expense of phasing quality.")

        # Only read genotype likelihoods from VCFs when distrusting genotypes
        vcf_reader = stack.enter_context(
            VcfReader(variant_file,
                      indels=indels,
                      genotype_likelihoods=distrust_genotypes))

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used.")
        # No explicit sample list: fall back to all samples of the VCF.
        if not samples:
            samples = vcf_reader.samples

        # if --use-ped-samples is set, use only samples from PED file
        if ped and use_ped_samples:
            samples = PedReader(ped).samples()

        raise_if_any_sample_not_in_vcf(vcf_reader, samples)

        recombination_cost_computer = make_recombination_cost_computer(
            ped, genmap, recombrate)

        families, family_trios = setup_families(samples, ped, max_coverage)
        # Only families/family_trios are used from here on.
        del samples
        for trios in family_trios.values():
            for trio in trios:
                # Ensure that all mentioned individuals have a numeric id
                _ = numeric_sample_ids[trio.child]

        read_list = None
        if read_list_filename:
            read_list = stack.enter_context(ReadList(read_list_filename))
            if algorithm == "hapchat":
                logger.warning(
                    "On which haplotype a read occurs in the inferred solution is not yet "
                    "implemented in hapchat, and so the corresponding column in the "
                    "read list file contains no information about this")

        with timers("parse_phasing_vcfs"):
            # TODO should this be done in PhasedInputReader.__init__?
            phased_input_reader.read_vcfs()

        # Bare annotations only; both names are assigned per chromosome below.
        superreads: Dict[str, ReadSet]
        components: Dict
        for variant_table in timers.iterate("parse_vcf", vcf_reader):
            chromosome = variant_table.chromosome
            if (not chromosomes) or (chromosome in chromosomes):
                logger.info("======== Working on chromosome %r", chromosome)
            else:
                logger.info(
                    "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                    chromosome,
                )
                # Write the chromosome with empty phasing results and move on.
                with timers("write_vcf"):
                    superreads, components = dict(), dict()
                    vcf_writer.write(chromosome, superreads, components)
                continue

            # These two variables hold the phasing results for all samples
            superreads, components = dict(), dict()

            # Iterate over all families to process, i.e. a separate DP table is created
            # for each family.
            # TODO: Can the body of this loop be factored out into a phase_family function?
            for representative_sample, family in sorted(families.items()):
                if len(family) == 1:
                    logger.info("---- Processing individual %s",
                                representative_sample)
                else:
                    logger.info("---- Processing family with individuals: %s",
                                ",".join(family))
                # Split the coverage budget across the family, never below 1.
                max_coverage_per_sample = max(1, max_coverage // len(family))
                logger.info("Using maximum coverage per sample of %dX",
                            max_coverage_per_sample)
                trios = family_trios[representative_sample]
                assert len(family) == 1 or len(trios) > 0

                homozygous_positions, phasable_variant_table = find_phaseable_variants(
                    family, include_homozygous, trios, variant_table)

                # Get the reads belonging to each sample
                readsets = dict()  # TODO this could become a list
                for sample in family:
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome, phasable_variant_table.variants,
                            sample)

                    # TODO: Read selection done w.r.t. all variants, where using heterozygous
                    #  variants only would probably give better results.
                    with timers("select"):
                        # Keep only reads that cover at least two variants.
                        readset = readset.subset([
                            i for i, read in enumerate(readset)
                            if len(read) >= 2
                        ])
                        logger.info(
                            "Kept %d reads that cover at least two variants each",
                            len(readset))
                        merged_reads = read_merger.merge(readset)
                        selected_reads = select_reads(
                            merged_reads,
                            max_coverage_per_sample,
                            preferred_source_ids=vcf_source_ids,
                        )

                    readsets[sample] = selected_reads
                    if len(family) == 1 and not distrust_genotypes:
                        # When having a pedigree (len(family) > 1), blocks are also merged after
                        # phasing based on the pedigree information and these statistics are not
                        # so useful. When distrust_genotypes, genotypes can change during phasing
                        # and so can the block structure. So don't print these stats in those cases
                        log_best_case_phasing_info(readset, selected_reads)

                all_reads = merge_readsets(readsets)

                # Determine which variants can (in principle) be phased
                accessible_positions = sorted(all_reads.get_positions())
                logger.info(
                    "Variants covered by at least one phase-informative "
                    "read in at least one individual after read selection: %d",
                    len(accessible_positions),
                )
                if len(family) > 1 and genetic_haplotyping:
                    # In case of genetic haplotyping, also retain all positions homozygous
                    # in at least one individual (because they might be phased based on genotypes)
                    accessible_positions = sorted(
                        set(accessible_positions).union(homozygous_positions))
                    logger.info(
                        "Variants either covered by phase-informative read or homozygous "
                        "in at least one individual: %d",
                        len(accessible_positions),
                    )

                # Keep only accessible positions
                phasable_variant_table.subset_rows_by_position(
                    accessible_positions)
                assert len(phasable_variant_table.variants) == len(
                    accessible_positions)

                pedigree = create_pedigree(
                    default_gq,
                    distrust_genotypes,
                    family,
                    gl_regularizer,
                    numeric_sample_ids,
                    phasable_variant_table,
                    trios,
                )
                recombination_costs = recombination_cost_computer.compute(
                    accessible_positions)

                # Finally, run phasing algorithm
                with timers("phase"):
                    problem_name = "MEC" if len(family) == 1 else "PedMEC"
                    logger.info(
                        "Phasing %d sample%s by solving the %s problem ...",
                        len(family),
                        plural_s(len(family)),
                        problem_name,
                    )

                    # hapchat works on the reads alone; the DP table also gets
                    # costs, pedigree and positions.
                    dp_table: Union[HapChatCore, PedigreeDPTable]
                    if algorithm == "hapchat":
                        dp_table = HapChatCore(all_reads)
                    else:
                        dp_table = PedigreeDPTable(
                            all_reads,
                            recombination_costs,
                            pedigree,
                            distrust_genotypes,
                            accessible_positions,
                        )

                    superreads_list, transmission_vector = dp_table.get_super_reads(
                    )
                    logger.info("%s cost: %d", problem_name,
                                dp_table.get_optimal_cost())

                with timers("components"):
                    overall_components = compute_overall_components(
                        accessible_positions,
                        all_reads,
                        distrust_genotypes,
                        family,
                        genetic_haplotyping,
                        homozygous_positions,
                        numeric_sample_ids,
                        superreads_list,
                    )
                    log_component_stats(overall_components,
                                        len(accessible_positions))

                if recombination_list_filename:
                    n_recombinations = write_recombination_list(
                        recombination_list_filename,
                        chromosome,
                        accessible_positions,
                        overall_components,
                        recombination_costs,
                        transmission_vector,
                        trios,
                    )
                    logger.info(
                        "Total no. of detected recombination events: %d",
                        n_recombinations)

                # Superreads in superreads_list are in the same order as individuals were added to the pedigree
                for sample, sample_superreads in zip(family, superreads_list):
                    superreads[sample] = sample_superreads
                    assert len(sample_superreads) == 2
                    assert (sample_superreads[0].sample_id ==
                            sample_superreads[1].sample_id ==
                            numeric_sample_ids[sample])
                    # identical for all samples
                    components[sample] = overall_components

                if read_list:
                    read_list.write(
                        all_reads,
                        dp_table.get_optimal_partitioning(),
                        components,
                        numeric_sample_ids,
                    )

            with timers("write_vcf"):
                logger.info("======== Writing VCF")
                changed_genotypes = vcf_writer.write(chromosome, superreads,
                                                     components)
                logger.info("Done writing VCF")
                if changed_genotypes:
                    # Genotypes are only expected to change when distrusted.
                    assert distrust_genotypes
                    logger.info("Changed %d genotypes while writing VCF",
                                len(changed_genotypes))

            # NOTE(review): this runs once per phased chromosome; whether
            # write_changed_genotypes appends or overwrites is not visible
            # here -- confirm.
            if gtchange_list_filename:
                logger.info("Writing list of changed genotypes to %r",
                            gtchange_list_filename)
                write_changed_genotypes(gtchange_list_filename,
                                        changed_genotypes)

            logger.debug("Chromosome %r finished", chromosome)

    log_time_and_memory_usage(timers, show_phase_vcfs=show_phase_vcfs)

Пример #10

Показать файл

Файл: test_phasing.py Проект: pontushojer/whatshap

def check_phasing_single_individual(reads, algorithm="whatshap", weights=None):
    """Phase one individual and compare the result against brute force.

    ``reads`` (with optional ``weights``) is converted by
    ``string_to_readset``. With ``algorithm == "hapchat"`` the read set is
    solved by ``HapChatCore`` and checked once. Otherwise ``PedigreeDPTable``
    is run four times: as a single individual and as a trio made of three
    individuals over the same read set, each with genotypes trusted
    (no likelihoods) and distrusted (``PhredGenotypeLikelihoods([0, 0, 0])``).
    Every solution is passed to ``compare_phasing_brute_force``.
    """
    # 0) set up read set
    readset = string_to_readset(reads, weights)
    positions = readset.get_positions()

    # hapchat is checked once and needs no pedigree
    if algorithm == "hapchat":
        hapchat_table = HapChatCore(readset)
        superreads = hapchat_table.get_super_reads()
        cost = hapchat_table.get_optimal_cost()
        partition = hapchat_table.get_optimal_partitioning()
        compare_phasing_brute_force(
            superreads[0][0], cost, partition, readset, True, weights, algorithm
        )
        return

    n_positions = len(positions)

    def het_genotypes():
        # Fresh list of heterozygous genotypes, one per position.
        return [canonic_index_to_biallelic_gt(1) for _ in range(n_positions)]

    def likelihoods_for(trusted):
        # One shared entry repeated per position: None when genotypes are
        # trusted, flat Phred likelihoods otherwise.
        entry = None if trusted else PhredGenotypeLikelihoods([0, 0, 0])
        return [entry] * n_positions

    # 1) Phase using PedMEC code for single individual
    for all_heterozygous in (False, True):
        # recombination costs 1, should not occur
        recombcost = [1] * n_positions
        pedigree = Pedigree(NumericSampleIds())
        genotype_likelihoods = likelihoods_for(all_heterozygous)
        # all genotypes heterozygous
        pedigree.add_individual("individual0", het_genotypes(), genotype_likelihoods)
        dp_table = PedigreeDPTable(
            readset,
            recombcost,
            pedigree,
            distrust_genotypes=not all_heterozygous,
        )
        superreads, transmission_vector = dp_table.get_super_reads()
        cost = dp_table.get_optimal_cost()
        # TODO: transmission vectors not returned properly, see issue 73
        assert len(set(transmission_vector)) == 1
        partition = dp_table.get_optimal_partitioning()
        compare_phasing_brute_force(
            superreads[0], cost, partition, readset, all_heterozygous, weights
        )

    # 2) Phase using PedMEC code for trios with two "empty" individuals (i.e. having no reads)
    for all_heterozygous in (False, True):
        # recombination costs 1, should not occur
        recombcost = [1] * n_positions
        pedigree = Pedigree(NumericSampleIds())
        genotype_likelihoods = likelihoods_for(all_heterozygous)
        # all three individuals are heterozygous everywhere
        for sample_name in ("individual0", "individual1", "individual2"):
            pedigree.add_individual(sample_name, het_genotypes(), genotype_likelihoods)
        pedigree.add_relationship("individual0", "individual1", "individual2")
        dp_table = PedigreeDPTable(
            readset,
            recombcost,
            pedigree,
            distrust_genotypes=not all_heterozygous,
        )
        cost = dp_table.get_optimal_cost()
        superreads, transmission_vector = dp_table.get_super_reads()
        assert len(set(transmission_vector)) == 1
        partition = dp_table.get_optimal_partitioning()
        compare_phasing_brute_force(
            superreads[0], cost, partition, readset, all_heterozygous, weights
        )

Пример #11

Показать файл

def run_whatshap(
    phase_input_files,
    variant_file,
    reference=None,
    output=sys.stdout,
    samples=None,
    chromosomes=None,
    ignore_read_groups=False,
    indels=True,
    mapping_quality=20,
    read_merging=False,
    read_merging_error_rate=0.15,
    read_merging_max_error_rate=0.25,
    read_merging_positive_threshold=1000000,
    read_merging_negative_threshold=1000,
    max_coverage=15,
    full_genotyping=False,
    distrust_genotypes=False,
    include_homozygous=False,
    ped=None,
    recombrate=1.26,
    genmap=None,
    genetic_haplotyping=True,
    recombination_list_filename=None,
    tag="PS",
    read_list_filename=None,
    gl_regularizer=None,
    gtchange_list_filename=None,
    default_gq=30,
    write_command_line_header=True,
    use_ped_samples=False,
    algorithm="whatshap",
):
    """
    Run WhatsHap.

    For every chromosome delivered by the VCF reader, the reads of each family
    (or single individual) are loaded, filtered and selected, a DP table is
    built and solved, connected components (phased blocks) are determined and
    the result is handed to the VCF writer. Chromosomes that were not
    requested are written unchanged.

    phase_input_files -- list of paths to BAM/CRAM/VCF files
    variant_file -- path to input VCF
    reference -- path to reference FASTA
    output -- path to output VCF or a file-like object
    samples -- names of samples to phase. an empty list means: phase all samples
    chromosomes -- names of chromosomes to phase. an empty list means: phase all chromosomes
    ignore_read_groups -- forwarded to the phased-input reader; on a VCF with
        more than one sample it requires that samples is given
    indels -- forwarded as the ``indels`` option to both the phased-input
        reader and the VCF reader (presumably: whether indels are used)
    mapping_quality -- discard reads below this mapping quality
    read_merging -- whether or not to merge reads
    read_merging_error_rate -- probability that a nucleotide is wrong
    read_merging_max_error_rate -- max error rate on edge of merge graph considered
    read_merging_positive_threshold -- threshold on the ratio of the two probabilities
    read_merging_negative_threshold -- threshold on the opposite ratio of positive threshold
    max_coverage -- coverage budget per family; it is divided evenly among the
        family members (at least 1 per sample) before read selection
    full_genotyping -- genotype every sample from its reads before phasing;
        implies distrust_genotypes and include_homozygous
    distrust_genotypes -- forwarded to the DP table; genotypes may then change
        during phasing and genotype likelihoods are read from the VCF
    include_homozygous -- forwarded to find_phaseable_variants
    ped -- pedigree (PED) input; enables pedigree phasing when given
    recombrate -- uniform recombination rate in cM/Mb, used when no genetic map
        is given
    genmap -- genetic map with region-specific recombination rates; only used
        together with ped
    genetic_haplotyping -- in ped mode, merge disconnected blocks based on genotype status
    recombination_list_filename -- filename to write putative recombination events to
    tag -- How to store phasing info in the VCF, can be 'PS' or 'HP'
    read_list_filename -- name of file to write list of used reads to
    algorithm -- algorithm to use, can be 'whatshap' or 'hapchat'
    gl_regularizer -- float to be passed as regularization constant to GenotypeLikelihoods.as_phred
    gtchange_list_filename -- filename to write list of changed genotypes to
    default_gq -- genotype likelihood to be used when GL or PL not available
    write_command_line_header -- whether to add a ##commandline header to the output VCF
    use_ped_samples -- if set together with ped, use only samples from the PED file

    Raises CommandLineError if hapchat is combined with a pedigree, if the
    output VCF writer cannot be set up, if ignore_read_groups is used on a
    multi-sample VCF without samples, or if the genetic map cannot be parsed.

    Returns nothing; results are written through the VCF writer and the
    optional list files.
    """

    if algorithm == "hapchat" and ped is not None:
        raise CommandLineError(
            "The hapchat algorithm cannot do pedigree phasing")

    timers = StageTimer()
    logger.info(
        "This is WhatsHap %s running under Python %s",
        __version__,
        platform.python_version(),
    )
    if full_genotyping:
        # Genotyping from scratch only makes sense if the resulting genotypes
        # may be revised and homozygous sites are kept.
        distrust_genotypes = True
        include_homozygous = True
    numeric_sample_ids = NumericSampleIds()
    if write_command_line_header:
        command_line = "(whatshap {}) {}".format(__version__,
                                                 " ".join(sys.argv[1:]))
    else:
        command_line = None

    if read_merging:
        read_merger = ReadMerger(
            read_merging_error_rate,
            read_merging_max_error_rate,
            read_merging_positive_threshold,
            read_merging_negative_threshold,
        )
    else:
        read_merger = DoNothingReadMerger()

    # The ExitStack closes writer, readers and the optional read list together.
    with ExitStack() as stack:
        try:
            vcf_writer = stack.enter_context(
                PhasedVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=output,
                    tag=tag,
                ))
        except (OSError, VcfError) as e:
            # Chain explicitly so the original error is reported as the cause.
            raise CommandLineError(e) from e

        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                reference,
                numeric_sample_ids,
                ignore_read_groups,
                mapq_threshold=mapping_quality,
                indels=indels,
            ))
        show_phase_vcfs = phased_input_reader.has_vcfs

        # Only read genotype likelihoods from VCFs when distrusting genotypes
        vcf_reader = stack.enter_context(
            VcfReader(variant_file,
                      indels=indels,
                      genotype_likelihoods=distrust_genotypes))

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used.")
        if not samples:
            samples = vcf_reader.samples

        # if --use-ped-samples is set, use only samples from PED file
        if ped and use_ped_samples:
            samples = PedReader(ped).samples()

        raise_if_any_sample_not_in_vcf(vcf_reader, samples)

        if ped and genmap:
            logger.info(
                "Using region-specific recombination rates from genetic map %s.",
                genmap,
            )
            try:
                recombination_cost_computer = GeneticMapRecombinationCostComputer(
                    genmap)
            except ParseError as e:
                raise CommandLineError(e) from e
        else:
            if ped:
                logger.info("Using uniform recombination rate of %g cM/Mb.",
                            recombrate)
            recombination_cost_computer = UniformRecombinationCostComputer(
                recombrate)

        samples = frozenset(samples)
        families, family_trios = setup_families(samples, ped,
                                                numeric_sample_ids,
                                                max_coverage)

        read_list = None
        if read_list_filename:
            read_list = stack.enter_context(ReadList(read_list_filename))
            if algorithm == "hapchat":
                logger.warning(
                    "On which haplotype a read occurs in the inferred solution is not yet "
                    "implemented in hapchat, and so the corresponding column in the "
                    "read list file contains no information about this")

        with timers("parse_phasing_vcfs"):
            # TODO should this be done in PhasedInputReader.__init__?
            phased_input_reader.read_vcfs()

        for variant_table in timers.iterate("parse_vcf", vcf_reader):
            chromosome = variant_table.chromosome
            if (not chromosomes) or (chromosome in chromosomes):
                logger.info("======== Working on chromosome %r", chromosome)
            else:
                logger.info(
                    "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                    chromosome,
                )
                # Writing with empty results passes the chromosome through.
                with timers("write_vcf"):
                    superreads, components = dict(), dict()
                    vcf_writer.write(chromosome, superreads, components)
                continue

            if full_genotyping:
                positions = [v.position for v in variant_table.variants]
                for sample in samples:
                    logger.info("---- Initial genotyping of %s", sample)
                    with timers("read_bam"):
                        bam_sample = None if ignore_read_groups else sample
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            variant_table.variants,
                            bam_sample,
                            read_vcf=False,
                        )
                        readset.sort()  # TODO can be removed
                        genotypes, genotype_likelihoods = compute_genotypes(
                            readset, positions)
                        variant_table.set_genotypes_of(sample, genotypes)
                        variant_table.set_genotype_likelihoods_of(
                            sample,
                            [
                                GenotypeLikelihoods(gl)
                                for gl in genotype_likelihoods
                            ],
                        )

            # These two variables hold the phasing results for all samples
            superreads, components = dict(), dict()

            # Iterate over all families to process, i.e. a separate DP table is created
            # for each family.
            # TODO: Can the body of this loop be factored out into a phase_family function?
            for representative_sample, family in sorted(families.items()):
                if len(family) == 1:
                    logger.info("---- Processing individual %s",
                                representative_sample)
                else:
                    logger.info("---- Processing family with individuals: %s",
                                ",".join(family))
                max_coverage_per_sample = max(1, max_coverage // len(family))
                logger.info("Using maximum coverage per sample of %dX",
                            max_coverage_per_sample)
                trios = family_trios[representative_sample]
                assert len(family) == 1 or len(trios) > 0

                homozygous_positions, phasable_variant_table = find_phaseable_variants(
                    family, include_homozygous, trios, variant_table)

                # Get the reads belonging to each sample
                readsets = dict()  # TODO this could become a list
                for sample in family:
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            phasable_variant_table.variants,
                            sample,
                        )

                    # TODO: Read selection done w.r.t. all variants, where using heterozygous
                    #  variants only would probably give better results.
                    with timers("select"):
                        readset = readset.subset([
                            i for i, read in enumerate(readset)
                            if len(read) >= 2
                        ])
                        logger.info(
                            "Kept %d reads that cover at least two variants each",
                            len(readset),
                        )
                        merged_reads = read_merger.merge(readset)
                        selected_reads = select_reads(
                            merged_reads,
                            max_coverage_per_sample,
                            preferred_source_ids=vcf_source_ids,
                        )

                    readsets[sample] = selected_reads
                    if len(family) == 1 and not distrust_genotypes:
                        # When having a pedigree (len(family) > 1), blocks are also merged after
                        # phasing based on the pedigree information and these statistics are not
                        # so useful. When distrust_genotypes, genotypes can change during phasing
                        # and so can the block structure. So don't print these stats in those cases
                        log_best_case_phasing_info(readset, selected_reads)

                all_reads = merge_readsets(readsets)

                # Determine which variants can (in principle) be phased
                accessible_positions = sorted(all_reads.get_positions())
                logger.info(
                    "Variants covered by at least one phase-informative "
                    "read in at least one individual after read selection: %d",
                    len(accessible_positions),
                )
                if len(family) > 1 and genetic_haplotyping:
                    # In case of genetic haplotyping, also retain all positions homozygous
                    # in at least one individual (because they might be phased based on genotypes)
                    accessible_positions = sorted(
                        set(accessible_positions).union(homozygous_positions))
                    logger.info(
                        "Variants either covered by phase-informative read or homozygous "
                        "in at least one individual: %d",
                        len(accessible_positions),
                    )

                # Keep only accessible positions
                phasable_variant_table.subset_rows_by_position(
                    accessible_positions)
                assert len(phasable_variant_table.variants) == len(
                    accessible_positions)

                pedigree = create_pedigree(
                    default_gq,
                    distrust_genotypes,
                    family,
                    gl_regularizer,
                    numeric_sample_ids,
                    phasable_variant_table,
                    trios,
                )

                recombination_costs = recombination_cost_computer.compute(
                    accessible_positions)

                # Finally, run phasing algorithm
                with timers("phase"):
                    problem_name = "MEC" if len(family) == 1 else "PedMEC"
                    logger.info(
                        "Phasing %d sample%s by solving the %s problem ...",
                        len(family),
                        plural_s(len(family)),
                        problem_name,
                    )

                    if algorithm == "hapchat":
                        dp_table = HapChatCore(all_reads)
                    else:
                        dp_table = PedigreeDPTable(
                            all_reads,
                            recombination_costs,
                            pedigree,
                            distrust_genotypes,
                            accessible_positions,
                        )

                    superreads_list, transmission_vector = dp_table.get_super_reads(
                    )
                    optimal_cost = dp_table.get_optimal_cost()
                    logger.info("%s cost: %d", problem_name, optimal_cost)

                with timers("components"):
                    master_block = None
                    heterozygous_positions_by_sample = None
                    # If we distrusted genotypes, we need to re-determine which sites are
                    # homo-/heterozygous after phasing
                    if distrust_genotypes:
                        hom_in_any_sample = set()
                        heterozygous_positions_by_sample = {}
                        heterozygous_gts = frozenset({(0, 1), (1, 0)})
                        homozygous_gts = frozenset({(0, 0), (1, 1)})
                        # accessible_positions is a sorted list; build a set once so the
                        # membership test below does not scan the list for every variant
                        # of every sample.
                        accessible_position_set = frozenset(
                            accessible_positions)
                        for sample, sample_superreads in zip(
                                family, superreads_list):
                            hets = set()
                            for v1, v2 in zip(*sample_superreads):
                                assert v1.position == v2.position
                                if v1.position not in accessible_position_set:
                                    continue
                                gt = (v1.allele, v2.allele)
                                if gt in heterozygous_gts:
                                    hets.add(v1.position)
                                elif gt in homozygous_gts:
                                    hom_in_any_sample.add(v1.position)
                            heterozygous_positions_by_sample[
                                numeric_sample_ids[sample]] = hets
                        if len(family) > 1 and genetic_haplotyping:
                            master_block = sorted(hom_in_any_sample)
                    else:
                        if len(family) > 1 and genetic_haplotyping:
                            # intersection() accepts any iterable, so the list can be
                            # passed directly.
                            master_block = sorted(
                                set(homozygous_positions).intersection(
                                    accessible_positions))
                    overall_components = find_components(
                        accessible_positions,
                        all_reads,
                        master_block,
                        heterozygous_positions_by_sample,
                    )
                    n_phased_blocks = len(set(overall_components.values()))
                    logger.info("No. of phased blocks: %d", n_phased_blocks)
                    largest_component = find_largest_component(
                        overall_components)
                    if len(largest_component) > 0:
                        # Positions are reported with +1 applied (presumably converting
                        # 0-based positions to 1-based for display).
                        logger.info(
                            "Largest component contains %d variants (%.1f%% of accessible variants) between position %d and %d",
                            len(largest_component),
                            len(largest_component) * 100.0 /
                            len(accessible_positions),
                            largest_component[0] + 1,
                            largest_component[-1] + 1,
                        )

                if recombination_list_filename:
                    n_recombinations = write_recombination_list(
                        recombination_list_filename,
                        chromosome,
                        accessible_positions,
                        overall_components,
                        recombination_costs,
                        transmission_vector,
                        trios,
                    )
                    logger.info(
                        "Total no. of detected recombination events: %d",
                        n_recombinations,
                    )

                # Superreads in superreads_list are in the same order as individuals were added to the pedigree
                for sample, sample_superreads in zip(family, superreads_list):
                    superreads[sample] = sample_superreads
                    assert len(sample_superreads) == 2
                    assert (sample_superreads[0].sample_id ==
                            sample_superreads[1].sample_id ==
                            numeric_sample_ids[sample])
                    # identical for all samples
                    components[sample] = overall_components

                if read_list:
                    read_list.write(
                        all_reads,
                        dp_table.get_optimal_partitioning(),
                        components,
                        numeric_sample_ids,
                    )

            with timers("write_vcf"):
                logger.info("======== Writing VCF")
                changed_genotypes = vcf_writer.write(chromosome, superreads,
                                                     components)
                logger.info("Done writing VCF")
                if changed_genotypes:
                    assert distrust_genotypes
                    logger.info("Changed %d genotypes while writing VCF",
                                len(changed_genotypes))

            if gtchange_list_filename:
                logger.info("Writing list of changed genotypes to %r",
                            gtchange_list_filename)
                write_changed_genotypes(gtchange_list_filename,
                                        changed_genotypes)

            logger.debug("Chromosome %r finished", chromosome)

    log_time_and_memory_usage(timers, show_phase_vcfs=show_phase_vcfs)