예제 #1
0
def test_readset():
    rs = ReadSet()
    r = Read('Read A', 56)
    r.add_variant(100, 1, 37)
    r.add_variant(101, 0, 18)
    rs.add(r)

    r = Read('Read B', 0)
    r.add_variant(101, 0, 23)
    rs.add(r)

    r = Read('Read C', 17)
    r.add_variant(99, 1, 27)
    r.add_variant(80, 1, 17)
    r[1] = Variant(position=105, allele=0, quality=14)
    rs.add(r)

    assert rs[0].name == 'Read A'
    assert rs[1].name == 'Read B'
    assert rs[2].name == 'Read C'

    rs.sort()

    # should be sorted after finalization
    assert rs[0].name == 'Read C'
    assert rs[1].name == 'Read A'
    assert rs[2].name == 'Read B'

    assert len(rs) == 3

    assert rs.get_positions() == [99, 100, 101, 105]

    r = rs[(0, 'Read A')]
    assert r.name == 'Read A'
    assert r.mapqs == (56, ), str(r.mapqs)

    r = rs[(0, 'Read B')]
    assert r.name == 'Read B'
    assert r.mapqs == (0, )

    r = rs[(0, 'Read C')]
    assert r.name == 'Read C'
    assert r.mapqs == (17, )
    assert len(r) == 2
    assert r[0] == Variant(position=99, allele=1, quality=27)
    assert r[1] == Variant(position=105, allele=0, quality=14)
예제 #2
0
def run_genotype(
    phase_input_files,
    variant_file,
    reference=None,
    output=sys.stdout,
    samples=None,
    chromosomes=None,
    ignore_read_groups=False,
    indels=True,
    mapping_quality=20,
    max_coverage=15,
    nopriors=False,
    ped=None,
    recombrate=1.26,
    genmap=None,
    gt_qual_threshold=0,
    prioroutput=None,
    constant=0.0,
    overhang=10,
    affine_gap=False,
    gap_start=10,
    gap_extend=7,
    mismatch=15,
    write_command_line_header=True,
    use_ped_samples=False,
):
    """
    For now: this function only runs the genotyping algorithm. Genotype likelihoods for
    all variants are computed using the forward backward algorithm
    """
    timers = StageTimer()
    logger.info(
        "This is WhatsHap (genotyping) %s running under Python %s",
        __version__,
        platform.python_version(),
    )
    if write_command_line_header:
        command_line = "(whatshap {}) {}".format(__version__,
                                                 " ".join(sys.argv[1:]))
    else:
        command_line = None
    with ExitStack() as stack:
        # read the given input files (BAMs, VCFs, ref...)
        numeric_sample_ids = NumericSampleIds()
        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                reference,
                numeric_sample_ids,
                ignore_read_groups,
                indels=indels,
                mapq_threshold=mapping_quality,
                overhang=overhang,
                affine=affine_gap,
                gap_start=gap_start,
                gap_extend=gap_extend,
                default_mismatch=mismatch,
            ))
        show_phase_vcfs = phased_input_reader.has_vcfs

        # vcf writer for final genotype likelihoods
        vcf_writer = stack.enter_context(
            GenotypeVcfWriter(command_line=command_line,
                              in_path=variant_file,
                              out_file=output))
        # vcf writer for only the prior likelihoods (if output is desired)
        prior_vcf_writer = None
        if prioroutput is not None:
            prior_vcf_writer = stack.enter_context(
                GenotypeVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=stack.enter_context(open(prioroutput, "w")),
                ))

        # parse vcf with input variants
        # remove all likelihoods that may already be present
        vcf_reader = stack.enter_context(
            VcfReader(
                variant_file,
                indels=indels,
                genotype_likelihoods=False,
                ignore_genotypes=True,
            ))

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used.")
        if not samples:
            samples = vcf_reader.samples

        # if --use-ped-samples is set, use only samples from PED file
        if ped and use_ped_samples:
            samples = set()
            for trio in PedReader(ped):
                if trio.child is None or trio.mother is None or trio.father is None:
                    continue
                samples.add(trio.mother)
                samples.add(trio.father)
                samples.add(trio.child)

        vcf_sample_set = set(vcf_reader.samples)
        for sample in samples:
            if sample not in vcf_sample_set:
                raise CommandLineError(
                    "Sample {!r} requested on command-line not found in VCF".
                    format(sample))

        if ped and genmap:
            logger.info(
                "Using region-specific recombination rates from genetic map %s.",
                genmap,
            )
            recombination_cost_computer = GeneticMapRecombinationCostComputer(
                genmap)
        else:
            if ped:
                logger.info("Using uniform recombination rate of %g cM/Mb.",
                            recombrate)
            recombination_cost_computer = UniformRecombinationCostComputer(
                recombrate)

        samples = frozenset(samples)
        families, family_trios = setup_families(samples, ped,
                                                numeric_sample_ids,
                                                max_coverage)

        # Read phase information provided as VCF files, if provided.
        with timers("parse_phasing_vcfs"):
            phased_input_reader.read_vcfs()

        # compute genotype likelihood threshold
        gt_prob = 1.0 - (10**(-gt_qual_threshold / 10.0))

        for variant_table in timers.iterate("parse_vcf", vcf_reader):

            # create a mapping of genome positions to indices
            var_to_pos = dict()
            for i in range(len(variant_table.variants)):
                var_to_pos[variant_table.variants[i].position] = i

            chromosome = variant_table.chromosome
            if (not chromosomes) or (chromosome in chromosomes):
                logger.info("======== Working on chromosome %r", chromosome)
            else:
                logger.info(
                    "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                    chromosome,
                )
                vcf_writer.write_genotypes(chromosome,
                                           variant_table,
                                           indels,
                                           leave_unchanged=True)
                if prioroutput is not None:
                    prior_vcf_writer.write_genotypes(chromosome,
                                                     variant_table,
                                                     indels,
                                                     leave_unchanged=True)
                continue

            positions = [v.position for v in variant_table.variants]
            if not nopriors:
                # compute prior genotype likelihoods based on all reads
                for sample in samples:
                    logger.info("---- Initial genotyping of %s", sample)
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            variant_table.variants,
                            sample,
                            read_vcf=False,
                        )
                        readset.sort()
                        genotypes, genotype_likelihoods = compute_genotypes(
                            readset, positions)
                        # recompute genotypes based on given threshold
                        reg_genotype_likelihoods = []
                        for gl in range(len(genotype_likelihoods)):
                            norm_sum = (genotype_likelihoods[gl][0] +
                                        genotype_likelihoods[gl][1] +
                                        genotype_likelihoods[gl][2] +
                                        3 * constant)
                            regularized = PhredGenotypeLikelihoods([
                                (genotype_likelihoods[gl][0] + constant) /
                                norm_sum,
                                (genotype_likelihoods[gl][1] + constant) /
                                norm_sum,
                                (genotype_likelihoods[gl][2] + constant) /
                                norm_sum,
                            ])
                            genotypes[gl] = determine_genotype(
                                regularized, gt_prob)
                            assert isinstance(genotypes[gl], Genotype)
                            reg_genotype_likelihoods.append(regularized)
                        variant_table.set_genotype_likelihoods_of(
                            sample,
                            [
                                PhredGenotypeLikelihoods(list(gl))
                                for gl in reg_genotype_likelihoods
                            ],
                        )
                        variant_table.set_genotypes_of(sample, genotypes)
            else:

                # use uniform genotype likelihoods for all individuals
                for sample in samples:
                    variant_table.set_genotype_likelihoods_of(
                        sample,
                        [PhredGenotypeLikelihoods([1 / 3, 1 / 3, 1 / 3])] *
                        len(positions),
                    )

            # if desired, output the priors in separate vcf
            if prioroutput is not None:
                prior_vcf_writer.write_genotypes(chromosome, variant_table,
                                                 indels)

            # Iterate over all families to process, i.e. a separate DP table is created
            # for each family.
            for representative_sample, family in sorted(families.items()):
                if len(family) == 1:
                    logger.info("---- Processing individual %s",
                                representative_sample)
                else:
                    logger.info("---- Processing family with individuals: %s",
                                ",".join(family))
                max_coverage_per_sample = max(1, max_coverage // len(family))
                logger.info("Using maximum coverage per sample of %dX",
                            max_coverage_per_sample)
                trios = family_trios[representative_sample]
                assert (len(family) == 1) or (len(trios) > 0)

                # Get the reads belonging to each sample
                readsets = dict()
                for sample in family:
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            variant_table.variants,
                            sample,
                        )

                    with timers("select"):
                        readset = readset.subset([
                            i for i, read in enumerate(readset)
                            if len(read) >= 2
                        ])
                        logger.info(
                            "Kept %d reads that cover at least two variants each",
                            len(readset),
                        )
                        selected_reads = select_reads(
                            readset,
                            max_coverage_per_sample,
                            preferred_source_ids=vcf_source_ids,
                        )
                    readsets[sample] = selected_reads

                # Merge reads into one ReadSet (note that each Read object
                # knows the sample it originated from).
                all_reads = ReadSet()
                for sample, readset in readsets.items():
                    for read in readset:
                        assert read.is_sorted(), "Add a read.sort() here"
                        all_reads.add(read)

                all_reads.sort()

                # Determine which variants can (in principle) be phased
                accessible_positions = sorted(all_reads.get_positions())
                logger.info(
                    "Variants covered by at least one phase-informative "
                    "read in at least one individual after read selection: %d",
                    len(accessible_positions),
                )

                # Create Pedigree
                pedigree = Pedigree(numeric_sample_ids)
                for sample in family:
                    # genotypes are assumed to be unknown, so ignore information that
                    # might already be present in the input vcf
                    all_genotype_likelihoods = variant_table.genotype_likelihoods_of(
                        sample)
                    genotype_l = [
                        all_genotype_likelihoods[var_to_pos[a_p]]
                        for a_p in accessible_positions
                    ]
                    pedigree.add_individual(
                        sample,
                        [
                            Genotype([])
                            for i in range(len(accessible_positions))
                        ],
                        genotype_l,
                    )
                for trio in trios:
                    pedigree.add_relationship(
                        father_id=trio.father,
                        mother_id=trio.mother,
                        child_id=trio.child,
                    )

                recombination_costs = recombination_cost_computer.compute(
                    accessible_positions)

                # Finally, run genotyping algorithm
                with timers("genotyping"):
                    problem_name = "genotyping"
                    logger.info(
                        "Genotype %d sample%s by solving the %s problem ...",
                        len(family),
                        "s" if len(family) > 1 else "",
                        problem_name,
                    )
                    forward_backward_table = GenotypeDPTable(
                        numeric_sample_ids,
                        all_reads,
                        recombination_costs,
                        pedigree,
                        accessible_positions,
                    )
                    # store results
                    for s in family:
                        likelihood_list = variant_table.genotype_likelihoods_of(
                            s)
                        genotypes_list = variant_table.genotypes_of(s)

                        for pos in range(len(accessible_positions)):
                            likelihoods = forward_backward_table.get_genotype_likelihoods(
                                s, pos)

                            # compute genotypes from likelihoods and store information
                            geno = determine_genotype(likelihoods, gt_prob)
                            assert isinstance(geno, Genotype)
                            genotypes_list[var_to_pos[
                                accessible_positions[pos]]] = geno
                            likelihood_list[var_to_pos[
                                accessible_positions[pos]]] = likelihoods

                        variant_table.set_genotypes_of(s, genotypes_list)
                        variant_table.set_genotype_likelihoods_of(
                            s, likelihood_list)

            with timers("write_vcf"):
                logger.info("======== Writing VCF")
                vcf_writer.write_genotypes(chromosome, variant_table, indels)
                logger.info("Done writing VCF")

            logger.debug("Chromosome %r finished", chromosome)

    logger.info("\n== SUMMARY ==")
    total_time = timers.total()
    log_memory_usage()
    logger.info(
        "Time spent reading BAM:                      %6.1f s",
        timers.elapsed("read_bam"),
    )
    logger.info(
        "Time spent parsing VCF:                      %6.1f s",
        timers.elapsed("parse_vcf"),
    )
    if show_phase_vcfs:
        logger.info(
            "Time spent parsing input phasings from VCFs: %6.1f s",
            timers.elapsed("parse_phasing_vcfs"),
        )
    logger.info("Time spent selecting reads:                  %6.1f s",
                timers.elapsed("select"))
    logger.info(
        "Time spent genotyping:                          %6.1f s",
        timers.elapsed("genotyping"),
    )
    logger.info(
        "Time spent writing VCF:                      %6.1f s",
        timers.elapsed("write_vcf"),
    )
    logger.info(
        "Time spent on rest:                          %6.1f s",
        total_time - timers.sum(),
    )
    logger.info("Total elapsed time:                          %6.1f s",
                total_time)
예제 #3
0
    def read(self, chromosome, variants, sample, *, read_vcf=True, regions=None):
        """
        Return a pair (readset, vcf_source_ids) where readset is a sorted ReadSet.

        Set read_vcf to False to not read phased blocks from the VCFs
        """
        readset_reader = self._readset_reader
        for_sample = "for sample {!r} ".format(sample) if not self._ignore_read_groups else ""
        logger.info("Reading alignments %sand detecting alleles ...", for_sample)
        try:
            reference = self._fasta[chromosome] if self._fasta else None
        except KeyError:
            raise CommandLineError(
                "Chromosome {!r} present in VCF file, but not in the reference FASTA {!r}".format(
                    chromosome, self._fasta.filename
                )
            )

        bam_sample = None if self._ignore_read_groups else sample
        try:
            readset = readset_reader.read(chromosome, variants, bam_sample, reference, regions)
        except SampleNotFoundError:
            logger.warning("Sample %r not found in any BAM/CRAM file.", bam_sample)
            readset = ReadSet()
        except ReadSetError as e:
            raise CommandLineError(e)
        except ReferenceNotFoundError:
            if chromosome.startswith("chr"):
                alternative = chromosome[3:]
            else:
                alternative = "chr" + chromosome
            message = "The chromosome {!r} was not found in the BAM/CRAM file.".format(chromosome)
            if readset_reader.has_reference(alternative):
                message += " Found {!r} instead".format(alternative)
            raise CommandLineError(message)

        vcf_source_ids = set()
        if read_vcf:
            # TODO this is a bit clumsy
            if self._vcfs is None:
                raise ValueError("call PhasedInputReader.read_vcfs() first")
            # Add phasing information from VCF files, if present
            sample_id = self._numeric_sample_ids[sample]
            for i, vcf in enumerate(self._vcfs):
                if chromosome in vcf:
                    variant_table = vcf[chromosome]
                    source_id = readset_reader.n_paths + i
                    vcf_source_ids.add(source_id)
                    for read in variant_table.phased_blocks_as_reads(
                        sample, variants, source_id, sample_id
                    ):
                        readset.add(read)

        # TODO is this necessary?
        for read in readset:
            read.sort()
        readset.sort()

        logger.info(
            "Found %d reads covering %d variants", len(readset), len(readset.get_positions()),
        )
        return readset, vcf_source_ids