Exemplo n.º 1
0
def run_polyphase(
    phase_input_files,
    variant_file,
    ploidy,
    reference=None,
    output=sys.stdout,
    samples=None,
    chromosomes=None,
    verify_genotypes=False,
    ignore_read_groups=False,
    indels=True,
    mapping_quality=20,
    tag="PS",
    include_haploid_sets=False,
    write_command_line_header=True,
    read_list_filename=None,
    ce_bundle_edges=False,
    min_overlap=2,
    plot_clusters=False,
    plot_threading=False,
    ce_refinements=5,
    block_cut_sensitivity=4,
):
    """
    Run Polyploid Phasing.

    phase_input_files -- list of paths to the read input files; must be non-empty,
        and the input reader must not report any VCF inputs (both are asserted)
    variant_file -- path to input VCF
    ploidy -- ploidy handed to the VCF reader, the VCF writer, the genotype
        verification and the phasing parameters
    reference -- path to reference FASTA
    output -- path to output VCF or a file like object
    samples -- names of samples to phase. An empty list means: phase all samples
    chromosomes -- names of chromosomes to phase. An empty list means: phase all chromosomes
    verify_genotypes -- if true, genotypes are re-computed from the reads and
        variants whose given genotype differs from the computed one are dropped
    ignore_read_groups -- forwarded to the input reader; on a multi-sample VCF
        this requires `samples` to be given
    indels -- forwarded to the input reader and the VCF reader
    mapping_quality -- discard reads below this mapping quality
    tag -- How to store phasing info in the VCF, can be 'PS' or 'HP'
    include_haploid_sets -- forwarded to the VCF writer; haploid components are
        only passed to the writer when this is true
    write_command_line_header -- whether to add a ##commandline header to the output VCF
    read_list_filename -- not implemented; any truthy value raises NotImplementedError
    ce_bundle_edges, ce_refinements, plot_clusters, plot_threading -- stored
        unchanged in the PhasingParameter passed to the phasing routine
    min_overlap -- stored in the PhasingParameter; additionally, reads covering
        fewer than max(2, min_overlap) variants are dropped before phasing
    block_cut_sensitivity -- clamped to the range 0..5 (with a warning) before use

    Raises CommandLineError if the output cannot be opened (OSError), if a
    requested sample is not in the VCF, if ignore_read_groups is used on a
    multi-sample VCF without samples, or if a PloidyError occurs while
    processing the VCF.
    """
    timers = StageTimer()
    logger.info(
        "This is WhatsHap (polyploid) %s running under Python %s",
        __version__,
        platform.python_version(),
    )
    numeric_sample_ids = NumericSampleIds()
    with ExitStack() as stack:
        assert phase_input_files
        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                reference,
                numeric_sample_ids,
                ignore_read_groups,
                indels=indels,
                mapq_threshold=mapping_quality,
            )
        )
        assert not phased_input_reader.has_vcfs

        if write_command_line_header:
            command_line = "(whatshap {}) {}".format(
                __version__, " ".join(sys.argv[1:])
            )
        else:
            command_line = None
        try:
            vcf_writer = stack.enter_context(
                PhasedVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=output,
                    tag=tag,
                    ploidy=ploidy,
                    include_haploid_sets=include_haploid_sets,
                )
            )
        except OSError as e:
            # Keep the original OSError attached as the explicit cause.
            raise CommandLineError(e) from e

        vcf_reader = stack.enter_context(
            VcfReader(
                variant_file,
                indels=indels,
                phases=True,
                genotype_likelihoods=False,
                ploidy=ploidy,
            )
        )

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used."
            )
        if not samples:
            samples = vcf_reader.samples

        vcf_sample_set = set(vcf_reader.samples)
        for sample in samples:
            if sample not in vcf_sample_set:
                raise CommandLineError(
                    "Sample {!r} requested on command-line not found in VCF".format(
                        sample
                    )
                )

        # Clamp the sensitivity level into the supported range 0..5.
        if block_cut_sensitivity < 0:
            logger.warning(
                "Block cut sensitivity was set to negative value. Lowest value (0) is assumed instead."
            )
            block_cut_sensitivity = 0
        elif block_cut_sensitivity > 5:
            logger.warning(
                "Block cut sensitivity level too large. Assuming highest valid value (5) instead."
            )
            block_cut_sensitivity = 5

        samples = frozenset(samples)

        read_list_file = None
        if read_list_filename:
            raise NotImplementedError("create_read_list_file not implemented")
            # read_list_file = create_read_list_file(read_list_filename)

        # Store phasing parameters in tuple to keep function signatures cleaner
        phasing_param = PhasingParameter(
            ploidy=ploidy,
            verify_genotypes=verify_genotypes,
            ce_bundle_edges=ce_bundle_edges,
            min_overlap=min_overlap,
            ce_refinements=ce_refinements,
            block_cut_sensitivity=block_cut_sensitivity,
            plot_clusters=plot_clusters,
            plot_threading=plot_threading,
        )

        # The "parse_vcf" timer runs while the reader produces the next table:
        # it is started before the loop / at the end of each iteration and
        # stopped at the top of each iteration and once after the loop.
        timers.start("parse_vcf")
        try:
            for variant_table in vcf_reader:
                chromosome = variant_table.chromosome
                timers.stop("parse_vcf")
                if (not chromosomes) or (chromosome in chromosomes):
                    logger.info("======== Working on chromosome %r", chromosome)
                else:
                    logger.info(
                        "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                        chromosome,
                    )
                    with timers("write_vcf"):
                        superreads, components = {}, {}
                        vcf_writer.write(chromosome, superreads, components)
                    # Restart the timer here as well; otherwise the next
                    # iteration (or the final stop after the loop) would stop
                    # a timer that was never restarted.
                    timers.start("parse_vcf")
                    continue

                # These variables hold the phasing results for all samples
                superreads, components, haploid_components = {}, {}, {}

                # Iterate over all samples to process
                for sample in samples:
                    logger.info("---- Processing individual %s", sample)

                    # Process inputs for this sample
                    missing_genotypes = set()
                    heterozygous = set()

                    genotypes = variant_table.genotypes_of(sample)
                    for index, gt in enumerate(genotypes):
                        if gt.is_none():
                            missing_genotypes.add(index)
                        elif not gt.is_homozygous():
                            heterozygous.add(index)

                    # Everything that is not heterozygous (missing or
                    # homozygous) is removed from the per-sample copy.
                    to_discard = set(range(len(variant_table))).difference(
                        heterozygous
                    )
                    phasable_variant_table = deepcopy(variant_table)
                    # Remove calls to be discarded from variant table
                    phasable_variant_table.remove_rows_by_index(to_discard)

                    logger.info(
                        "Number of variants skipped due to missing genotypes: %d",
                        len(missing_genotypes),
                    )
                    logger.info(
                        "Number of remaining heterozygous variants: %d",
                        len(phasable_variant_table),
                    )

                    # Get the reads belonging to this sample
                    timers.start("read_bam")
                    readset, vcf_source_ids = phased_input_reader.read(
                        chromosome, phasable_variant_table.variants, sample
                    )
                    readset.sort()
                    timers.stop("read_bam")

                    # Verify genotypes
                    if verify_genotypes:
                        timers.start("verify_genotypes")
                        logger.info("Verify genotyping of %s", sample)
                        positions = [
                            v.position for v in phasable_variant_table.variants
                        ]
                        computed_genotypes = [
                            Genotype(gt)
                            for gt in compute_polyploid_genotypes(
                                readset, ploidy, positions
                            )
                        ]
                        # skip all positions at which genotypes do not match
                        given_genotypes = phasable_variant_table.genotypes_of(sample)
                        matching_genotypes = []
                        missing_genotypes = set()
                        # Diagnostics go to the logger, never to stdout:
                        # stdout is the default destination of the output VCF.
                        logger.debug(
                            "Computed genotypes (%d): %s",
                            len(computed_genotypes),
                            computed_genotypes,
                        )
                        logger.debug(
                            "Given genotypes (%d): %s",
                            len(given_genotypes),
                            given_genotypes,
                        )
                        logger.debug("Number of positions: %d", len(positions))
                        for i, g in enumerate(given_genotypes):
                            c_g = computed_genotypes[i]
                            # NOTE(review): c_g is a Genotype instance built
                            # above, so `c_g is None` looks unreachable;
                            # possibly `c_g.is_none()` was intended - confirm.
                            if (g == c_g) or (c_g is None):
                                matching_genotypes.append(g)
                            else:
                                matching_genotypes.append(Genotype([]))
                                missing_genotypes.add(i)
                        phasable_variant_table.set_genotypes_of(
                            sample, matching_genotypes
                        )

                        # Remove variants with deleted genotype
                        phasable_variant_table.remove_rows_by_index(missing_genotypes)
                        logger.info(
                            "Number of variants removed due to inconsistent genotypes: %d",
                            len(missing_genotypes),
                        )
                        logger.info(
                            "Number of remaining heterozygous variants: %d",
                            len(phasable_variant_table),
                        )

                        # Re-read the readset to remove discarded variants
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome, phasable_variant_table.variants, sample
                        )
                        readset.sort()
                        timers.stop("verify_genotypes")

                    # Remove reads with insufficient variants
                    min_read_variants = max(2, min_overlap)
                    readset = readset.subset(
                        [
                            i
                            for i, read in enumerate(readset)
                            if len(read) >= min_read_variants
                        ]
                    )
                    logger.info(
                        "Kept %d reads that cover at least two variants each",
                        len(readset),
                    )

                    # Adapt the variant table to the subset of reads
                    phasable_variant_table.subset_rows_by_position(
                        readset.get_positions()
                    )

                    # Run the actual phasing
                    (
                        sample_components,
                        sample_haploid_components,
                        sample_superreads,
                    ) = phase_single_individual(
                        readset,
                        phasable_variant_table,
                        sample,
                        phasing_param,
                        output,
                        timers,
                    )

                    # Collect results
                    components[sample] = sample_components
                    haploid_components[sample] = sample_haploid_components
                    superreads[sample] = sample_superreads

                with timers("write_vcf"):
                    logger.info("======== Writing VCF")
                    vcf_writer.write(
                        chromosome,
                        superreads,
                        components,
                        haploid_components if include_haploid_sets else None,
                    )
                    # TODO: Use genotype information to polish results
                    # assert len(changed_genotypes) == 0
                    logger.info("Done writing VCF")
                logger.debug("Chromosome %r finished", chromosome)
                timers.start("parse_vcf")
            timers.stop("parse_vcf")
        except PloidyError as e:
            raise CommandLineError(e) from e

    # read_list_file is currently always None (see NotImplementedError above);
    # the close is kept for when read lists get implemented.
    if read_list_file:
        read_list_file.close()

    logger.info("\n== SUMMARY ==")

    log_memory_usage()
    logger.info(
        "Time spent reading BAM/CRAM:                 %6.1f s",
        timers.elapsed("read_bam"),
    )
    logger.info(
        "Time spent parsing VCF:                      %6.1f s",
        timers.elapsed("parse_vcf"),
    )
    if verify_genotypes:
        logger.info(
            "Time spent verifying genotypes:              %6.1f s",
            timers.elapsed("verify_genotypes"),
        )
    logger.info(
        "Time spent detecting blocks:                 %6.1f s",
        timers.elapsed("detecting_blocks"),
    )
    logger.info(
        "Time spent scoring reads:                    %6.1f s",
        timers.elapsed("read_scoring"),
    )
    logger.info(
        "Time spent solving cluster editing:          %6.1f s",
        timers.elapsed("solve_clusterediting"),
    )
    logger.info(
        "Time spent threading haplotypes:             %6.1f s",
        timers.elapsed("threading"),
    )
    if plot_clusters or plot_threading:
        logger.info(
            "Time spent creating plots:                   %6.1f s",
            timers.elapsed("create_plots"),
        )
    logger.info(
        "Time spent writing VCF:                      %6.1f s",
        timers.elapsed("write_vcf"),
    )
    logger.info(
        "Time spent on rest:                          %6.1f s",
        timers.total() - timers.sum(),
    )
    logger.info(
        "Total elapsed time:                          %6.1f s",
        timers.total(),
    )
Exemplo n.º 2
0
def run_haplotag(
    variant_file,
    alignment_file,
    output=None,
    reference=None,
    regions=None,
    ignore_linked_read=False,
    given_samples=None,
    linked_read_distance_cutoff=50000,
    ignore_read_groups=False,
    haplotag_list=None,
    tag_supplementary=False,
):
    """
    Write the alignments of alignment_file to the output, adding phase
    information where it can be determined.

    For every requested region each fetched alignment first has its HP, PC
    and PS tags cleared. If variants were loaded for the chromosome and the
    read is not ignored, phase information is attempted to be added. Every
    alignment is written to the output alignment file; for alignments that
    are neither secondary nor supplementary, a tab-separated line
    (read name, haplotype, phaseset, chromosome) is printed to the haplotag
    list writer. A summary is logged at the end.

    OSError raised while opening the variant file or the alignment file is
    logged and re-raised.
    """
    timers = StageTimer()
    timers.start("haplotag-run")

    with ExitStack() as stack:
        timers.start("haplotag-init")

        # Open the phased VCF
        try:
            vcf_reader = stack.enter_context(
                VcfReader(variant_file, indels=True, phases=True)
            )
        except OSError as err:
            logger.error(
                "Error while loading variant file {}: {}".format(variant_file, err)
            )
            raise err

        use_vcf_samples = compute_variant_file_samples_to_use(
            vcf_reader, given_samples, ignore_read_groups
        )

        # Open the indexed alignment input
        try:
            bam_reader = stack.enter_context(
                pysam.AlignmentFile(alignment_file, "rb", require_index=True)
            )
        except OSError as err:
            logger.error(
                "Error while loading alignment file {}: {}".format(alignment_file, err)
            )
            raise err

        # This checks also sample compatibility with VCF
        shared_samples = compute_shared_samples(
            bam_reader, ignore_read_groups, use_vcf_samples
        )

        # Check if user has specified a subset of regions per chromosome
        user_regions = normalize_user_regions(regions, bam_reader.references)

        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                [alignment_file],
                reference,
                NumericSampleIds(),
                ignore_read_groups,
                indels=False,
            )
        )

        bam_writer = stack.enter_context(
            open_output_alignment_file(
                output,
                reference,
                md5_of(variant_file),
                bam_reader.header.to_dict(),
            )
        )
        haplotag_writer = stack.enter_context(open_haplotag_writer(haplotag_list))

        timers.stop("haplotag-init")
        logger.debug(
            "All input/output files initialized (time: {})".format(
                timers.elapsed("haplotag-init")
            )
        )
        timers.start("haplotag-process")

        n_alignments = 0
        n_tagged = 0
        n_multiple_phase_sets = 0

        for chrom, chrom_regions in user_regions.items():
            logger.debug("Processing chromosome {}".format(chrom))
            variant_table = load_chromosome_variants(vcf_reader, chrom, chrom_regions)

            if variant_table is None:
                # avoid uninitialized variables
                bx_to_haplotype = None
                read_to_haplotype = None
            else:
                logger.debug("Preparing haplotype information")
                haplotag_info = prepare_haplotag_information(
                    variant_table,
                    shared_samples,
                    phased_input_reader,
                    chrom_regions,
                    ignore_linked_read,
                    linked_read_distance_cutoff,
                )
                bx_to_haplotype, read_to_haplotype, n_mult = haplotag_info
                n_multiple_phase_sets += n_mult

            for start, end in chrom_regions:
                logger.debug("Iterating chromosome regions")
                fetched = bam_reader.fetch(contig=chrom, start=start, stop=end)
                for alignment in fetched:
                    n_alignments += 1
                    haplotype_name = "none"
                    phaseset = "none"

                    # Clear the tags before (possibly) assigning new values
                    for stale_tag in ("HP", "PC", "PS"):
                        alignment.set_tag(stale_tag, value=None)

                    # - If no variants in VCF for this chromosome,
                    # alignments just get written to output
                    # - Ignored reads are simply
                    # written to the output BAM
                    if variant_table is not None and not ignore_read(
                        alignment, tag_supplementary
                    ):
                        phase_info = attempt_add_phase_information(
                            alignment,
                            read_to_haplotype,
                            bx_to_haplotype,
                            linked_read_distance_cutoff,
                        )
                        is_tagged, haplotype_name, phaseset = phase_info
                        n_tagged += is_tagged

                    bam_writer.write(alignment)

                    # Only primary alignments are listed in the haplotag list
                    if not alignment.is_secondary and not alignment.is_supplementary:
                        print(
                            alignment.query_name,
                            haplotype_name,
                            phaseset,
                            chrom,
                            sep="\t",
                            file=haplotag_writer,
                        )

                    if n_alignments % 100000 == 0:
                        logger.debug(
                            "Processed {} alignment records.".format(n_alignments)
                        )

        timers.stop("haplotag-process")
        logger.debug(
            "Processing complete (time: {})".format(timers.elapsed("haplotag-process"))
        )

    timers.stop("haplotag-run")

    logger.info("\n== SUMMARY ==")
    logger.info("Total alignments processed:              %12d", n_alignments)
    logger.info("Alignments that could be tagged:         %12d", n_tagged)
    logger.info(
        "Alignments spanning multiple phase sets: %12d", n_multiple_phase_sets
    )
    logger.info(
        "haplotag - total processing time: {}".format(timers.elapsed("haplotag-run"))
    )
Exemplo n.º 3
0
def run_split(
    reads_file,
    list_file,
    output_h1=None,
    output_h2=None,
    output_untagged=None,
    add_untagged=False,
    pigz_deprecated=False,
    only_largest_block=False,
    discard_unknown_reads=False,
    read_lengths_histogram=None,
):
    """
    Distribute the reads of reads_file over per-haplotype outputs according
    to the haplotag list in list_file.

    Each read is looked up by name in the mapping built from the haplotag
    list (haplotype codes: 0 = "none", 1 = "H1", 2 = "H2") and written to
    the writer of its haplotype, provided that haplotype is requested: H1/H2
    when the corresponding output is given, untagged when output_untagged is
    given or add_untagged is set. With add_untagged, untagged reads are also
    written to both haplotype writers. With discard_unknown_reads, reads not
    named in the haplotag list are dropped, and processing stops once as
    many reads as there are known reads have been written. If
    read_lengths_histogram is given, a read length histogram is written.

    pigz_deprecated only triggers a warning. Raises ValueError if
    only_largest_block is requested but the haplotag list lacks chromosome
    and phaseset information.
    """
    if pigz_deprecated:
        logger.warning("Ignoring deprecated --pigz option")
    timers = StageTimer()
    timers.start("split-run")

    with ExitStack() as stack:
        timers.start("split-init")

        # TODO: obviously this won't work for more than two haplotypes
        haplotype_to_int = {"none": 0, "H1": 1, "H2": 2}

        (
            haplo_list,
            has_haplo_chrom_info,
            line_parser,
        ) = check_haplotag_list_information(list_file, stack)

        if only_largest_block:
            logger.debug(
                'User selected "--only-largest-block", this requires chromosome '
                "and phaseset information to be present in the haplotag list file."
            )
            if not has_haplo_chrom_info:
                raise ValueError(
                    "The haplotag list file does not contain phaseset and chromosome "
                    "information, which is required to select only reads from the "
                    "largest phased block. Columns 3 and 4 are missing."
                )

        timers.start("split-process-haplotag-list")

        readname_to_haplotype, known_reads = process_haplotag_list_file(
            haplo_list,
            line_parser,
            haplotype_to_int,
            only_largest_block,
            discard_unknown_reads,
        )

        # Number of known reads still to be written; -1 means "not tracked"
        missing_reads = -1
        if discard_unknown_reads:
            logger.debug(
                "User selected to discard unknown reads, i.e., ignore all reads "
                "that are not part of the haplotag list input file."
            )
            assert (
                len(known_reads) > 0
            ), "No known reads in input set - would discard everything, this is probably wrong"
            missing_reads = len(known_reads)

        timers.stop("split-process-haplotag-list")

        input_reader, input_iterator, output_writers = initialize_io_files(
            reads_file,
            output_h1,
            output_h2,
            output_untagged,
            stack,
        )

        timers.stop("split-init")

        # One read-length counter per haplotype code
        histogram_data = {hap: Counter() for hap in (0, 1, 2)}

        # holds count statistics about total processed reads etc.
        read_counter = Counter()

        # Which haplotype codes the user asked to be written at all
        process_haplotype = {
            0: output_untagged is not None or add_untagged,
            1: output_h1 is not None,
            2: output_h2 is not None,
        }

        timers.start("split-iter-input")

        for read_name, read_length, record in input_iterator(input_reader):
            read_counter["total_reads"] += 1

            if discard_unknown_reads and read_name not in known_reads:
                read_counter["unknown_reads"] += 1
                continue

            hap = readname_to_haplotype[read_name]
            if not process_haplotype[hap]:
                read_counter["skipped_reads"] += 1
                continue

            histogram_data[hap][read_length] += 1
            read_counter[hap] += 1

            output_writers[hap].write(record)
            if hap == 0 and add_untagged:
                # Untagged reads additionally go to both haplotype outputs
                for target in (1, 2):
                    output_writers[target].write(record)

            if discard_unknown_reads:
                missing_reads -= 1
                if missing_reads == 0:
                    logger.info("All known reads processed - cancel processing...")
                    break

        timers.stop("split-iter-input")

        if read_lengths_histogram is not None:
            timers.start("split-length-histogram")
            write_read_length_histogram(histogram_data, read_lengths_histogram)
            timers.stop("split-length-histogram")

    timers.stop("split-run")

    logger.info("\n== SUMMARY ==")

    # Read statistics, in reporting order
    count_reports = (
        ("Total reads processed: {}", "total_reads"),
        ('Number of output reads "untagged": {}', 0),
        ("Number of output reads haplotype 1: {}", 1),
        ("Number of output reads haplotype 2: {}", 2),
        ("Number of unknown (dropped) reads: {}", "unknown_reads"),
        ("Number of skipped reads (per user request): {}", "skipped_reads"),
    )
    for template, key in count_reports:
        logger.info(template.format(read_counter[key]))

    # Stage timings, in reporting order; the histogram stage only exists
    # when a histogram was requested.
    stage_reports = [
        ("Time for processing haplotag list: {} sec", "split-process-haplotag-list"),
        ("Time for total initial setup: {} sec", "split-init"),
        ("Time for iterating input reads: {} sec", "split-iter-input"),
    ]
    if read_lengths_histogram is not None:
        stage_reports.append(
            ("Time for creating histogram output: {} sec", "split-length-histogram")
        )
    stage_reports.append(("Total run time: {} sec", "split-run"))
    for template, stage in stage_reports:
        logger.info(template.format(round(timers.elapsed(stage), 3)))