def import_and_transform_gvcf(path):
    size = vc_all.CombinerConfig.default_exome_interval_size
    intervals = vc_all.calculate_even_genome_partitioning('GRCh38', size)

    [mt] = hl.import_gvcfs([path], intervals, reference_genome='GRCh38')
    mt = vc_all.transform_gvcf(mt)
    mt._force_count()
Example #2
def test_combiner_works():
    _paths = ['gvcfs/HG00096.g.vcf.gz', 'gvcfs/HG00268.g.vcf.gz']
    paths = [resource(p) for p in _paths]
    parts = [
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 17821257, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 18708366, reference_genome='GRCh38')),
                    includes_end=True),
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 18708367, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 19776611, reference_genome='GRCh38')),
                    includes_end=True),
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 19776612, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 21144633, reference_genome='GRCh38')),
                    includes_end=True)
    ]
    vcfs = hl.import_gvcfs(paths, parts, reference_genome='GRCh38', array_elements_required=False)
    entry_to_keep = defined_entry_fields(vcfs[0].filter_rows(hl.is_defined(vcfs[0].info.END)), 100_000) - {'GT', 'PGT', 'PL'}
    vcfs = [transform_gvcf(mt.annotate_rows(info=mt.info.annotate(
        MQ_DP=hl.missing(hl.tint32),
        VarDP=hl.missing(hl.tint32),
        QUALapprox=hl.missing(hl.tint32))),
                           reference_entry_fields_to_keep=entry_to_keep)
            for mt in vcfs]
    comb = combine_variant_datasets(vcfs)
    assert len(parts) == comb.variant_data.n_partitions()
    comb.variant_data._force_count_rows()
    comb.reference_data._force_count_rows()
Example #3
def h(paths, sample_names, tmp_path, json, header, out_path, i, first):
    """inner part of stage one, including transformation from a gvcf into the combiner's format"""
    vcfs = [
        comb.transform_one(vcf)
        for vcf in hl.import_gvcfs(paths,
                                   json,
                                   array_elements_required=False,
                                   _external_header=header,
                                   _external_sample_ids=sample_names)
    ]
    combined = [
        comb.combine_gvcfs(mts) for mts in chunks(vcfs, MAX_COMBINE_NUMBER)
    ]
    if first and len(
            paths
    ) <= MAX_COMBINE_NUMBER:  # only 1 item, just write it, unless we have already written other items
        combined[0].write(out_path, overwrite=True)
        return []
    pad = len(str(len(combined)))
    hl.experimental.write_matrix_tables(combined,
                                        tmp_path + f'{i}/',
                                        overwrite=True)
    return [
        tmp_path + f'{i}/' + str(n).zfill(pad) + '.mt'
        for n in range(len(combined))
    ]
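
Examples #3 and #5 assume a chunks helper and a MAX_COMBINE_NUMBER constant from the surrounding combiner module. A minimal sketch of what they could look like (the constant's value here is illustrative, not the library's actual setting):

# Hypothetical stand-ins for the helpers assumed by the example above.
MAX_COMBINE_NUMBER = 100  # illustrative cap on GVCFs merged per combine_gvcfs call

def chunks(seq, size):
    """Split a sequence into consecutive chunks of at most `size` elements."""
    return [seq[pos:pos + size] for pos in range(0, len(seq), size)]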
Example #4
def test_vcf_vds_combiner_equivalence():
    import hail.experimental.vcf_combiner.vcf_combiner as vcf
    import hail.vds.combiner as vds
    _paths = ['gvcfs/HG00096.g.vcf.gz', 'gvcfs/HG00268.g.vcf.gz']
    paths = [resource(p) for p in _paths]
    parts = [
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 17821257, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 18708366, reference_genome='GRCh38')),
                    includes_end=True),
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 18708367, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 19776611, reference_genome='GRCh38')),
                    includes_end=True),
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 19776612, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 21144633, reference_genome='GRCh38')),
                    includes_end=True)
    ]
    vcfs = [mt.annotate_rows(info=mt.info.annotate(
        MQ_DP=hl.missing(hl.tint32),
        VarDP=hl.missing(hl.tint32),
        QUALapprox=hl.missing(hl.tint32)))
            for mt in hl.import_gvcfs(paths, parts, reference_genome='GRCh38',
                                      array_elements_required=False)]
    entry_to_keep = defined_entry_fields(vcfs[0].filter_rows(hl.is_defined(vcfs[0].info.END)), 100_000) - {'GT', 'PGT', 'PL'}
    vds = vds.combine_variant_datasets([vds.transform_gvcf(mt, reference_entry_fields_to_keep=entry_to_keep) for mt in vcfs])
    smt = vcf.combine_gvcfs([vcf.transform_gvcf(mt) for mt in vcfs])
    smt_from_vds = hl.vds.to_merged_sparse_mt(vds).drop('RGQ')
    smt = smt.select_entries(*smt_from_vds.entry)  # harmonize fields and order
    smt = smt.key_rows_by('locus', 'alleles')
    assert smt._same(smt_from_vds)
Example #5
def h(paths, sample_names, tmp_path, intervals, header, out_path, i,
      first):
    vcfs = [
        transform_gvcf(vcf)
        for vcf in hl.import_gvcfs(paths,
                                   intervals,
                                   array_elements_required=False,
                                   _external_header=header,
                                   _external_sample_ids=sample_names
                                   if header is not None else None)
    ]
    combined = [
        combine_gvcfs(mts) for mts in chunks(vcfs, MAX_COMBINE_NUMBER)
    ]
    if first and len(
            paths
    ) <= MAX_COMBINE_NUMBER:  # only 1 item, just write it, unless we have already written other items
        combined[0].write(out_path, overwrite=True)
        return []
    pad = len(str(len(combined)))
    hl.experimental.write_matrix_tables(combined,
                                        tmp_path + f'{i}/',
                                        overwrite=True)
    return [
        tmp_path + f'{i}/' + str(n).zfill(pad) + '.mt'
        for n in range(len(combined))
    ]
Example #6
def generate_5_sample_vds():
    paths = [
        os.path.join(resource('gvcfs'), '1kg_chr22', path) for path in [
            'HG00187.hg38.g.vcf.gz', 'HG00190.hg38.g.vcf.gz',
            'HG00308.hg38.g.vcf.gz', 'HG00313.hg38.g.vcf.gz',
            'HG00320.hg38.g.vcf.gz'
        ]
    ]
    parts = [
        hl.Interval(start=hl.Struct(
            locus=hl.Locus('chr22', 1, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus(
                        'chr22',
                        hl.get_reference('GRCh38').contig_length('chr22') - 1,
                        reference_genome='GRCh38')),
                    includes_end=True)
    ]
    vcfs = hl.import_gvcfs(paths,
                           parts,
                           reference_genome='GRCh38',
                           array_elements_required=False)
    to_keep = defined_entry_fields(
        vcfs[0].filter_rows(hl.is_defined(vcfs[0].info.END)), 100_000)
    vds = hl.vds.combiner.combine_variant_datasets(
        [hl.vds.combiner.transform_gvcf(mt, to_keep) for mt in vcfs])
    vds.variant_data = vds.variant_data._key_rows_by_assert_sorted(
        'locus', 'alleles')
    vds.write(os.path.join(resource('vds'), '1kg_chr22_5_samples.vds'),
              overwrite=True)
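
A quick follow-up check, as a sketch assuming the write above succeeded and that resource() resolves the same path, reads the dataset back and confirms the five samples:

# Sketch only: read the VDS written above back and verify the sample count.
vds = hl.vds.read_vds(os.path.join(resource('vds'), '1kg_chr22_5_samples.vds'))
assert vds.variant_data.count_cols() == 5
assert vds.reference_data.count_cols() == 5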
Example #7
def test_gvcf_subset_same_as_import_vcf():
    path = os.path.join(resource('gvcfs'), 'subset', 'HG00187.hg38.g.vcf.gz')
    [mt] = hl.import_gvcfs([path],
                           default_exome_intervals('GRCh38'),
                           reference_genome='GRCh38')
    assert mt._same(
        hl.import_vcf(path, force_bgz=True,
                      reference_genome='GRCh38').key_rows_by('locus'))
Example #8
    def _step_gvcfs(self):
        step = self.branch_factor
        files_to_merge = self.gvcfs[:self.gvcf_batch_size * step]
        self.gvcfs = self.gvcfs[self.gvcf_batch_size * step:]

        info(
            f'GVCF combine (job {self._job_id}): merging {len(files_to_merge)} GVCFs into '
            f'{(len(files_to_merge) + step - 1) // step} datasets')

        if self.gvcf_external_header is not None:
            sample_names = self.gvcf_sample_names[:self.gvcf_batch_size * step]
            self.gvcf_sample_names = self.gvcf_sample_names[self.
                                                            gvcf_batch_size *
                                                            step:]
        else:
            sample_names = None
        merge_vds = []
        merge_n_samples = []
        vcfs = [
            transform_gvcf(vcf,
                           reference_entry_fields_to_keep=self.
                           gvcf_reference_entry_fields_to_keep,
                           info_to_keep=self.gvcf_info_to_keep)
            for vcf in hl.import_gvcfs(
                files_to_merge,
                self.gvcf_import_intervals,
                array_elements_required=False,
                _external_header=self.gvcf_external_header,
                _external_sample_ids=[[name] for name in sample_names]
                if sample_names is not None else None,
                reference_genome=self.reference_genome,
                contig_recoding=self.contig_recoding)
        ]
        while vcfs:
            merging, vcfs = vcfs[:step], vcfs[step:]
            merge_vds.append(combine_variant_datasets(merging))
            merge_n_samples.append(len(merging))
        if self.finished and len(merge_vds) == 1:
            merge_vds[0].write(self.output_path)
            return

        temp_path = self._temp_out_path(
            f'gvcf-combine_job{self._job_id}/dataset_')
        pad = len(str(len(merge_vds) - 1))
        merge_metadata = [
            VDSMetadata(path=temp_path + str(count).rjust(pad, '0') + '.vds',
                        n_samples=n_samples)
            for count, n_samples in enumerate(merge_n_samples)
        ]
        paths = [md.path for md in merge_metadata]
        hl.vds.write_variant_datasets(merge_vds,
                                      paths,
                                      overwrite=True,
                                      codec_spec=FAST_CODEC_SPEC)
        for md in merge_metadata:
            self.vdses[max(1, floor(log(md.n_samples,
                                        self.branch_factor)))].append(md)
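
The final loop above buckets each written VDS by the floored logarithm of its sample count in base branch_factor, so that later merge jobs combine datasets of similar size. A small standalone sketch of that bucketing rule (the branch_factor value is illustrative):

# Sketch of the bucketing rule used in _step_gvcfs above.
from math import floor, log

branch_factor = 100
for n_samples in (1, 50, 250, 5_000, 20_000):
    bucket = max(1, floor(log(n_samples, branch_factor)))
    print(f'{n_samples} samples -> bucket {bucket}')
# e.g. 250 samples land in bucket 1, 20_000 samples in bucket 2.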
Example #9
def test_gvcfs(spark, tmp_path):
    # GVCF MatrixTables are not keyed by locus and alleles, just by locus
    input_vcf = 'test-data/tabix-test-vcf/combined.chr20_18210071_18210093.g.vcf.gz'
    partitions = [
        hl.Interval(hl.Locus("chr20", 1, reference_genome='GRCh38'),
                    hl.Locus("chr20", 20000000, reference_genome='GRCh38'),
                    includes_end=True)
    ]
    hail_df = functions.from_matrix_table(
        hl.import_gvcfs([input_vcf],
                        partitions,
                        force_bgz=True,
                        reference_genome='GRCh38')[0])
    _assert_lossless_adapter(spark, tmp_path, hail_df, input_vcf, 'vcf',
                             'bigvcf')
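
As the comment in this test notes, hl.import_gvcfs returns MatrixTables keyed only by locus. If downstream code expects the usual locus/alleles row key, a sketch of re-keying (re-using input_vcf and partitions from the test above):

# Sketch: re-key a GVCF MatrixTable by locus and alleles.
[mt] = hl.import_gvcfs([input_vcf], partitions, force_bgz=True,
                       reference_genome='GRCh38')
mt = mt.key_rows_by('locus', 'alleles')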
Example #10
def run_combiner(sample_paths: List[str],
                 out_file: str,
                 tmp_path: str,
                 intervals: Optional[List[hl.utils.Interval]] = None,
                 header: Optional[str] = None,
                 sample_names: Optional[List[str]] = None,
                 branch_factor: int = CombinerConfig.default_branch_factor,
                 batch_size: int = CombinerConfig.default_batch_size,
                 target_records: int = CombinerConfig.default_target_records,
                 overwrite: bool = False,
                 reference_genome: str = 'default',
                 contig_recoding: Optional[Dict[str, str]] = None,
                 key_by_locus_and_alleles: bool = False):
    """Run the Hail VCF combiner, performing a hierarchical merge to create a combined sparse matrix table.

    Parameters
    ----------
    sample_paths : :obj:`list` of :obj:`str`
        Paths to individual GVCFs.
    out_file : :obj:`str`
        Path to final combined matrix table.
    tmp_path : :obj:`str`
        Path for intermediate output.
    intervals : list of :class:`.Interval` or None
        Partitioning with which to import GVCFs in first phase of combiner.
    header : :obj:`str` or None
        External header file to use as GVCF header for all inputs. If defined, `sample_names` must be defined as well.
    sample_names: list of :obj:`str` or None
        Sample names, to be used with `header`.
    branch_factor : :obj:`int`
        Combiner branch factor.
    batch_size : :obj:`int`
        Combiner batch size.
    target_records : :obj:`int`
        Target records per partition in each combiner phase after the first.
    overwrite : :obj:`bool`
        Overwrite output file, if it exists.
    reference_genome : :obj:`str`
        Reference genome for GVCF import.
    contig_recoding: :obj:`dict` of (:obj:`str`, :obj:`str`), optional
        Mapping from contig name in gVCFs to contig name in the reference
        genome.  All contigs must be present in the
        `reference_genome`, so this is useful for mapping
        differently-formatted data onto known references.
    key_by_locus_and_alleles : :obj:`bool`
        Key by both locus and alleles in the final output.

    Returns
    -------
    None

    """
    tmp_path += f'/combiner-temporary/{uuid.uuid4()}/'
    if header is not None:
        assert sample_names is not None
        assert len(sample_names) == len(sample_paths)

    # FIXME: this should be hl.default_reference().even_intervals_contig_boundary
    intervals = intervals or default_exome_intervals(reference_genome)

    config = CombinerConfig(branch_factor=branch_factor,
                            batch_size=batch_size,
                            target_records=target_records)
    plan = config.plan(len(sample_paths))

    files_to_merge = sample_paths
    n_phases = len(plan.phases)
    total_ops = len(files_to_merge) * n_phases
    total_work_done = 0
    for phase_i, phase in enumerate(plan.phases):
        phase_i += 1  # used for info messages, 1-indexed for readability

        n_jobs = len(phase.jobs)
        merge_str = 'input GVCFs' if phase_i == 1 else 'intermediate sparse matrix tables'
        job_str = hl.utils.misc.plural('job', n_jobs)
        info(
            f"Starting phase {phase_i}/{n_phases}, merging {len(files_to_merge)} {merge_str} in {n_jobs} {job_str}."
        )

        if phase_i > 1:
            intervals = calculate_new_intervals(
                hl.read_matrix_table(files_to_merge[0]).rows(),
                config.target_records,
                reference_genome=reference_genome)

        new_files_to_merge = []

        for job_i, job in enumerate(phase.jobs):
            job_i += 1  # used for info messages, 1-indexed for readability

            n_merges = len(job.merges)
            merge_str = hl.utils.misc.plural('file', n_merges)
            pct_total = 100 * job.input_total_size / total_ops
            info(
                f"Starting phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)} to create {n_merges} merged {merge_str}, corresponding to ~{pct_total:.1f}% of total I/O."
            )
            merge_mts: List[MatrixTable] = []
            for merge in job.merges:
                inputs = [files_to_merge[i] for i in merge.inputs]

                if phase_i == 1:
                    mts = [
                        transform_gvcf(vcf) for vcf in hl.import_gvcfs(
                            inputs,
                            intervals,
                            array_elements_required=False,
                            _external_header=header,
                            _external_sample_ids=[
                                sample_names[i] for i in merge.inputs
                            ] if header is not None else None,
                            reference_genome=reference_genome,
                            contig_recoding=contig_recoding)
                    ]
                else:
                    mts = [
                        hl.read_matrix_table(path, _intervals=intervals)
                        for path in inputs
                    ]

                merge_mts.append(combine_gvcfs(mts))

            if phase_i == n_phases:  # final merge!
                assert n_jobs == 1
                assert len(merge_mts) == 1
                [final_mt] = merge_mts

                if key_by_locus_and_alleles:
                    final_mt = MatrixTable(
                        MatrixKeyRowsBy(final_mt._mir, ['locus', 'alleles'],
                                        is_sorted=True))
                final_mt.write(out_file, overwrite=overwrite)
                new_files_to_merge = [out_file]
                info(
                    f"Finished phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)}, 100% of total I/O finished."
                )
                break

            tmp = f'{tmp_path}_phase{phase_i}_job{job_i}/'
            hl.experimental.write_matrix_tables(merge_mts, tmp, overwrite=True)
            pad = len(str(len(merge_mts)))
            new_files_to_merge.extend(tmp + str(n).zfill(pad) + '.mt'
                                      for n in range(len(merge_mts)))
            total_work_done += job.input_total_size
            info(
                f"Finished {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)}, {100 * total_work_done / total_ops:.1f}% of total I/O finished."
            )

        info(f"Finished phase {phase_i}/{n_phases}.")

        files_to_merge = new_files_to_merge

    assert files_to_merge == [out_file]

    info("Finished!")
Example #11
import hail as hl

gvcfs = ['gs://hail-common/test-resources/HG00096.g.vcf.gz',
         'gs://hail-common/test-resources/HG00268.g.vcf.gz']
hl.init(default_reference='GRCh38')
parts_json = [
    {'start': {'locus': {'contig': 'chr20', 'position': 17821257}},
     'end': {'locus': {'contig': 'chr20', 'position': 18708366}},
     'includeStart': True,
     'includeEnd': True},
    {'start': {'locus': {'contig': 'chr20', 'position': 18708367}},
     'end': {'locus': {'contig': 'chr20', 'position': 19776611}},
     'includeStart': True,
     'includeEnd': True},
    {'start': {'locus': {'contig': 'chr20', 'position': 19776612}},
     'end': {'locus': {'contig': 'chr20', 'position': 21144633}},
     'includeStart': True,
     'includeEnd': True},
]

parts = hl.tarray(hl.tinterval(hl.tstruct(locus=hl.tlocus('GRCh38'))))._convert_from_json(parts_json)
for mt in hl.import_gvcfs(gvcfs, parts):
    mt._force_count_rows()
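
The same three partitions can be built directly as hl.Interval objects, as in Examples #2 and #4, instead of converting from JSON; a sketch:

# Sketch: equivalent partitions constructed without the JSON round-trip.
parts = [
    hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', s, reference_genome='GRCh38')),
                end=hl.Struct(locus=hl.Locus('chr20', e, reference_genome='GRCh38')),
                includes_end=True)
    for s, e in [(17821257, 18708366), (18708367, 19776611), (19776612, 21144633)]
]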
Example #12
def import_gvcf_force_count(path):
    intervals = vc_all.default_exome_intervals('GRCh38')
    [mt] = hl.import_gvcfs([path], intervals, reference_genome='GRCh38')
    mt._force_count_rows()
Example #13
def import_and_transform_gvcf(path):
    intervals = vc_all.default_exome_intervals('GRCh38')
    [mt] = hl.import_gvcfs([path], intervals, reference_genome='GRCh38')
    mt = vc_all.transform_gvcf(mt)
    mt._force_count()
Example #14
def run_combiner(sample_paths: List[str],
                 out_file: str,
                 tmp_path: str,
                 *,
                 intervals: Optional[List[hl.utils.Interval]] = None,
                 import_interval_size: Optional[int] = None,
                 use_genome_default_intervals: bool = False,
                 use_exome_default_intervals: bool = False,
                 header: Optional[str] = None,
                 sample_names: Optional[List[str]] = None,
                 branch_factor: int = CombinerConfig.default_branch_factor,
                 batch_size: int = CombinerConfig.default_batch_size,
                 target_records: int = CombinerConfig.default_target_records,
                 overwrite: bool = False,
                 reference_genome: str = 'default',
                 contig_recoding: Optional[Dict[str, str]] = None,
                 key_by_locus_and_alleles: bool = False):
    """Run the Hail VCF combiner, performing a hierarchical merge to create a combined sparse matrix table.

    **Partitioning**

    The partitioning of input GVCFs, which determines the maximum parallelism per file,
    is determined by the four parameters below. One of these parameters must be passed to
    this function.

    - `intervals` -- User-supplied intervals.
    - `import_interval_size` -- Use intervals of this uniform size across the genome.
    - `use_genome_default_intervals` -- Use intervals of typical uniform size for whole
      genome GVCFs.
    - `use_exome_default_intervals` -- Use intervals of typical uniform size for exome
      GVCFs.

    It is recommended that new users include either `use_genome_default_intervals` or
    `use_exome_default_intervals`.

    Note also that the partitioning of the final, combined matrix table does not
    depend on the GVCF input partitioning.

    Parameters
    ----------
    sample_paths : :obj:`list` of :class:`str`
        Paths to individual GVCFs.
    out_file : :class:`str`
        Path to final combined matrix table.
    tmp_path : :class:`str`
        Path for intermediate output.
    intervals : list of :class:`.Interval` or None
        Import GVCFs with specified partition intervals.
    import_interval_size : :obj:`int` or None
        Import GVCFs with uniform partition intervals of specified size.
    use_genome_default_intervals : :obj:`bool`
        Import GVCFs with uniform partition intervals of default size for
        whole-genome data.
    use_exome_default_intervals : :obj:`bool`
        Import GVCFs with uniform partition intervals of default size for
        exome data.
    header : :class:`str` or None
        External header file to use as GVCF header for all inputs. If defined, `sample_names` must be defined as well.
    sample_names: list of :class:`str` or None
        Sample names, to be used with `header`.
    branch_factor : :obj:`int`
        Combiner branch factor.
    batch_size : :obj:`int`
        Combiner batch size.
    target_records : :obj:`int`
        Target records per partition in each combiner phase after the first.
    overwrite : :obj:`bool`
        Overwrite output file, if it exists.
    reference_genome : :class:`str`
        Reference genome for GVCF import.
    contig_recoding: :obj:`dict` of (:class:`str`, :obj:`str`), optional
        Mapping from contig name in gVCFs to contig name in the reference
        genome.  All contigs must be present in the
        `reference_genome`, so this is useful for mapping
        differently-formatted data onto known references.
    key_by_locus_and_alleles : :obj:`bool`
        Key by both locus and alleles in the final output.

    Returns
    -------
    None

    """
    tmp_path += f'/combiner-temporary/{uuid.uuid4()}/'
    if header is not None:
        assert sample_names is not None
        assert len(sample_names) == len(sample_paths)

    n_partition_args = (int(intervals is not None) +
                        int(import_interval_size is not None) +
                        int(use_genome_default_intervals) +
                        int(use_exome_default_intervals))

    if n_partition_args == 0:
        raise ValueError(
            "'run_combiner': require one argument from 'intervals', 'import_interval_size', "
            "'use_genome_default_intervals', or 'use_exome_default_intervals' to choose GVCF partitioning"
        )
    if n_partition_args > 1:
        warning(
            "'run_combiner': multiple colliding arguments found from 'intervals', 'import_interval_size', "
            "'use_genome_default_intervals', or 'use_exome_default_intervals'."
            "\n  The argument found first in the list in this warning will be used, and others ignored."
        )

    if intervals is not None:
        info(
            f"Using {len(intervals)} user-supplied intervals as partitioning for GVCF import"
        )
    elif import_interval_size is not None:
        intervals = calculate_even_genome_partitioning(reference_genome,
                                                       import_interval_size)
        info(f"Using {len(intervals)} intervals with user-supplied size"
             f" {import_interval_size} as partitioning for GVCF import")
    elif use_genome_default_intervals:
        size = CombinerConfig.default_genome_interval_size
        intervals = calculate_even_genome_partitioning(reference_genome, size)
        info(f"Using {len(intervals)} intervals with default whole-genome size"
             f" {size} as partitioning for GVCF import")
    elif use_exome_default_intervals:
        size = CombinerConfig.default_exome_interval_size
        intervals = calculate_even_genome_partitioning(reference_genome, size)
        info(f"Using {len(intervals)} intervals with default exome size"
             f" {size} as partitioning for GVCF import")

    assert intervals is not None

    config = CombinerConfig(branch_factor=branch_factor,
                            batch_size=batch_size,
                            target_records=target_records)
    plan = config.plan(len(sample_paths))

    files_to_merge = sample_paths
    n_phases = len(plan.phases)
    total_ops = len(files_to_merge) * n_phases
    total_work_done = 0
    for phase_i, phase in enumerate(plan.phases):
        phase_i += 1  # used for info messages, 1-indexed for readability

        n_jobs = len(phase.jobs)
        merge_str = 'input GVCFs' if phase_i == 1 else 'intermediate sparse matrix tables'
        job_str = hl.utils.misc.plural('job', n_jobs)
        info(
            f"Starting phase {phase_i}/{n_phases}, merging {len(files_to_merge)} {merge_str} in {n_jobs} {job_str}."
        )

        if phase_i > 1:
            intervals = calculate_new_intervals(
                hl.read_matrix_table(files_to_merge[0]).rows(),
                config.target_records,
                reference_genome=reference_genome)

        new_files_to_merge = []

        for job_i, job in enumerate(phase.jobs):
            job_i += 1  # used for info messages, 1-indexed for readability

            n_merges = len(job.merges)
            merge_str = hl.utils.misc.plural('file', n_merges)
            pct_total = 100 * job.input_total_size / total_ops
            info(
                f"Starting phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)} to create {n_merges} merged {merge_str}, corresponding to ~{pct_total:.1f}% of total I/O."
            )
            merge_mts: List[MatrixTable] = []
            for merge in job.merges:
                inputs = [files_to_merge[i] for i in merge.inputs]

                if phase_i == 1:
                    mts = [
                        transform_gvcf(vcf) for vcf in hl.import_gvcfs(
                            inputs,
                            intervals,
                            array_elements_required=False,
                            _external_header=header,
                            _external_sample_ids=[[sample_names[i]]
                                                  for i in merge.inputs]
                            if header is not None else None,
                            reference_genome=reference_genome,
                            contig_recoding=contig_recoding)
                    ]
                else:
                    mts = [
                        hl.read_matrix_table(path, _intervals=intervals)
                        for path in inputs
                    ]

                merge_mts.append(combine_gvcfs(mts))

            if phase_i == n_phases:  # final merge!
                assert n_jobs == 1
                assert len(merge_mts) == 1
                [final_mt] = merge_mts

                if key_by_locus_and_alleles:
                    final_mt = MatrixTable(
                        MatrixKeyRowsBy(final_mt._mir, ['locus', 'alleles'],
                                        is_sorted=True))
                final_mt.write(out_file, overwrite=overwrite)
                new_files_to_merge = [out_file]
                info(
                    f"Finished phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)}, 100% of total I/O finished."
                )
                break

            tmp = f'{tmp_path}_phase{phase_i}_job{job_i}/'
            hl.experimental.write_matrix_tables(merge_mts, tmp, overwrite=True)
            pad = len(str(len(merge_mts)))
            new_files_to_merge.extend(tmp + str(n).zfill(pad) + '.mt'
                                      for n in range(len(merge_mts)))
            total_work_done += job.input_total_size
            info(
                f"Finished {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)}, {100 * total_work_done / total_ops:.1f}% of total I/O finished."
            )

        info(f"Finished phase {phase_i}/{n_phases}.")

        files_to_merge = new_files_to_merge

    assert files_to_merge == [out_file]

    info("Finished!")