def import_and_transform_gvcf(path):
    size = vc_all.CombinerConfig.default_exome_interval_size
    intervals = vc_all.calculate_even_genome_partitioning('GRCh38', size)
    [mt] = hl.import_gvcfs([path], intervals, reference_genome='GRCh38')
    mt = vc_all.transform_gvcf(mt)
    mt._force_count()
def test_combiner_works():
    _paths = ['gvcfs/HG00096.g.vcf.gz', 'gvcfs/HG00268.g.vcf.gz']
    paths = [resource(p) for p in _paths]
    parts = [
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 17821257, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 18708366, reference_genome='GRCh38')),
                    includes_end=True),
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 18708367, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 19776611, reference_genome='GRCh38')),
                    includes_end=True),
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 19776612, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 21144633, reference_genome='GRCh38')),
                    includes_end=True)
    ]
    vcfs = hl.import_gvcfs(paths, parts, reference_genome='GRCh38',
                           array_elements_required=False)
    entry_to_keep = defined_entry_fields(
        vcfs[0].filter_rows(hl.is_defined(vcfs[0].info.END)), 100_000) - {'GT', 'PGT', 'PL'}
    vcfs = [transform_gvcf(mt.annotate_rows(info=mt.info.annotate(
                MQ_DP=hl.missing(hl.tint32),
                VarDP=hl.missing(hl.tint32),
                QUALapprox=hl.missing(hl.tint32))),
                reference_entry_fields_to_keep=entry_to_keep)
            for mt in vcfs]
    comb = combine_variant_datasets(vcfs)
    assert len(parts) == comb.variant_data.n_partitions()
    comb.variant_data._force_count_rows()
    comb.reference_data._force_count_rows()
def h(paths, sample_names, tmp_path, json, header, out_path, i, first):
    """inner part of stage one, including transformation from a gvcf into the combiner's format"""
    vcfs = [comb.transform_one(vcf)
            for vcf in hl.import_gvcfs(paths, json, array_elements_required=False,
                                       _external_header=header,
                                       _external_sample_ids=sample_names)]
    combined = [comb.combine_gvcfs(mts) for mts in chunks(vcfs, MAX_COMBINE_NUMBER)]
    if first and len(paths) <= MAX_COMBINE_NUMBER:
        # only 1 item, just write it, unless we have already written other items
        combined[0].write(out_path, overwrite=True)
        return []
    pad = len(str(len(combined)))
    hl.experimental.write_matrix_tables(combined, tmp_path + f'{i}/', overwrite=True)
    return [tmp_path + f'{i}/' + str(n).zfill(pad) + '.mt' for n in range(len(combined))]
def test_vcf_vds_combiner_equivalence():
    import hail.experimental.vcf_combiner.vcf_combiner as vcf
    import hail.vds.combiner as vds
    _paths = ['gvcfs/HG00096.g.vcf.gz', 'gvcfs/HG00268.g.vcf.gz']
    paths = [resource(p) for p in _paths]
    parts = [
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 17821257, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 18708366, reference_genome='GRCh38')),
                    includes_end=True),
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 18708367, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 19776611, reference_genome='GRCh38')),
                    includes_end=True),
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 19776612, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 21144633, reference_genome='GRCh38')),
                    includes_end=True)
    ]
    vcfs = [mt.annotate_rows(info=mt.info.annotate(
                MQ_DP=hl.missing(hl.tint32),
                VarDP=hl.missing(hl.tint32),
                QUALapprox=hl.missing(hl.tint32)))
            for mt in hl.import_gvcfs(paths, parts, reference_genome='GRCh38',
                                      array_elements_required=False)]
    entry_to_keep = defined_entry_fields(
        vcfs[0].filter_rows(hl.is_defined(vcfs[0].info.END)), 100_000) - {'GT', 'PGT', 'PL'}
    vds = vds.combine_variant_datasets(
        [vds.transform_gvcf(mt, reference_entry_fields_to_keep=entry_to_keep) for mt in vcfs])
    smt = vcf.combine_gvcfs([vcf.transform_gvcf(mt) for mt in vcfs])
    smt_from_vds = hl.vds.to_merged_sparse_mt(vds).drop('RGQ')
    smt = smt.select_entries(*smt_from_vds.entry)  # harmonize fields and order
    smt = smt.key_rows_by('locus', 'alleles')
    assert smt._same(smt_from_vds)
def h(paths, sample_names, tmp_path, intervals, header, out_path, i, first):
    vcfs = [transform_gvcf(vcf)
            for vcf in hl.import_gvcfs(paths, intervals, array_elements_required=False,
                                       _external_header=header,
                                       _external_sample_ids=sample_names if header is not None else None)]
    combined = [combine_gvcfs(mts) for mts in chunks(vcfs, MAX_COMBINE_NUMBER)]
    if first and len(paths) <= MAX_COMBINE_NUMBER:
        # only 1 item, just write it, unless we have already written other items
        combined[0].write(out_path, overwrite=True)
        return []
    pad = len(str(len(combined)))
    hl.experimental.write_matrix_tables(combined, tmp_path + f'{i}/', overwrite=True)
    return [tmp_path + f'{i}/' + str(n).zfill(pad) + '.mt' for n in range(len(combined))]
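# Both definitions of `h` above depend on a `chunks` helper and a
# MAX_COMBINE_NUMBER constant defined elsewhere in the combiner module. A
# minimal sketch of what `chunks` presumably does -- yield consecutive slices
# of at most `size` elements -- for readers following along (a hypothetical
# reconstruction, not the module's actual code):
def chunks(seq, size):
    """Yield successive slices of `seq` containing at most `size` elements."""
    for start in range(0, len(seq), size):
        yield seq[start:start + size]

# e.g. list(chunks([1, 2, 3, 4, 5], 2)) == [[1, 2], [3, 4], [5]]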
def generate_5_sample_vds():
    paths = [os.path.join(resource('gvcfs'), '1kg_chr22', path)
             for path in ['HG00187.hg38.g.vcf.gz',
                          'HG00190.hg38.g.vcf.gz',
                          'HG00308.hg38.g.vcf.gz',
                          'HG00313.hg38.g.vcf.gz',
                          'HG00320.hg38.g.vcf.gz']]
    parts = [
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr22', 1, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus(
                        'chr22',
                        hl.get_reference('GRCh38').contig_length('chr22') - 1,
                        reference_genome='GRCh38')),
                    includes_end=True)
    ]
    vcfs = hl.import_gvcfs(paths, parts, reference_genome='GRCh38',
                           array_elements_required=False)
    to_keep = defined_entry_fields(
        vcfs[0].filter_rows(hl.is_defined(vcfs[0].info.END)), 100_000)
    vds = hl.vds.combiner.combine_variant_datasets(
        [hl.vds.combiner.transform_gvcf(mt, to_keep) for mt in vcfs])
    vds.variant_data = vds.variant_data._key_rows_by_assert_sorted('locus', 'alleles')
    vds.write(os.path.join(resource('vds'), '1kg_chr22_5_samples.vds'), overwrite=True)
def test_gvcf_subset_same_as_import_vcf():
    path = os.path.join(resource('gvcfs'), 'subset', 'HG00187.hg38.g.vcf.gz')
    [mt] = hl.import_gvcfs([path], default_exome_intervals('GRCh38'),
                           reference_genome='GRCh38')
    assert mt._same(
        hl.import_vcf(path, force_bgz=True,
                      reference_genome='GRCh38').key_rows_by('locus'))
def _step_gvcfs(self):
    step = self.branch_factor
    files_to_merge = self.gvcfs[:self.gvcf_batch_size * step]
    self.gvcfs = self.gvcfs[self.gvcf_batch_size * step:]

    info(f'GVCF combine (job {self._job_id}): merging {len(files_to_merge)} GVCFs into '
         f'{(len(files_to_merge) + step - 1) // step} datasets')

    if self.gvcf_external_header is not None:
        sample_names = self.gvcf_sample_names[:self.gvcf_batch_size * step]
        self.gvcf_sample_names = self.gvcf_sample_names[self.gvcf_batch_size * step:]
    else:
        sample_names = None
    merge_vds = []
    merge_n_samples = []
    vcfs = [transform_gvcf(vcf,
                           reference_entry_fields_to_keep=self.gvcf_reference_entry_fields_to_keep,
                           info_to_keep=self.gvcf_info_to_keep)
            for vcf in hl.import_gvcfs(files_to_merge,
                                       self.gvcf_import_intervals,
                                       array_elements_required=False,
                                       _external_header=self.gvcf_external_header,
                                       _external_sample_ids=[[name] for name in sample_names]
                                       if sample_names is not None else None,
                                       reference_genome=self.reference_genome,
                                       contig_recoding=self.contig_recoding)]
    while vcfs:
        merging, vcfs = vcfs[:step], vcfs[step:]
        merge_vds.append(combine_variant_datasets(merging))
        merge_n_samples.append(len(merging))
    if self.finished and len(merge_vds) == 1:
        merge_vds[0].write(self.output_path)
        return

    temp_path = self._temp_out_path(f'gvcf-combine_job{self._job_id}/dataset_')
    pad = len(str(len(merge_vds) - 1))
    merge_metadata = [VDSMetadata(path=temp_path + str(count).rjust(pad, '0') + '.vds',
                                  n_samples=n_samples)
                      for count, n_samples in enumerate(merge_n_samples)]
    paths = [md.path for md in merge_metadata]
    hl.vds.write_variant_datasets(merge_vds, paths, overwrite=True,
                                  codec_spec=FAST_CODEC_SPEC)
    for md in merge_metadata:
        self.vdses[max(1, floor(log(md.n_samples, self.branch_factor)))].append(md)
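# The final loop above buckets each intermediate VDS by a logarithmic size
# class: a dataset of n_samples lands in bucket
# max(1, floor(log(n_samples, branch_factor))), so later merge steps combine
# datasets of similar size. A small self-contained illustration of the rule
# (the branch factor of 100 here is an assumed example value, not necessarily
# the combiner's configured default):
from math import floor, log

branch_factor = 100  # assumed example value
for n_samples in (1, 99, 100, 9_999, 10_000, 1_000_000):
    bucket = max(1, floor(log(n_samples, branch_factor)))
    print(f'{n_samples:>9} samples -> bucket {bucket}')
# With branch_factor=100, everything below branch_factor**2 = 10_000 samples
# stays in bucket 1; bucket k holds datasets of roughly 100**k samples.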
def test_gvcfs(spark, tmp_path):
    # GVCF MatrixTables are not keyed by locus and alleles, just by locus
    input_vcf = 'test-data/tabix-test-vcf/combined.chr20_18210071_18210093.g.vcf.gz'
    partitions = [
        hl.Interval(hl.Locus("chr20", 1, reference_genome='GRCh38'),
                    hl.Locus("chr20", 20000000, reference_genome='GRCh38'),
                    includes_end=True)
    ]
    hail_df = functions.from_matrix_table(
        hl.import_gvcfs([input_vcf], partitions, force_bgz=True,
                        reference_genome='GRCh38')[0])
    _assert_lossless_adapter(spark, tmp_path, hail_df, input_vcf, 'vcf', 'bigvcf')
def run_combiner(sample_paths: List[str],
                 out_file: str,
                 tmp_path: str,
                 intervals: Optional[List[hl.utils.Interval]] = None,
                 header: Optional[str] = None,
                 sample_names: Optional[List[str]] = None,
                 branch_factor: int = CombinerConfig.default_branch_factor,
                 batch_size: int = CombinerConfig.default_batch_size,
                 target_records: int = CombinerConfig.default_target_records,
                 overwrite: bool = False,
                 reference_genome: str = 'default',
                 contig_recoding: Optional[Dict[str, str]] = None,
                 key_by_locus_and_alleles: bool = False):
    """Run the Hail VCF combiner, performing a hierarchical merge to create a
    combined sparse matrix table.

    Parameters
    ----------
    sample_paths : :obj:`list` of :obj:`str`
        Paths to individual GVCFs.
    out_file : :obj:`str`
        Path to final combined matrix table.
    tmp_path : :obj:`str`
        Path for intermediate output.
    intervals : list of :class:`.Interval` or None
        Partitioning with which to import GVCFs in first phase of combiner.
    header : :obj:`str` or None
        External header file to use as GVCF header for all inputs. If defined,
        `sample_names` must be defined as well.
    sample_names : list of :obj:`str` or None
        Sample names, to be used with `header`.
    branch_factor : :obj:`int`
        Combiner branch factor.
    batch_size : :obj:`int`
        Combiner batch size.
    target_records : :obj:`int`
        Target records per partition in each combiner phase after the first.
    overwrite : :obj:`bool`
        Overwrite output file, if it exists.
    reference_genome : :obj:`str`
        Reference genome for GVCF import.
    contig_recoding : :obj:`dict` of (:obj:`str`, :obj:`str`), optional
        Mapping from contig name in gVCFs to contig name in the reference
        genome. All contigs must be present in the `reference_genome`, so this
        is useful for mapping differently-formatted data onto known references.
    key_by_locus_and_alleles : :obj:`bool`
        Key by both locus and alleles in the final output.

    Returns
    -------
    None
    """
    tmp_path += f'/combiner-temporary/{uuid.uuid4()}/'
    if header is not None:
        assert sample_names is not None
        assert len(sample_names) == len(sample_paths)

    # FIXME: this should be hl.default_reference().even_intervals_contig_boundary
    intervals = intervals or default_exome_intervals(reference_genome)

    config = CombinerConfig(branch_factor=branch_factor,
                            batch_size=batch_size,
                            target_records=target_records)
    plan = config.plan(len(sample_paths))

    files_to_merge = sample_paths
    n_phases = len(plan.phases)
    total_ops = len(files_to_merge) * n_phases
    total_work_done = 0
    for phase_i, phase in enumerate(plan.phases):
        phase_i += 1  # used for info messages, 1-indexed for readability

        n_jobs = len(phase.jobs)
        merge_str = 'input GVCFs' if phase_i == 1 else 'intermediate sparse matrix tables'
        job_str = hl.utils.misc.plural('job', n_jobs)
        info(f"Starting phase {phase_i}/{n_phases}, merging {len(files_to_merge)} {merge_str} in {n_jobs} {job_str}.")

        if phase_i > 1:
            intervals = calculate_new_intervals(hl.read_matrix_table(files_to_merge[0]).rows(),
                                                config.target_records,
                                                reference_genome=reference_genome)

        new_files_to_merge = []

        for job_i, job in enumerate(phase.jobs):
            job_i += 1  # used for info messages, 1-indexed for readability

            n_merges = len(job.merges)
            merge_str = hl.utils.misc.plural('file', n_merges)
            pct_total = 100 * job.input_total_size / total_ops
            info(f"Starting phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)} to create {n_merges} merged {merge_str}, corresponding to ~{pct_total:.1f}% of total I/O.")
            merge_mts: List[MatrixTable] = []
            for merge in job.merges:
                inputs = [files_to_merge[i] for i in merge.inputs]

                if phase_i == 1:
                    mts = [transform_gvcf(vcf)
                           for vcf in hl.import_gvcfs(inputs, intervals,
                                                      array_elements_required=False,
                                                      _external_header=header,
                                                      _external_sample_ids=[sample_names[i] for i in merge.inputs]
                                                      if header is not None else None,
                                                      reference_genome=reference_genome,
                                                      contig_recoding=contig_recoding)]
                else:
                    mts = [hl.read_matrix_table(path, _intervals=intervals) for path in inputs]

                merge_mts.append(combine_gvcfs(mts))

            if phase_i == n_phases:  # final merge!
                assert n_jobs == 1
                assert len(merge_mts) == 1
                [final_mt] = merge_mts
                if key_by_locus_and_alleles:
                    final_mt = MatrixTable(MatrixKeyRowsBy(final_mt._mir, ['locus', 'alleles'], is_sorted=True))
                final_mt.write(out_file, overwrite=overwrite)
                new_files_to_merge = [out_file]
                info(f"Finished phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)}, 100% of total I/O finished.")
                break

            tmp = f'{tmp_path}_phase{phase_i}_job{job_i}/'
            hl.experimental.write_matrix_tables(merge_mts, tmp, overwrite=True)
            pad = len(str(len(merge_mts)))
            new_files_to_merge.extend(tmp + str(n).zfill(pad) + '.mt' for n in range(len(merge_mts)))
            total_work_done += job.input_total_size
            info(f"Finished {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)}, {100 * total_work_done / total_ops:.1f}% of total I/O finished.")

        info(f"Finished phase {phase_i}/{n_phases}.")

        files_to_merge = new_files_to_merge

    assert files_to_merge == [out_file]

    info("Finished!")
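# A minimal usage sketch for run_combiner as defined above. The bucket paths
# below are hypothetical placeholders; when `intervals` is omitted, this
# version falls back to default_exome_intervals(reference_genome).
hl.init(default_reference='GRCh38')
run_combiner(['gs://my-bucket/sample1.g.vcf.gz',  # hypothetical inputs
              'gs://my-bucket/sample2.g.vcf.gz'],
             out_file='gs://my-bucket/combined.mt',
             tmp_path='gs://my-bucket/tmp',
             reference_genome='GRCh38',
             key_by_locus_and_alleles=True)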
import hail as hl

gvcfs = ['gs://hail-common/test-resources/HG00096.g.vcf.gz',
         'gs://hail-common/test-resources/HG00268.g.vcf.gz']

hl.init(default_reference='GRCh38')

parts_json = [
    {'start': {'locus': {'contig': 'chr20', 'position': 17821257}},
     'end': {'locus': {'contig': 'chr20', 'position': 18708366}},
     'includeStart': True,
     'includeEnd': True},
    {'start': {'locus': {'contig': 'chr20', 'position': 18708367}},
     'end': {'locus': {'contig': 'chr20', 'position': 19776611}},
     'includeStart': True,
     'includeEnd': True},
    {'start': {'locus': {'contig': 'chr20', 'position': 19776612}},
     'end': {'locus': {'contig': 'chr20', 'position': 21144633}},
     'includeStart': True,
     'includeEnd': True},
]

parts = hl.tarray(hl.tinterval(hl.tstruct(locus=hl.tlocus('GRCh38'))))._convert_from_json(parts_json)

for mt in hl.import_gvcfs(gvcfs, parts):
    mt._force_count_rows()
def import_gvcf_force_count(path):
    intervals = vc_all.default_exome_intervals('GRCh38')
    [mt] = hl.import_gvcfs([path], intervals, reference_genome='GRCh38')
    mt._force_count_rows()
def import_and_transform_gvcf(path):
    intervals = vc_all.default_exome_intervals('GRCh38')
    [mt] = hl.import_gvcfs([path], intervals, reference_genome='GRCh38')
    mt = vc_all.transform_gvcf(mt)
    mt._force_count()
def run_combiner(sample_paths: List[str],
                 out_file: str,
                 tmp_path: str,
                 *,
                 intervals: Optional[List[hl.utils.Interval]] = None,
                 import_interval_size: Optional[int] = None,
                 use_genome_default_intervals: bool = False,
                 use_exome_default_intervals: bool = False,
                 header: Optional[str] = None,
                 sample_names: Optional[List[str]] = None,
                 branch_factor: int = CombinerConfig.default_branch_factor,
                 batch_size: int = CombinerConfig.default_batch_size,
                 target_records: int = CombinerConfig.default_target_records,
                 overwrite: bool = False,
                 reference_genome: str = 'default',
                 contig_recoding: Optional[Dict[str, str]] = None,
                 key_by_locus_and_alleles: bool = False):
    """Run the Hail VCF combiner, performing a hierarchical merge to create a
    combined sparse matrix table.

    **Partitioning**

    The partitioning of input GVCFs, which determines the maximum parallelism
    per file, is determined by the four parameters below. One of these
    parameters must be passed to this function.

    - `intervals` -- User-supplied intervals.
    - `import_interval_size` -- Use intervals of this uniform size across the genome.
    - `use_genome_default_intervals` -- Use intervals of typical uniform size for whole-genome GVCFs.
    - `use_exome_default_intervals` -- Use intervals of typical uniform size for exome GVCFs.

    It is recommended that new users include either `use_genome_default_intervals`
    or `use_exome_default_intervals`.

    Note also that the partitioning of the final, combined matrix table does
    not depend on the GVCF input partitioning.

    Parameters
    ----------
    sample_paths : :obj:`list` of :class:`str`
        Paths to individual GVCFs.
    out_file : :class:`str`
        Path to final combined matrix table.
    tmp_path : :class:`str`
        Path for intermediate output.
    intervals : list of :class:`.Interval` or None
        Import GVCFs with specified partition intervals.
    import_interval_size : :obj:`int` or None
        Import GVCFs with uniform partition intervals of specified size.
    use_genome_default_intervals : :obj:`bool`
        Import GVCFs with uniform partition intervals of default size for
        whole-genome data.
    use_exome_default_intervals : :obj:`bool`
        Import GVCFs with uniform partition intervals of default size for
        exome data.
    header : :class:`str` or None
        External header file to use as GVCF header for all inputs. If defined,
        `sample_names` must be defined as well.
    sample_names : list of :class:`str` or None
        Sample names, to be used with `header`.
    branch_factor : :obj:`int`
        Combiner branch factor.
    batch_size : :obj:`int`
        Combiner batch size.
    target_records : :obj:`int`
        Target records per partition in each combiner phase after the first.
    overwrite : :obj:`bool`
        Overwrite output file, if it exists.
    reference_genome : :class:`str`
        Reference genome for GVCF import.
    contig_recoding : :obj:`dict` of (:class:`str`, :class:`str`), optional
        Mapping from contig name in gVCFs to contig name in the reference
        genome. All contigs must be present in the `reference_genome`, so this
        is useful for mapping differently-formatted data onto known references.
    key_by_locus_and_alleles : :obj:`bool`
        Key by both locus and alleles in the final output.
    Returns
    -------
    None
    """
    tmp_path += f'/combiner-temporary/{uuid.uuid4()}/'
    if header is not None:
        assert sample_names is not None
        assert len(sample_names) == len(sample_paths)

    n_partition_args = (int(intervals is not None)
                        + int(import_interval_size is not None)
                        + int(use_genome_default_intervals)
                        + int(use_exome_default_intervals))

    if n_partition_args == 0:
        raise ValueError("'run_combiner': require one argument from 'intervals', 'import_interval_size', "
                         "'use_genome_default_intervals', or 'use_exome_default_intervals' to choose GVCF partitioning")
    if n_partition_args > 1:
        warning("'run_combiner': multiple colliding arguments found from 'intervals', 'import_interval_size', "
                "'use_genome_default_intervals', or 'use_exome_default_intervals'."
                "\n The argument found first in the list in this warning will be used, and others ignored.")

    if intervals is not None:
        info(f"Using {len(intervals)} user-supplied intervals as partitioning for GVCF import")
    elif import_interval_size is not None:
        intervals = calculate_even_genome_partitioning(reference_genome, import_interval_size)
        info(f"Using {len(intervals)} intervals with user-supplied size"
             f" {import_interval_size} as partitioning for GVCF import")
    elif use_genome_default_intervals:
        size = CombinerConfig.default_genome_interval_size
        intervals = calculate_even_genome_partitioning(reference_genome, size)
        info(f"Using {len(intervals)} intervals with default whole-genome size"
             f" {size} as partitioning for GVCF import")
    elif use_exome_default_intervals:
        size = CombinerConfig.default_exome_interval_size
        intervals = calculate_even_genome_partitioning(reference_genome, size)
        info(f"Using {len(intervals)} intervals with default exome size"
             f" {size} as partitioning for GVCF import")
    assert intervals is not None

    config = CombinerConfig(branch_factor=branch_factor,
                            batch_size=batch_size,
                            target_records=target_records)
    plan = config.plan(len(sample_paths))

    files_to_merge = sample_paths
    n_phases = len(plan.phases)
    total_ops = len(files_to_merge) * n_phases
    total_work_done = 0
    for phase_i, phase in enumerate(plan.phases):
        phase_i += 1  # used for info messages, 1-indexed for readability

        n_jobs = len(phase.jobs)
        merge_str = 'input GVCFs' if phase_i == 1 else 'intermediate sparse matrix tables'
        job_str = hl.utils.misc.plural('job', n_jobs)
        info(f"Starting phase {phase_i}/{n_phases}, merging {len(files_to_merge)} {merge_str} in {n_jobs} {job_str}.")

        if phase_i > 1:
            intervals = calculate_new_intervals(hl.read_matrix_table(files_to_merge[0]).rows(),
                                                config.target_records,
                                                reference_genome=reference_genome)

        new_files_to_merge = []

        for job_i, job in enumerate(phase.jobs):
            job_i += 1  # used for info messages, 1-indexed for readability

            n_merges = len(job.merges)
            merge_str = hl.utils.misc.plural('file', n_merges)
            pct_total = 100 * job.input_total_size / total_ops
            info(f"Starting phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)} to create {n_merges} merged {merge_str}, corresponding to ~{pct_total:.1f}% of total I/O.")
            merge_mts: List[MatrixTable] = []
            for merge in job.merges:
                inputs = [files_to_merge[i] for i in merge.inputs]

                if phase_i == 1:
                    mts = [transform_gvcf(vcf)
                           for vcf in hl.import_gvcfs(inputs, intervals,
                                                      array_elements_required=False,
                                                      _external_header=header,
                                                      _external_sample_ids=[[sample_names[i]] for i in merge.inputs]
                                                      if header is not None else None,
                                                      reference_genome=reference_genome,
                                                      contig_recoding=contig_recoding)]
                else:
                    mts = [hl.read_matrix_table(path, _intervals=intervals) for path in inputs]

                merge_mts.append(combine_gvcfs(mts))

            if phase_i == n_phases:  # final merge!
                assert n_jobs == 1
                assert len(merge_mts) == 1
                [final_mt] = merge_mts
                if key_by_locus_and_alleles:
                    final_mt = MatrixTable(MatrixKeyRowsBy(final_mt._mir, ['locus', 'alleles'], is_sorted=True))
                final_mt.write(out_file, overwrite=overwrite)
                new_files_to_merge = [out_file]
                info(f"Finished phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)}, 100% of total I/O finished.")
                break

            tmp = f'{tmp_path}_phase{phase_i}_job{job_i}/'
            hl.experimental.write_matrix_tables(merge_mts, tmp, overwrite=True)
            pad = len(str(len(merge_mts)))
            new_files_to_merge.extend(tmp + str(n).zfill(pad) + '.mt' for n in range(len(merge_mts)))
            total_work_done += job.input_total_size
            info(f"Finished {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)}, {100 * total_work_done / total_ops:.1f}% of total I/O finished.")

        info(f"Finished phase {phase_i}/{n_phases}.")

        files_to_merge = new_files_to_merge

    assert files_to_merge == [out_file]

    info("Finished!")
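# A minimal usage sketch for the keyword-only run_combiner above. One of the
# four partitioning arguments must be supplied; here the whole-genome default
# intervals are chosen. Paths are hypothetical placeholders.
hl.init(default_reference='GRCh38')
run_combiner(['gs://my-bucket/sample1.g.vcf.gz',  # hypothetical inputs
              'gs://my-bucket/sample2.g.vcf.gz'],
             out_file='gs://my-bucket/combined.mt',
             tmp_path='gs://my-bucket/tmp',
             use_genome_default_intervals=True,
             reference_genome='GRCh38',
             key_by_locus_and_alleles=True)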