def setup(path):
    """Import a single GVCF restricted to the whole of chromosome 1 (GRCh38).

    Returns the one resulting MatrixTable.
    """
    # 'START' and 'END' are parse_locus_interval keywords meaning the
    # first and last positions of the contig.
    whole_chr1 = hl.eval(
        hl.parse_locus_interval('chr1:START-END', reference_genome='GRCh38'))
    return hl.import_vcfs([path], [whole_chr1], reference_genome='GRCh38')[0]
def h(paths, sample_names, tmp_path, json, header, out_path, i):
    """Run one combining round over the GVCFs in `paths`.

    Imports the GVCFs (partitioned per `json`), transforms each, and merges
    them in chunks of MAX_COMBINE_NUMBER.  If everything fits in a single
    chunk the merged result is written to `out_path` and an empty list is
    returned; otherwise the per-chunk results are staged under
    `tmp_path + f'{i}/'` and their paths are returned for the next round.
    """
    imported = hl.import_vcfs(paths, json,
                              array_elements_required=False,
                              _external_header=header,
                              _external_sample_ids=sample_names)
    transformed = [comb.transform_one(mt) for mt in imported]
    merged = [comb.combine_gvcfs(chunk)
              for chunk in chunks(transformed, MAX_COMBINE_NUMBER)]

    # A single chunk means a single merged table: write it out and finish.
    if len(paths) <= MAX_COMBINE_NUMBER:
        merged[0].write(out_path, overwrite=True)
        return []

    stage_dir = tmp_path + f'{i}/'
    width = len(str(len(merged)))  # zero-pad width used by write_matrix_tables
    hl.experimental.write_matrix_tables(merged, stage_dir, overwrite=True)
    return [stage_dir + str(n).zfill(width) + '.mt' for n in range(len(merged))]
def run_combiner(sample_list, intervals, out_path, tmp_path, summary_path=None, overwrite=False):
    """Combine the GVCFs in `sample_list` into one matrix table at `out_path`.

    Repeatedly merges in chunks of MAX_COMBINER_LENGTH, staging intermediate
    rounds under a unique subdirectory of `tmp_path`, until a single table
    remains.  If `summary_path` is given, a row summary of the final table is
    written there as well.
    """
    import gc

    # Always work inside a unique temporary directory, whatever was passed in.
    tmp_path += f'/combiner-temporary/{uuid.uuid4()}/'

    transformed = [comb.transform_one(mt)
                   for mt in hl.import_vcfs(sample_list, intervals,
                                            array_elements_required=False)]
    merged = [comb.combine_gvcfs(batch)
              for batch in chunks(transformed, MAX_COMBINER_LENGTH)]

    if len(merged) > 1:
        hl.utils.java.info(f'Writing combiner temporary files to: {tmp_path}')
        round_num = 0
        while len(merged) > 1:
            stage = tmp_path + f'{round_num}/'
            width = len(str(len(merged)))
            hl.experimental.write_matrix_tables(merged, stage, overwrite=True)
            staged = [stage + str(n).zfill(width) + '.mt'
                      for n in range(len(merged))]
            round_num += 1
            reread = [hl.read_matrix_table(p) for p in staged]
            merged = [comb.combine_gvcfs(batch)
                      for batch in chunks(reread, MAX_COMBINER_LENGTH)]
            gc.collect()  # need to try to free memory on the master
    merged[0].write(out_path, overwrite=overwrite)

    if summary_path is not None:
        comb.summarize(hl.read_matrix_table(out_path)).rows().write(
            summary_path, overwrite=overwrite)
def run_combiner(sample_list, json, out_path, tmp_path, summary_path=None, overwrite=False):
    """Combine the GVCFs in `sample_list` into one matrix table at `out_path`.

    `json` describes the import partitioning.  Merging proceeds in rounds of
    MAX_COMBINER_LENGTH-sized groups, with each round's intermediates staged
    under a fresh subdirectory of `tmp_path`.  Optionally writes a row
    summary of the result to `summary_path`.
    """
    import gc

    # Make the temp path a unique directory, no matter what was supplied.
    tmp_path += f'/combiner-temporary/{uuid.uuid4()}/'

    mts = hl.import_vcfs(sample_list, json, array_elements_required=False)
    combined = [comb.combine_gvcfs(group)
                for group in chunks([comb.transform_one(mt) for mt in mts],
                                    MAX_COMBINER_LENGTH)]

    if len(combined) == 1:
        combined[0].write(out_path, overwrite=overwrite)
    else:
        hl.utils.java.info(f'Writing combiner temporary files to: {tmp_path}')
        step = 0
        while len(combined) > 1:
            out_dir = tmp_path + f'{step}/'
            digits = len(str(len(combined)))
            hl.experimental.write_matrix_tables(combined, out_dir, overwrite=True)
            written = [out_dir + str(idx).zfill(digits) + '.mt'
                       for idx in range(len(combined))]
            step += 1
            combined = [comb.combine_gvcfs(group)
                        for group in chunks([hl.read_matrix_table(p) for p in written],
                                            MAX_COMBINER_LENGTH)]
            gc.collect()  # need to try to free memory on the master
        combined[0].write(out_path, overwrite=overwrite)

    if summary_path is not None:
        mt = hl.read_matrix_table(out_path)
        comb.summarize(mt).rows().write(summary_path, overwrite=overwrite)
# NOTE(review): this line is a truncated fragment — it begins mid-way through a
# partition-interval list whose opening (the first interval's 'start' struct)
# is missing from this file.  Left byte-identical; the complete, equivalent
# script appears elsewhere in this file.  TODO: restore or remove the fragment.
} }, 'end': { 'locus': { 'contig': 'chr20', 'position': 19776611 } }, 'includeStart': True, 'includeEnd': True }, { 'start': { 'locus': { 'contig': 'chr20', 'position': 19776612 } }, 'end': { 'locus': { 'contig': 'chr20', 'position': 21144633 } }, 'includeStart': True, 'includeEnd': True }, ] parts_str = json.dumps(parts) vcfs = hl.import_vcfs(gvcfs, parts_str)
import json

import hail as hl

# Two sample GVCFs from the CI bucket.
gvcfs = ['gs://hail-ci/gvcfs/HG00096.g.vcf.gz',
         'gs://hail-ci/gvcfs/HG00268.g.vcf.gz']
hl.init(default_reference='GRCh38')


def _interval(start_pos, end_pos):
    # Closed chr20 interval in the JSON shape import_vcfs expects.
    return {'start': {'locus': {'contig': 'chr20', 'position': start_pos}},
            'end': {'locus': {'contig': 'chr20', 'position': end_pos}},
            'includeStart': True,
            'includeEnd': True}


# Three adjacent partitions covering chr20:17821257-21144633.
parts = [_interval(17821257, 18708366),
         _interval(18708367, 19776611),
         _interval(19776612, 21144633)]
parts_str = json.dumps(parts)
vcfs = hl.import_vcfs(gvcfs, parts_str)
import hail as hl

# Two sample GVCFs from the shared test-resources bucket.
gvcfs = ['gs://hail-common/test-resources/HG00096.g.vcf.gz',
         'gs://hail-common/test-resources/HG00268.g.vcf.gz']
hl.init(default_reference='GRCh38')

# Adjacent closed-interval partition boundaries on chr20.
_bounds = [(17821257, 18708366),
           (18708367, 19776611),
           (19776612, 21144633)]
parts_json = [{'start': {'locus': {'contig': 'chr20', 'position': lo}},
               'end': {'locus': {'contig': 'chr20', 'position': hi}},
               'includeStart': True,
               'includeEnd': True}
              for lo, hi in _bounds]
# Convert the JSON description into typed GRCh38 locus intervals.
parts = hl.tarray(
    hl.tinterval(hl.tstruct(locus=hl.tlocus('GRCh38'))))._convert_from_json(parts_json)

# Force a full pass over each imported partition's rows.
for mt in hl.import_vcfs(gvcfs, parts):
    mt._force_count_rows()