def run_combiner(samples, intervals, out_file, tmp_path, header, overwrite=True):
    """Merge a collection of gVCFs into one matrix table written to ``out_file``.

    The inputs are first handed to ``stage_one``; the tables it reports are
    then merged in rounds of at most ``MAX_COMBINE_NUMBER`` tables, writing
    each round's results to a temporary directory and reading them back,
    until a single table remains.

    Args:
        samples: sized collection of ``(sample_name, path)`` pairs.
        intervals: forwarded unchanged to ``stage_one``.
        out_file: destination of the final combined matrix table.
        tmp_path: base location for temporary files; a unique
            ``combiner-temporary/<uuid>/`` directory is created below it.
        header: forwarded unchanged to ``stage_one``.
        overwrite: passed to the final ``write`` of the combined table.

    Returns:
        None.
    """
    tmp_path += f'/combiner-temporary/{uuid.uuid4()}/'
    sample_names, paths = [list(x) for x in zip(*samples)]
    # every input file is paired with a one-element list of sample ids
    sample_names = [[n] for n in sample_names]
    assert len(paths) == len(samples)
    out_paths = stage_one(paths, sample_names, tmp_path, intervals, header, out_file)
    if not out_paths:
        # NOTE(review): presumably stage_one already wrote out_file when it
        # reports no intermediate tables -- confirm against stage_one.
        return
    # keep this stage's intermediates apart from whatever stage_one wrote
    tmp_path += f'{uuid.uuid4()}/'
    mts = [hl.read_matrix_table(path) for path in out_paths]
    combined_mts = [comb.combine_gvcfs(group) for group in chunks(mts, MAX_COMBINE_NUMBER)]
    i = 0
    while len(combined_mts) > 1:
        tmp = tmp_path + f'{i}/'
        # tables are read back from zero-padded '<n>.mt' names under tmp
        pad = len(str(len(combined_mts)))
        hl.experimental.write_matrix_tables(combined_mts, tmp, overwrite=True)
        paths = [tmp + str(n).zfill(pad) + '.mt' for n in range(len(combined_mts))]
        mts = [hl.read_matrix_table(path) for path in paths]
        # Combine each chunk on its own.  The previous code passed the whole
        # list ``mts`` for every chunk, so each output was a merge of *all*
        # tables and the samples were duplicated once per chunk.
        combined_mts = [comb.combine_gvcfs(group) for group in chunks(mts, MAX_COMBINE_NUMBER)]
        i += 1
    combined_mts[0].write(out_file, overwrite=overwrite)
def h(paths, sample_names, tmp_path, json, header, out_path, i, first):
    """Import one batch of gVCFs, convert each to the combiner's format and merge them.

    The transformed inputs are merged in groups of at most
    ``MAX_COMBINE_NUMBER``.  When this is the first batch (``first`` is truthy)
    and the whole batch fits in a single group, the merged table is written
    straight to ``out_path`` and an empty list is returned.  Otherwise the
    merged tables are written under ``tmp_path + f'{i}/'`` and the zero-padded
    ``<n>.mt`` paths below that directory are returned.
    """
    imported = hl.import_gvcfs(paths,
                               json,
                               array_elements_required=False,
                               _external_header=header,
                               _external_sample_ids=sample_names)
    transformed = [comb.transform_one(vcf) for vcf in imported]
    merged = [comb.combine_gvcfs(group) for group in chunks(transformed, MAX_COMBINE_NUMBER)]

    if not first or len(paths) > MAX_COMBINE_NUMBER:
        # either other batches exist or this one needed several merges:
        # stash the intermediate tables for a later round
        batch_dir = tmp_path + f'{i}/'
        width = len(str(len(merged)))
        hl.experimental.write_matrix_tables(merged, batch_dir, overwrite=True)
        return [batch_dir + str(n).zfill(width) + '.mt' for n in range(len(merged))]

    # a single merged table and nothing written before it: emit it directly
    merged[0].write(out_path, overwrite=True)
    return []
def run_combiner(sample_list, intervals, out_path, tmp_path, summary_path=None, overwrite=False):
    """Import, transform and merge the given VCFs into one matrix table at ``out_path``.

    Inputs are merged in groups of at most ``MAX_COMBINER_LENGTH``.  While more
    than one merged table remains, the tables are written to a numbered
    directory below the temporary path, read back and merged again.  If
    ``summary_path`` is given, the rows of ``comb.summarize`` applied to the
    written result are stored there.

    Args:
        sample_list: passed as the first argument to ``hl.import_vcfs``.
        intervals: passed as the second argument to ``hl.import_vcfs``.
        out_path: destination of the combined matrix table.
        tmp_path: base location for temporary files.
        summary_path: optional destination for the summary table.
        overwrite: passed to the final write and to the summary write.
    """
    import gc

    # appending a unique suffix makes the temp location a directory regardless of input
    tmp_path += f'/combiner-temporary/{uuid.uuid4()}/'

    imported = hl.import_vcfs(sample_list, intervals, array_elements_required=False)
    transformed = [comb.transform_one(vcf) for vcf in imported]
    merged = [comb.combine_gvcfs(group) for group in chunks(transformed, MAX_COMBINER_LENGTH)]

    if len(merged) != 1:
        hl.utils.java.info(f'Writing combiner temporary files to: {tmp_path}')
        round_no = 0
        while len(merged) > 1:
            round_dir = tmp_path + f'{round_no}/'
            width = len(str(len(merged)))
            hl.experimental.write_matrix_tables(merged, round_dir, overwrite=True)
            written = [round_dir + str(n).zfill(width) + '.mt' for n in range(len(merged))]
            round_no += 1
            reread = [hl.read_matrix_table(p) for p in written]
            merged = [comb.combine_gvcfs(group) for group in chunks(reread, MAX_COMBINER_LENGTH)]
            # try to release memory held on the master between rounds
            gc.collect()

    merged[0].write(out_path, overwrite=overwrite)

    if summary_path is not None:
        result = hl.read_matrix_table(out_path)
        summary = comb.summarize(result).rows()
        summary.write(summary_path, overwrite=overwrite)
def h(paths, sample_names, tmp_path, json, header, out_path, i):
    """Import one batch of VCFs, transform each one and merge them.

    The transformed inputs are merged in groups of at most
    ``MAX_COMBINE_NUMBER``.  If the batch fits in a single group the merged
    table is written directly to ``out_path`` and an empty list is returned;
    otherwise the merged tables are written under ``tmp_path + f'{i}/'`` and
    the zero-padded ``<n>.mt`` paths below that directory are returned.

    NOTE(review): the direct-write shortcut does not look at ``i``, so a small
    batch with ``i > 0`` would also be written to ``out_path`` -- confirm the
    caller never relies on intermediate paths in that situation.
    """
    imported = hl.import_vcfs(paths,
                              json,
                              array_elements_required=False,
                              _external_header=header,
                              _external_sample_ids=sample_names)
    transformed = [comb.transform_one(vcf) for vcf in imported]
    merged = [comb.combine_gvcfs(group) for group in chunks(transformed, MAX_COMBINE_NUMBER)]

    if len(paths) > MAX_COMBINE_NUMBER:
        # several merged tables: stash them for a later round
        batch_dir = tmp_path + f'{i}/'
        width = len(str(len(merged)))
        hl.experimental.write_matrix_tables(merged, batch_dir, overwrite=True)
        return [batch_dir + str(n).zfill(width) + '.mt' for n in range(len(merged))]

    # a single merged table: emit it directly
    merged[0].write(out_path, overwrite=True)
    return []
def run_combiner(sample_list, json, out_path, tmp_path, summary_path=None, overwrite=False):
    """Import, transform and merge the given VCFs into one matrix table at ``out_path``.

    Inputs are merged in groups of at most ``MAX_COMBINER_LENGTH``.  While more
    than one merged table remains, the tables are written to a numbered
    directory below the temporary path, read back and merged again.  If
    ``summary_path`` is given, the rows of ``comb.summarize`` applied to the
    written result are stored there.

    Args:
        sample_list: passed as the first argument to ``hl.import_vcfs``.
        json: passed as the second argument to ``hl.import_vcfs``.
        out_path: destination of the combined matrix table.
        tmp_path: base location for temporary files.
        summary_path: optional destination for the summary table.
        overwrite: passed to the final write and to the summary write.
    """
    import gc

    # appending a unique suffix makes the temp location a directory regardless of input
    tmp_path += f'/combiner-temporary/{uuid.uuid4()}/'

    imported = hl.import_vcfs(sample_list, json, array_elements_required=False)
    transformed = [comb.transform_one(vcf) for vcf in imported]
    merged = [comb.combine_gvcfs(group) for group in chunks(transformed, MAX_COMBINER_LENGTH)]

    needs_rounds = len(merged) != 1
    if needs_rounds:
        hl.utils.java.info(f'Writing combiner temporary files to: {tmp_path}')
        round_no = 0
        while len(merged) > 1:
            round_dir = tmp_path + f'{round_no}/'
            width = len(str(len(merged)))
            hl.experimental.write_matrix_tables(merged, round_dir, overwrite=True)
            written = [round_dir + str(n).zfill(width) + '.mt' for n in range(len(merged))]
            round_no += 1
            reread = [hl.read_matrix_table(p) for p in written]
            merged = [comb.combine_gvcfs(group) for group in chunks(reread, MAX_COMBINER_LENGTH)]
            # try to release memory held on the master between rounds
            gc.collect()

    merged[0].write(out_path, overwrite=overwrite)

    if summary_path is not None:
        result = hl.read_matrix_table(out_path)
        summary = comb.summarize(result).rows()
        summary.write(summary_path, overwrite=overwrite)
def python_only_10k_combine(path):
    """Build the combine of 10,000 copies of one transformed gVCF, in chunks.

    The input is loaded with ``setup(path)`` and transformed once; the same
    transformed object is repeated 10,000 times and ``comb.combine_gvcfs`` is
    called on each chunk of at most ``COMBINE_GVCF_MAX`` entries.  The results
    are discarded.
    """
    transformed = comb.transform_gvcf(setup(path))
    replicated = [transformed] * 10_000
    _ = [comb.combine_gvcfs(group) for group in chunks(replicated, COMBINE_GVCF_MAX)]
def compile_2k_merge(path):
    """Write 20 copies of a ``COMBINE_GVCF_MAX``-way merge to a scratch directory.

    The input is loaded with ``setup(path)`` and transformed once; that single
    transformed object is repeated ``COMBINE_GVCF_MAX`` times and merged, and
    the merged table is repeated 20 times and written with
    ``hl.experimental.write_matrix_tables`` inside a temporary directory that
    is removed when the function returns.
    """
    transformed = comb.transform_gvcf(setup(path))
    inputs = [transformed] * COMBINE_GVCF_MAX
    merged = comb.combine_gvcfs(inputs)
    outputs = [merged] * 20
    with TemporaryDirectory() as scratch:
        target = os.path.join(scratch, 'combiner-multi-write')
        hl.experimental.write_matrix_tables(outputs, target, overwrite=True)