def test_make_split_files_2(self):
    """test make_split_files with different input from previous test"""
    # These records cause a minos bug. Last record was not being used
    # when merging because the index was wrong.
    # They are test data from multi_sample_pipeline tests
    infile = os.path.join(data_dir, "make_split_files2.in.vcf")
    tmp_out = "tmp.vcf_chunker.make_split_files2"
    ref_fa = os.path.join(data_dir, "make_split_files2.in.ref.fa")
    if os.path.exists(tmp_out):
        shutil.rmtree(tmp_out)

    chunker = vcf_chunker.VcfChunker(
        tmp_out,
        vcf_infile=infile,
        ref_fasta=ref_fa,
        variants_per_split=2,
        flank_length=200,
        gramtools_kmer_size=5,
    )
    chunker.make_split_files()
    self.assertTrue(os.path.exists(chunker.metadata_pickle))

    chunker2 = vcf_chunker.VcfChunker(tmp_out, gramtools_kmer_size=5)
    self.assertEqual(1, len(chunker2.vcf_split_files))
    self.assertEqual(3, len(chunker2.vcf_split_files["ref.0"]))
    self.assertEqual(4, chunker2.vcf_split_files["ref.0"][-1].use_end_index)
    shutil.rmtree(tmp_out)

    # Test with two threads
    chunker = vcf_chunker.VcfChunker(
        tmp_out,
        vcf_infile=infile,
        ref_fasta=ref_fa,
        variants_per_split=2,
        flank_length=200,
        threads=2,
        gramtools_kmer_size=5,
    )
    chunker.make_split_files()
    self.assertTrue(os.path.exists(chunker.metadata_pickle))

    chunker2 = vcf_chunker.VcfChunker(tmp_out, gramtools_kmer_size=5)
    self.assertEqual(1, len(chunker2.vcf_split_files))
    self.assertEqual(3, len(chunker2.vcf_split_files["ref.0"]))
    self.assertEqual(4, chunker2.vcf_split_files["ref.0"][-1].use_end_index)
    shutil.rmtree(tmp_out)
def run(options):
    chunker = vcf_chunker.VcfChunker(
        options.outdir,
        vcf_infile=options.vcf_file,
        ref_fasta=options.ref_fasta,
        variants_per_split=options.variants_per_split,
        alleles_per_split=options.alleles_per_split,
        total_splits=options.total_splits,
        flank_length=200,
        gramtools_kmer_size=options.gramtools_kmer_size,
        threads=options.threads,
    )
    chunker.make_split_files()
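# A minimal sketch of how run() might be invoked from a command-line parser.
# The flag names and defaults below are illustrative assumptions, not the
# actual minos CLI definition; run() only requires that the options namespace
# carries the attributes it reads.
import argparse

def example_cli(argv=None):
    parser = argparse.ArgumentParser(description="Split a VCF file into chunks")
    parser.add_argument("outdir")
    parser.add_argument("vcf_file")
    parser.add_argument("ref_fasta")
    parser.add_argument("--variants_per_split", type=int)
    parser.add_argument("--alleles_per_split", type=int)
    parser.add_argument("--total_splits", type=int)
    parser.add_argument("--gramtools_kmer_size", type=int, default=10)
    parser.add_argument("--threads", type=int, default=1)
    options = parser.parse_args(argv)
    run(options)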
def test_merge_files(self):
    '''test merge_files'''
    vcf_to_split = os.path.join(data_dir, 'merge_files.in.vcf')
    ref_fasta = os.path.join(data_dir, 'merge_files.in.ref.fa')
    tmp_outdir = 'tmp.vcf_chunker.merge_files'
    chunker = vcf_chunker.VcfChunker(
        tmp_outdir,
        vcf_infile=vcf_to_split,
        ref_fasta=ref_fasta,
        variants_per_split=4,
        flank_length=3,
        gramtools_kmer_size=5,
    )
    chunker.make_split_files()

    to_merge = {}
    for ref, split_list in chunker.vcf_split_files.items():
        to_merge[ref] = [x.filename for x in split_list]

    tmp_vcf_out = 'tmp.vcf_chunker.merge_files.out.vcf'
    chunker.merge_files(to_merge, tmp_vcf_out)
    # Splitting then merging should reproduce the original file exactly
    self.assertTrue(filecmp.cmp(vcf_to_split, tmp_vcf_out, shallow=False))
    os.unlink(tmp_vcf_out)
    shutil.rmtree(tmp_outdir)
def test_make_split_files(self):
    '''test make_split_files'''
    infile = os.path.join(data_dir, 'make_split_files.in.vcf')
    tmp_out = 'tmp.vcf_chunker.make_split_files'
    ref_fa = os.path.join(data_dir, 'make_split_files.in.ref.fa')
    if os.path.exists(tmp_out):
        shutil.rmtree(tmp_out)

    vcf1 = cluster_vcf_records.vcf_record.VcfRecord('ref1\t1\t.\tG\tT\t.\tPASS\t.\t.\t.')
    vcf2 = cluster_vcf_records.vcf_record.VcfRecord('ref1\t2\t.\tC\tT\t.\tPASS\t.\t.\t.')
    vcf3 = cluster_vcf_records.vcf_record.VcfRecord('ref1\t3\t.\tT\tA\t.\tPASS\t.\t.\t.')
    vcf4 = cluster_vcf_records.vcf_record.VcfRecord('ref1\t5\t.\tAGAGTCACGTA\tG\t.\tPASS\t.\t.\t.')
    vcf5 = cluster_vcf_records.vcf_record.VcfRecord('ref1\t18\t.\tA\tG\t.\tPASS\t.\t.\t.')
    vcf6 = cluster_vcf_records.vcf_record.VcfRecord('ref1\t21\t.\tG\tT\t.\tPASS\t.\t.\t.')
    vcf7 = cluster_vcf_records.vcf_record.VcfRecord('ref2\t42\t.\tC\tG\t.\tPASS\t.\t.\t.')
    header_lines = [
        '##header1',
        '##header2',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_name',
    ]

    chunker = vcf_chunker.VcfChunker(
        tmp_out,
        vcf_infile=infile,
        ref_fasta=ref_fa,
        variants_per_split=2,
        flank_length=1,
        gramtools_kmer_size=5,
    )
    chunker.make_split_files()
    self.assertTrue(os.path.exists(chunker.metadata_pickle))

    got_header, got_records = cluster_vcf_records.vcf_file_read.vcf_file_to_list(
        os.path.join(tmp_out, 'split.0.in.vcf'))
    self.assertEqual(header_lines, got_header)
    self.assertEqual([vcf1, vcf2, vcf3], got_records)
    got_header, got_records = cluster_vcf_records.vcf_file_read.vcf_file_to_list(
        os.path.join(tmp_out, 'split.1.in.vcf'))
    self.assertEqual(header_lines, got_header)
    self.assertEqual([vcf2, vcf3, vcf4], got_records)
    got_header, got_records = cluster_vcf_records.vcf_file_read.vcf_file_to_list(
        os.path.join(tmp_out, 'split.2.in.vcf'))
    self.assertEqual(header_lines, got_header)
    self.assertEqual([vcf5, vcf6], got_records)
    got_header, got_records = cluster_vcf_records.vcf_file_read.vcf_file_to_list(
        os.path.join(tmp_out, 'split.3.in.vcf'))
    self.assertEqual(header_lines, got_header)
    self.assertEqual([vcf7], got_records)
    self.assertFalse(os.path.exists(os.path.join(tmp_out, 'split.4.in.vcf')))
    shutil.rmtree(tmp_out)

    chunker = vcf_chunker.VcfChunker(
        tmp_out,
        vcf_infile=infile,
        ref_fasta=ref_fa,
        variants_per_split=4,
        flank_length=3,
        gramtools_kmer_size=5,
    )
    chunker.make_split_files()
    self.assertTrue(os.path.exists(chunker.metadata_pickle))

    got_header, got_records = cluster_vcf_records.vcf_file_read.vcf_file_to_list(
        os.path.join(tmp_out, 'split.0.in.vcf'))
    self.assertEqual(header_lines, got_header)
    self.assertEqual([vcf1, vcf2, vcf3, vcf4, vcf5], got_records)
    got_header, got_records = cluster_vcf_records.vcf_file_read.vcf_file_to_list(
        os.path.join(tmp_out, 'split.1.in.vcf'))
    self.assertEqual(header_lines, got_header)
    self.assertEqual([vcf4, vcf5, vcf6], got_records)
    got_header, got_records = cluster_vcf_records.vcf_file_read.vcf_file_to_list(
        os.path.join(tmp_out, 'split.2.in.vcf'))
    self.assertEqual(header_lines, got_header)
    self.assertEqual([vcf7], got_records)
    self.assertFalse(os.path.exists(os.path.join(tmp_out, 'split.3.in.vcf')))

    # Reloading from the output directory should recover the same metadata
    chunker2 = vcf_chunker.VcfChunker(tmp_out, gramtools_kmer_size=5)
    self.assertEqual(chunker.vcf_infile, chunker2.vcf_infile)
    self.assertEqual(chunker.ref_fasta, chunker2.ref_fasta)
    self.assertEqual(chunker.variants_per_split, chunker2.variants_per_split)
    self.assertEqual(chunker.total_splits, chunker2.total_splits)
    self.assertEqual(chunker.flank_length, chunker2.flank_length)
    self.assertEqual(chunker.gramtools_kmer_size, chunker2.gramtools_kmer_size)
    self.assertEqual(chunker.max_read_length, chunker2.max_read_length)
    self.assertEqual(chunker.total_split_files, chunker2.total_split_files)
    self.assertEqual(chunker.vcf_split_files, chunker2.vcf_split_files)
    shutil.rmtree(tmp_out)
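# Illustrative sketch (not the VcfChunker implementation) of the trailing-flank
# arithmetic the assertions above rely on: a chunk takes variants_per_split
# records, then also pulls in any following record that starts within
# flank_length of the last record it holds. The real chunker also adds a
# leading flank and accounts for REF allele lengths; names here are invented.
def records_in_first_chunk(positions, variants_per_split, flank_length):
    """positions: sorted 1-based POS values of records on one chromosome."""
    end = min(variants_per_split, len(positions)) - 1
    flank_end = positions[end] + flank_length
    # Keep extending while the next record starts inside the trailing flank
    while end + 1 < len(positions) and positions[end + 1] <= flank_end:
        end += 1
        flank_end = max(flank_end, positions[end] + flank_length)
    return positions[:end + 1]

# ref1 positions from the test: pos 3 falls inside the flank after pos 2,
# which is why split.0.in.vcf above holds three records, not two.
assert records_in_first_chunk([1, 2, 3, 5, 18, 21], 2, 1) == [1, 2, 3]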
def _run_gramtools_with_split_vcf(self):
    logging.info('Splitting VCF files into chunks (if not already done)')
    chunker = vcf_chunker.VcfChunker(
        self.split_input_dir,
        vcf_infile=self.clustered_vcf,
        ref_fasta=self.ref_fasta,
        variants_per_split=self.variants_per_split,
        alleles_per_split=self.alleles_per_split,
        max_read_length=self.max_read_length,
        total_splits=self.total_splits,
        flank_length=self.max_read_length,
        gramtools_kmer_size=self.gramtools_kmer_size,
    )
    chunker.make_split_files()
    self.gramtools_kmer_size = chunker.gramtools_kmer_size
    logging.info('VCF file split into ' + str(chunker.total_split_files) + ' chunks')

    try:
        os.mkdir(self.split_output_dir)
    except OSError as error:
        raise Error('Error making output split directory ' + self.split_output_dir) from error

    unmapped_reads_file = os.path.join(self.split_output_dir, 'unmapped_reads.bam')
    bam_read_extract.get_unmapped_reads(self.reads_files[0], unmapped_reads_file)
    split_vcf_outfiles = {}
    split_vcf_outfiles_unfiltered = {}
    mean_depths = []
    depth_variances = []

    for ref_name, split_file_list in chunker.vcf_split_files.items():
        split_vcf_outfiles[ref_name] = []
        split_vcf_outfiles_unfiltered[ref_name] = []
        for split_file in split_file_list:
            logging.info('===== Start analysing variants in VCF split file ' + split_file.filename + ' =====')
            split_reads_file = os.path.join(
                self.split_output_dir,
                'split.' + str(split_file.file_number) + '.reads.bam')
            bam_read_extract.get_region(
                self.reads_files[0],
                split_file.chrom,
                split_file.chrom_start,
                split_file.chrom_end,
                split_reads_file,
            )

            gramtools_quasimap_dir = os.path.join(
                self.split_output_dir,
                'split.' + str(split_file.file_number) + '.gramtools.quasimap')
            build_report, quasimap_report = gramtools.run_gramtools(
                split_file.gramtools_build_dir,
                gramtools_quasimap_dir,
                split_file.filename,
                self.ref_fasta,
                [unmapped_reads_file, split_reads_file],
                self.max_read_length,
                kmer_size=self.gramtools_kmer_size,
            )

            logging.info('Loading split gramtools quasimap output files ' + gramtools_quasimap_dir)
            perl_generated_vcf = os.path.join(split_file.gramtools_build_dir, 'perl_generated_vcf')
            (mean_depth, depth_variance, vcf_header, vcf_records,
             allele_coverage, allele_groups) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
                perl_generated_vcf, gramtools_quasimap_dir)
            mean_depths.append(mean_depth)
            depth_variances.append(depth_variance)
            logging.info('Finished loading gramtools files')

            if self.sample_name is None:
                sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(vcf_header)
            else:
                sample_name = self.sample_name
            assert sample_name is not None

            split_vcf_out = os.path.join(
                self.split_output_dir,
                'split.' + str(split_file.file_number) + '.out.vcf')
            unfiltered_vcf_out = os.path.join(
                self.split_output_dir,
                'split.' + str(split_file.file_number) + '.out.debug.calls_with_zero_cov_alleles.vcf')
            logging.info('Writing VCF output file ' + split_vcf_out + ' for split VCF file ' + split_file.filename)
            gramtools.write_vcf_annotated_using_coverage_from_gramtools(
                mean_depth,
                vcf_records,
                allele_coverage,
                allele_groups,
                self.read_error_rate,
                unfiltered_vcf_out,
                self.gramtools_kmer_size,
                sample_name=sample_name,
                max_read_length=self.max_read_length,
                filtered_outfile=split_vcf_out,
            )
            split_vcf_outfiles[ref_name].append(split_vcf_out)
            split_vcf_outfiles_unfiltered[ref_name].append(unfiltered_vcf_out)

            if self.clean:
                logging.info('Cleaning gramtools files from split VCF file ' + split_file.filename)
                if not self.user_supplied_gramtools_build_dir:
                    os.rename(
                        os.path.join(split_file.gramtools_build_dir, 'build_report.json'),
                        split_file.gramtools_build_dir + '.report.json')
                    shutil.rmtree(split_file.gramtools_build_dir)
                    os.unlink(split_file.filename)
                os.rename(
                    os.path.join(gramtools_quasimap_dir, 'report.json'),
                    gramtools_quasimap_dir + '.report.json')
                shutil.rmtree(gramtools_quasimap_dir)
                os.unlink(split_reads_file)

            logging.info('===== Finish analysing variants in VCF split file ' + split_file.filename + ' =====')

    logging.info('Merging VCF files into one output file ' + self.final_vcf)
    chunker.merge_files(split_vcf_outfiles, self.final_vcf)
    chunker.merge_files(split_vcf_outfiles_unfiltered, self.unfiltered_vcf_file)

    mean_depth = statistics.mean(mean_depths)
    depth_variance = statistics.mean(depth_variances)
    logging.info(
        f'Adding GT_CONF_PERCENTILE to final VCF file {self.final_vcf}, '
        f'using mean depth {mean_depth}, depth variance {depth_variance}, '
        f'error rate {self.read_error_rate}, and '
        f'{self.genotype_simulation_iterations} simulation iterations')
    Adjudicator._add_gt_conf_percentile_to_vcf_file(
        self.final_vcf,
        mean_depth,
        depth_variance,
        self.read_error_rate,
        self.genotype_simulation_iterations,
    )

    if self.clean:
        logging.info('Deleting temp split VCF files')
        for d in split_vcf_outfiles, split_vcf_outfiles_unfiltered:
            for file_list in d.values():
                for filename in file_list:
                    os.unlink(filename)
        os.unlink(unmapped_reads_file)
def _run_gramtools_with_split_vcf(self):
    logging.info("Splitting VCF files into chunks (if not already done)")
    chunker = vcf_chunker.VcfChunker(
        self.split_input_dir,
        vcf_infile=self.clustered_vcf,
        ref_fasta=self.ref_fasta,
        variants_per_split=self.variants_per_split,
        alleles_per_split=self.alleles_per_split,
        total_splits=self.total_splits,
        gramtools_kmer_size=self.gramtools_kmer_size,
    )
    chunker.make_split_files()
    self.gramtools_kmer_size = chunker.gramtools_kmer_size
    logging.info("VCF file split into " + str(chunker.total_split_files) + " chunks")

    try:
        os.mkdir(self.split_output_dir)
    except OSError as error:
        raise Exception("Error making output split directory " + self.split_output_dir) from error

    if self.use_unmapped_reads:
        unmapped_reads_file = os.path.join(self.split_output_dir, "unmapped_reads.bam")
        bam_read_extract.get_unmapped_reads(self.reads_files[0], unmapped_reads_file)
    else:
        unmapped_reads_file = None

    read_coverage = []
    build_reports = {}
    quasimap_reports = {}

    # Run gramtools quasimap on each split. Get back the read depth
    # from each split, which we need to get the global read depth and
    # variance, to then use for genotyping
    for ref_name, split_file_list in chunker.vcf_split_files.items():
        for split_file in split_file_list:
            read_cov, build_report, quasimap_report = self._run_quasimap_one_split(
                split_file, unmapped_reads_file)
            read_coverage.extend(read_cov)
            build_reports[split_file.file_number] = build_report
            quasimap_reports[split_file.file_number] = quasimap_report

    with open(self.gramtools_quasimap_json, "w") as f:
        json.dump(quasimap_reports, f, indent=2, sort_keys=True)
    if not self.user_supplied_gramtools_build_dir:
        with open(self.gramtools_build_json, "w") as f:
            json.dump(build_reports, f, indent=2, sort_keys=True)

    self.mean_depth = round(statistics.mean(read_coverage), 3)
    self.variance_depth = round(statistics.variance(read_coverage), 3)

    # Can now genotype each split VCF, using the global mean depth and variance
    split_vcf_outfiles = {}
    split_vcf_outfiles_unfiltered = {}
    for ref_name, split_file_list in chunker.vcf_split_files.items():
        split_vcf_outfiles[ref_name] = []
        split_vcf_outfiles_unfiltered[ref_name] = []
        for split_file in split_file_list:
            build_vcf = os.path.join(split_file.gramtools_build_dir, "build.vcf")
            quasimap_dir = os.path.join(
                self.split_output_dir,
                f"split.{split_file.file_number}.gramtools.quasimap",
            )
            logging.info(f"Loading gramtools quasimap output files {quasimap_dir}")
            (
                _,  # mean depth for this split, which we don't want
                _,  # depth variance for this split, which we don't want
                vcf_header,
                vcf_records,
                allele_coverage,
                allele_groups,
            ) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
                build_vcf, quasimap_dir)
            logging.info("Finished loading gramtools files")

            if self.clean:
                shutil.rmtree(quasimap_dir)

            vcf_prefix = os.path.join(
                self.split_output_dir,
                f"split.{split_file.file_number}.out",
            )
            split_vcf_out = f"{vcf_prefix}.vcf"
            unfiltered_vcf_out = f"{vcf_prefix}.debug.calls_with_zero_cov_alleles.vcf"
            gramtools.write_vcf_annotated_using_coverage_from_gramtools(
                self.mean_depth,
                self.variance_depth,
                vcf_records,
                allele_coverage,
                allele_groups,
                self.read_error_rate,
                unfiltered_vcf_out,
                sample_name=self.sample_name,
                filtered_outfile=split_vcf_out,
                ref_seq_lengths=self.ref_seq_lengths,
                call_hets=self.call_hets,
            )
            split_vcf_outfiles[ref_name].append(split_vcf_out)
            split_vcf_outfiles_unfiltered[ref_name].append(unfiltered_vcf_out)

    # We now have minos run on each split VCF. Merge into one VCF, then can
    # add gt conf and gcp to the merged VCF.
    logging.info("Merging VCF files into one output file " + self.final_vcf)
    chunker.merge_files(split_vcf_outfiles, self.final_vcf)
    chunker.merge_files(split_vcf_outfiles_unfiltered, self.unfiltered_vcf_file)
    self.run_gt_conf()

    if self.clean:
        logging.info("Deleting temp split VCF files")
        for d in split_vcf_outfiles, split_vcf_outfiles_unfiltered:
            for file_list in d.values():
                for filename in file_list:
                    os.unlink(filename)
        if self.use_unmapped_reads:
            os.unlink(unmapped_reads_file)
def _run_gramtools_with_split_vcf(self):
    logging.info("Splitting VCF files into chunks (if not already done)")
    chunker = vcf_chunker.VcfChunker(
        self.split_input_dir,
        vcf_infile=self.clustered_vcf,
        ref_fasta=self.ref_fasta,
        variants_per_split=self.variants_per_split,
        alleles_per_split=self.alleles_per_split,
        max_read_length=self.max_read_length,
        total_splits=self.total_splits,
        flank_length=self.max_read_length,
        gramtools_kmer_size=self.gramtools_kmer_size,
    )
    chunker.make_split_files()
    self.gramtools_kmer_size = chunker.gramtools_kmer_size
    logging.info("VCF file split into " + str(chunker.total_split_files) + " chunks")

    try:
        os.mkdir(self.split_output_dir)
    except OSError as error:
        raise Exception("Error making output split directory " + self.split_output_dir) from error

    if self.use_unmapped_reads:
        unmapped_reads_file = os.path.join(self.split_output_dir, "unmapped_reads.bam")
        bam_read_extract.get_unmapped_reads(self.reads_files[0], unmapped_reads_file)

    split_vcf_outfiles = {}
    split_vcf_outfiles_unfiltered = {}
    for ref_name, split_file_list in chunker.vcf_split_files.items():
        split_vcf_outfiles[ref_name] = []
        split_vcf_outfiles_unfiltered[ref_name] = []
        for split_file in split_file_list:
            logging.info("===== Start analysing variants in VCF split file " + split_file.filename + " =====")
            split_reads_file = os.path.join(
                self.split_output_dir,
                "split." + str(split_file.file_number) + ".reads.bam",
            )
            bam_read_extract.get_region(
                self.reads_files[0],
                split_file.chrom,
                split_file.chrom_start,
                split_file.chrom_end,
                split_reads_file,
            )

            gramtools_quasimap_dir = os.path.join(
                self.split_output_dir,
                "split." + str(split_file.file_number) + ".gramtools.quasimap",
            )
            if self.use_unmapped_reads:
                reads_files = [unmapped_reads_file, split_reads_file]
            else:
                reads_files = [split_reads_file]

            split_vcf_out = os.path.join(
                self.split_output_dir,
                "split." + str(split_file.file_number) + ".out.vcf",
            )
            unfiltered_vcf_out = os.path.join(
                self.split_output_dir,
                "split." + str(split_file.file_number) + ".out.debug.calls_with_zero_cov_alleles.vcf",
            )
            self.run_adjudicate(
                split_file.gramtools_build_dir,
                gramtools_quasimap_dir,
                split_file.filename,
                reads_files,
                split_vcf_out,
                unfiltered_vcf_out,
            )

            if self.clean:
                os.unlink(split_reads_file)
                if not self.user_supplied_gramtools_build_dir:
                    os.unlink(split_file.filename)

            split_vcf_outfiles[ref_name].append(split_vcf_out)
            split_vcf_outfiles_unfiltered[ref_name].append(unfiltered_vcf_out)
            logging.info("===== Finish analysing variants in VCF split file " + split_file.filename + " =====")

    logging.info("Merging VCF files into one output file " + self.final_vcf)
    chunker.merge_files(split_vcf_outfiles, self.final_vcf)
    chunker.merge_files(split_vcf_outfiles_unfiltered, self.unfiltered_vcf_file)
    self.run_gt_conf()

    if self.clean:
        logging.info("Deleting temp split VCF files")
        for d in split_vcf_outfiles, split_vcf_outfiles_unfiltered:
            for file_list in d.values():
                for filename in file_list:
                    os.unlink(filename)
        if self.use_unmapped_reads:
            os.unlink(unmapped_reads_file)
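# Note on use_unmapped_reads (an inference, not stated in this code): the
# unmapped reads are extracted once and passed to gramtools alongside every
# split's region reads, presumably because a read that fails to map to the
# linear reference can still map to an ALT path in the genome graph.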