def test_run_gramtools_fails(self):
    """run_gramtools must raise when given the empty-VCF test input."""
    # Don't trust error code. Instead, we check
    # that gramtools wrote the quasimap files we expected, as this
    # is a good proxy for success. One way to stop these files
    # from being written is to have no variants in the input VCF,
    # so that's what we do here
    tmp_out_build = "tmp.run_gramtools.fail.out.build"
    tmp_out_quasimap = "tmp.run_gramtools.fail.out.quasimap"

    # Start from a clean slate in case an earlier run left output behind.
    if os.path.exists(tmp_out_build):
        shutil.rmtree(tmp_out_build)
    if os.path.exists(tmp_out_quasimap):
        shutil.rmtree(tmp_out_quasimap)

    vcf_file = os.path.join(data_dir, "run_gramtools.empty.vcf")
    ref_file = os.path.join(data_dir, "run_gramtools.ref.fa")
    reads_file = os.path.join(data_dir, "run_gramtools.reads.fq")

    with self.assertRaises(Exception):
        gramtools.run_gramtools(
            tmp_out_build,
            tmp_out_quasimap,
            vcf_file,
            ref_file,
            reads_file,
            150,
            kmer_size=5,
        )

    shutil.rmtree(tmp_out_build)
    # The failed run may still have created the quasimap directory; remove
    # it as well so the test does not leave temporary output behind.
    if os.path.exists(tmp_out_quasimap):
        shutil.rmtree(tmp_out_quasimap)
def _run_gramtools_not_split_vcf(self):
    """Run gramtools once on the whole clustered VCF and write the final VCF.

    Steps, in order: settle the kmer size, run gramtools, load its VCF and
    allele coverage output, write the annotated VCF files, add the
    GT_CONF percentile to ``self.final_vcf`` and, when ``self.clean`` is
    set, keep only the JSON reports from the gramtools directories.
    """
    self.gramtools_kmer_size = Adjudicator._get_gramtools_kmer_size(
        self.gramtools_build_dir, self.gramtools_kmer_size
    )
    build_report, quasimap_report = gramtools.run_gramtools(
        self.gramtools_build_dir,
        self.gramtools_quasimap_dir,
        self.clustered_vcf,
        self.ref_fasta,
        self.reads_files,
        self.max_read_length,
        kmer_size=self.gramtools_kmer_size,
    )

    logging.info(
        "Loading gramtools quasimap output files " + self.gramtools_quasimap_dir
    )
    loaded = gramtools.load_gramtools_vcf_and_allele_coverage_files(
        self.perl_generated_vcf, self.gramtools_quasimap_dir
    )
    (
        mean_depth,
        depth_variance,
        vcf_header,
        vcf_records,
        allele_coverage,
        allele_groups,
    ) = loaded
    logging.info("Finished loading gramtools files")

    # A user-supplied sample name wins; otherwise take it from the VCF header.
    sample_name = self.sample_name
    if sample_name is None:
        sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
            vcf_header
        )
    assert sample_name is not None

    logging.info("Writing VCf output file " + self.final_vcf)
    gramtools.write_vcf_annotated_using_coverage_from_gramtools(
        mean_depth,
        vcf_records,
        allele_coverage,
        allele_groups,
        self.read_error_rate,
        self.unfiltered_vcf_file,
        self.gramtools_kmer_size,
        sample_name=sample_name,
        max_read_length=self.max_read_length,
        filtered_outfile=self.final_vcf,
    )

    logging.info(
        f"Adding GT_CONF_PERCENTLE to final VCF file {self.final_vcf}, "
        f"using mean depth {mean_depth}, depth variance {depth_variance}, "
        f"error rate {self.read_error_rate}, "
        f"and {self.genotype_simulation_iterations} simulation iterations"
    )
    Adjudicator._add_gt_conf_percentile_to_vcf_file(
        self.final_vcf,
        mean_depth,
        depth_variance,
        self.read_error_rate,
        self.genotype_simulation_iterations,
    )

    if not self.clean:
        return

    # Move the quasimap report into the output directory, then drop the rest.
    quasimap_report_src = os.path.join(self.gramtools_quasimap_dir, "report.json")
    quasimap_report_dst = os.path.join(
        self.outdir, "gramtools.quasimap.report.json"
    )
    os.rename(quasimap_report_src, quasimap_report_dst)
    shutil.rmtree(self.gramtools_quasimap_dir)

    # A build directory supplied by the user is left untouched.
    if self.user_supplied_gramtools_build_dir:
        return

    build_report_src = os.path.join(self.gramtools_build_dir, "build_report.json")
    build_report_dst = os.path.join(self.outdir, "gramtools.build.report.json")
    os.rename(build_report_src, build_report_dst)
    shutil.rmtree(self.gramtools_build_dir)
def run_adjudicate(self, build_dir, quasimap_dir, vcf, reads_files, final_vcf, debug_vcf):
    """Run gramtools on ``vcf`` and write the annotated VCF files.

    The mean depth and depth variance loaded from the gramtools output are
    stored on ``self.mean_depth`` and ``self.variance_depth``. When
    ``self.clean`` is set, the gramtools directories are removed before the
    VCF files are written, keeping only their JSON reports.

    Args:
        build_dir: gramtools build directory.
        quasimap_dir: gramtools quasimap output directory.
        vcf: VCF file passed to gramtools.
        reads_files: reads passed to gramtools.
        final_vcf: path passed as ``filtered_outfile`` to the VCF writer.
        debug_vcf: path passed as the unfiltered output of the VCF writer.
    """
    build_report, quasimap_report = gramtools.run_gramtools(
        build_dir,
        quasimap_dir,
        vcf,
        self.ref_fasta,
        reads_files,
        kmer_size=self.gramtools_kmer_size,
    )

    build_vcf = os.path.join(build_dir, "build.vcf")
    logging.info("Loading gramtools quasimap output files " + quasimap_dir)
    loaded = gramtools.load_gramtools_vcf_and_allele_coverage_files(
        build_vcf, quasimap_dir
    )
    (
        self.mean_depth,
        self.variance_depth,
        vcf_header,
        vcf_records,
        allele_coverage,
        allele_groups,
    ) = loaded
    logging.info("Finished loading gramtools files")

    if self.clean:
        # Everything needed is already in memory, so the directories can go;
        # only the JSON reports are moved out first.
        quasimap_json_src = os.path.join(
            quasimap_dir, "quasimap_outputs", "quasimap_report.json"
        )
        os.rename(quasimap_json_src, self.gramtools_quasimap_json)
        shutil.rmtree(quasimap_dir)

        if not self.user_supplied_gramtools_build_dir:
            build_json_src = os.path.join(build_dir, "build_report.json")
            os.rename(build_json_src, self.gramtools_build_json)
            shutil.rmtree(build_dir)

    gramtools.write_vcf_annotated_using_coverage_from_gramtools(
        self.mean_depth,
        self.variance_depth,
        vcf_records,
        allele_coverage,
        allele_groups,
        self.read_error_rate,
        debug_vcf,
        sample_name=self.sample_name,
        filtered_outfile=final_vcf,
        ref_seq_lengths=self.ref_seq_lengths,
        call_hets=self.call_hets,
    )
def run_adjudicate(self, build_dir, quasimap_dir, vcf, reads_files, final_vcf, debug_vcf):
    """Run gramtools on ``vcf`` and write the annotated VCF files.

    The mean depth and depth variance loaded from the gramtools output are
    appended to ``Adjudicator.mean_depths`` and
    ``Adjudicator.variance_depths``. When ``self.clean`` is set, the
    gramtools directories are removed before the VCF files are written and
    their JSON reports are kept next to them as ``<dir>.report.json``.

    Args:
        build_dir: gramtools build directory.
        quasimap_dir: gramtools quasimap output directory.
        vcf: VCF file passed to gramtools.
        reads_files: reads passed to gramtools.
        final_vcf: path passed as ``filtered_outfile`` to the VCF writer.
        debug_vcf: path passed as the unfiltered output of the VCF writer.
    """
    build_report, quasimap_report = gramtools.run_gramtools(
        build_dir,
        quasimap_dir,
        vcf,
        self.ref_fasta,
        reads_files,
        self.max_read_length,
        kmer_size=self.gramtools_kmer_size,
    )

    build_vcf = os.path.join(build_dir, "build.vcf")
    logging.info("Loading gramtools quasimap output files " + quasimap_dir)
    (
        mean_depth,
        variance_depth,
        vcf_header,
        vcf_records,
        allele_coverage,
        allele_groups,
    ) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
        build_vcf, quasimap_dir
    )
    Adjudicator.mean_depths.append(mean_depth)
    Adjudicator.variance_depths.append(variance_depth)
    logging.info("Finished loading gramtools files")

    if self.clean:
        os.rename(
            os.path.join(quasimap_dir, "quasimap_outputs", "quasimap_report.json"),
            quasimap_dir + ".report.json",
        )
        shutil.rmtree(quasimap_dir)

        if not self.user_supplied_gramtools_build_dir:
            # The report must be moved OUT of build_dir before the directory
            # is deleted. Renaming it to a path inside build_dir would lose
            # it to the rmtree on the next line.
            os.rename(
                os.path.join(build_dir, "build_report.json"),
                build_dir + ".report.json",
            )
            shutil.rmtree(build_dir)

    # A user-supplied sample name wins; otherwise take it from the VCF header.
    if self.sample_name is None:
        sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
            vcf_header
        )
    else:
        sample_name = self.sample_name

    gramtools.write_vcf_annotated_using_coverage_from_gramtools(
        mean_depth,
        vcf_records,
        allele_coverage,
        allele_groups,
        self.read_error_rate,
        debug_vcf,
        sample_name=sample_name,
        max_read_length=self.max_read_length,
        filtered_outfile=final_vcf,
    )
def test_run_gramtools_two_reads_files(self):
    """run_gramtools runs when given a list of two reads files."""
    tmp_out_build = "tmp.run_gramtools.2files.out.build"
    tmp_out_quasimap = "tmp.run_gramtools.2files.out.quasimap"

    # Remove output left over from any earlier run.
    for stale_dir in (tmp_out_build, tmp_out_quasimap):
        if os.path.exists(stale_dir):
            shutil.rmtree(stale_dir)

    vcf_file = os.path.join(data_dir, "run_gramtools.calls.vcf")
    ref_file = os.path.join(data_dir, "run_gramtools.ref.fa")
    reads_files = [
        os.path.join(data_dir, "run_gramtools.reads_1.fq"),
        os.path.join(data_dir, "run_gramtools.reads_2.fq"),
    ]

    gramtools.run_gramtools(
        tmp_out_build,
        tmp_out_quasimap,
        vcf_file,
        ref_file,
        reads_files,
        150,
        kmer_size=5,
    )

    # We're trusting gramtools output for this test. The point here is to
    # check that gramtools can run. Parsing its output is checked elsewhere.
    quasimap_outputs = os.path.join(tmp_out_quasimap, "quasimap_outputs")
    expected_paths = (
        tmp_out_build,
        tmp_out_quasimap,
        os.path.join(quasimap_outputs, "allele_base_coverage.json"),
        os.path.join(quasimap_outputs, "grouped_allele_counts_coverage.json"),
    )
    for expected in expected_paths:
        self.assertTrue(os.path.exists(expected))

    for used_dir in (tmp_out_build, tmp_out_quasimap):
        shutil.rmtree(used_dir)
def _run_quasimap_one_split(self, split_file, unmapped_reads_file=None):
    """Run gramtools on one VCF split and return its read coverage.

    Args:
        split_file: split descriptor; its ``filename``, ``file_number``,
            ``chrom``, ``chrom_start``, ``chrom_end`` and
            ``gramtools_build_dir`` attributes are used.
        unmapped_reads_file: placed before the split's reads in the list
            given to gramtools when ``self.use_unmapped_reads`` is set.

    Returns:
        Tuple ``(read_cov, build_report, quasimap_report)``.
    """
    logging.info(f"Start quasimap on split file {split_file.filename}")

    split_prefix = f"split.{split_file.file_number}"
    split_reads_file = os.path.join(
        self.split_output_dir, f"{split_prefix}.reads.bam"
    )
    bam_read_extract.get_region(
        self.reads_files[0],
        split_file.chrom,
        split_file.chrom_start,
        split_file.chrom_end,
        split_reads_file,
    )

    quasimap_dir = os.path.join(
        self.split_output_dir, f"{split_prefix}.gramtools.quasimap"
    )
    reads_files = (
        [unmapped_reads_file, split_reads_file]
        if self.use_unmapped_reads
        else [split_reads_file]
    )

    build_report, quasimap_report = gramtools.run_gramtools(
        split_file.gramtools_build_dir,
        quasimap_dir,
        split_file.filename,
        self.ref_fasta,
        reads_files,
        kmer_size=self.gramtools_kmer_size,
    )
    read_cov = self._get_read_coverage_one_split(split_file, quasimap_dir)

    # The per-split reads file is not used again after this point.
    if self.clean:
        os.unlink(split_reads_file)

    logging.info(f"Finish quasimap on split file {split_file.filename}")
    return read_cov, build_report, quasimap_report
def _run_gramtools_with_split_vcf(self):
    """Run gramtools separately on each chunk of the clustered VCF and merge.

    The clustered VCF is chunked with ``vcf_chunker.VcfChunker``. For each
    split file, gramtools is run on the unmapped reads plus the reads taken
    from that split's region, and a filtered and an unfiltered VCF are
    written. The per-split VCFs are then merged into ``self.final_vcf`` and
    ``self.unfiltered_vcf_file``, and
    ``Adjudicator._add_gt_conf_percentile_to_vcf_file`` is applied to the
    final VCF using the mean of the per-split mean depths and variances.

    When ``self.clean`` is set, intermediate per-split files are removed as
    soon as they are no longer needed.

    Raises:
        Error: if the split output directory cannot be created.
    """
    logging.info('Splitting VCF files into chunks (if not already done)')
    chunker = vcf_chunker.VcfChunker(
        self.split_input_dir,
        vcf_infile=self.clustered_vcf,
        ref_fasta=self.ref_fasta,
        variants_per_split=self.variants_per_split,
        alleles_per_split=self.alleles_per_split,
        max_read_length=self.max_read_length,
        total_splits=self.total_splits,
        flank_length=self.max_read_length,
        gramtools_kmer_size=self.gramtools_kmer_size,
    )
    chunker.make_split_files()
    # Use whatever kmer size the chunker ended up with for all later steps.
    self.gramtools_kmer_size = chunker.gramtools_kmer_size
    logging.info('VCF file split into ' + str(chunker.total_split_files) + ' chunks')

    try:
        os.mkdir(self.split_output_dir)
    except OSError as e:
        # Only filesystem failures are translated; the original cause is
        # chained so it is not lost from the traceback.
        raise Error('Error making output split directory ' + self.split_output_dir) from e

    unmapped_reads_file = os.path.join(self.split_output_dir, 'unmapped_reads.bam')
    bam_read_extract.get_unmapped_reads(self.reads_files[0], unmapped_reads_file)
    split_vcf_outfiles = {}
    split_vcf_outfiles_unfiltered = {}
    mean_depths = []
    depth_variances = []

    for ref_name, split_file_list in chunker.vcf_split_files.items():
        split_vcf_outfiles[ref_name] = []
        split_vcf_outfiles_unfiltered[ref_name] = []

        for split_file in split_file_list:
            logging.info(
                '===== Start analysing variants in VCF split file '
                + split_file.filename + ' =====')
            split_reads_file = os.path.join(
                self.split_output_dir,
                'split.' + str(split_file.file_number) + '.reads.bam')
            bam_read_extract.get_region(
                self.reads_files[0],
                split_file.chrom,
                split_file.chrom_start,
                split_file.chrom_end,
                split_reads_file,
            )

            gramtools_quasimap_dir = os.path.join(
                self.split_output_dir,
                'split.' + str(split_file.file_number) + '.gramtools.quasimap')
            build_report, quasimap_report = gramtools.run_gramtools(
                split_file.gramtools_build_dir,
                gramtools_quasimap_dir,
                split_file.filename,
                self.ref_fasta,
                [unmapped_reads_file, split_reads_file],
                self.max_read_length,
                kmer_size=self.gramtools_kmer_size,
            )

            logging.info('Loading split gramtools quasimap output files '
                         + gramtools_quasimap_dir)
            perl_generated_vcf = os.path.join(
                split_file.gramtools_build_dir, 'perl_generated_vcf')
            (
                mean_depth,
                depth_variance,
                vcf_header,
                vcf_records,
                allele_coverage,
                allele_groups,
            ) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
                perl_generated_vcf, gramtools_quasimap_dir)
            mean_depths.append(mean_depth)
            depth_variances.append(depth_variance)
            logging.info('Finished loading gramtools files')

            # A user-supplied sample name wins; otherwise use the VCF header.
            if self.sample_name is None:
                sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
                    vcf_header)
            else:
                sample_name = self.sample_name
            assert sample_name is not None

            split_vcf_out = os.path.join(
                self.split_output_dir,
                'split.' + str(split_file.file_number) + '.out.vcf')
            unfiltered_vcf_out = os.path.join(
                self.split_output_dir,
                'split.' + str(split_file.file_number)
                + '.out.debug.calls_with_zero_cov_alleles.vcf')
            logging.info('Writing VCf output file ' + split_vcf_out
                         + ' for split VCF file ' + split_file.filename)
            gramtools.write_vcf_annotated_using_coverage_from_gramtools(
                mean_depth,
                vcf_records,
                allele_coverage,
                allele_groups,
                self.read_error_rate,
                unfiltered_vcf_out,
                self.gramtools_kmer_size,
                sample_name=sample_name,
                max_read_length=self.max_read_length,
                filtered_outfile=split_vcf_out,
            )
            split_vcf_outfiles[ref_name].append(split_vcf_out)
            split_vcf_outfiles_unfiltered[ref_name].append(unfiltered_vcf_out)

            if self.clean:
                logging.info(
                    'Cleaning gramtools files from split VCF file '
                    + split_file.filename)
                if not self.user_supplied_gramtools_build_dir:
                    # Keep the build report next to the directory being deleted.
                    os.rename(
                        os.path.join(split_file.gramtools_build_dir,
                                     'build_report.json'),
                        split_file.gramtools_build_dir + '.report.json')
                    shutil.rmtree(split_file.gramtools_build_dir)
                    # NOTE(review): the split VCF is removed only when the
                    # build dir was not user-supplied — confirm this nesting.
                    os.unlink(split_file.filename)

                os.rename(
                    os.path.join(gramtools_quasimap_dir, 'report.json'),
                    gramtools_quasimap_dir + '.report.json')
                shutil.rmtree(gramtools_quasimap_dir)
                os.unlink(split_reads_file)

            logging.info(
                '===== Finish analysing variants in VCF split file '
                + split_file.filename + ' =====')

    logging.info('Merging VCF files into one output file ' + self.final_vcf)
    chunker.merge_files(split_vcf_outfiles, self.final_vcf)
    chunker.merge_files(split_vcf_outfiles_unfiltered, self.unfiltered_vcf_file)

    # One summary value across all splits: the mean of the per-split values.
    mean_depth = statistics.mean(mean_depths)
    depth_variance = statistics.mean(depth_variances)
    logging.info(
        f'Adding GT_CONF_PERCENTLE to final VCF file {self.final_vcf}, '
        f'using mean depth {mean_depth}, depth variance {depth_variance}, '
        f'error rate {self.read_error_rate}, '
        f'and {self.genotype_simulation_iterations} simulation iterations'
    )
    Adjudicator._add_gt_conf_percentile_to_vcf_file(
        self.final_vcf,
        mean_depth,
        depth_variance,
        self.read_error_rate,
        self.genotype_simulation_iterations)

    if self.clean:
        logging.info('Deleting temp split VCF files')
        for d in split_vcf_outfiles, split_vcf_outfiles_unfiltered:
            for file_list in d.values():
                for filename in file_list:
                    os.unlink(filename)
        os.unlink(unmapped_reads_file)