示例#1
0
 def test_run_gramtools_fails(self):
     """Check that run_gramtools raises when gramtools output is missing.

     Don't trust the gramtools error code. Instead, run_gramtools is
     expected to check that gramtools wrote the quasimap files, as this
     is a good proxy for success. One way to stop these files from being
     written is to have no variants in the input VCF, so that's what we
     do here.
     """
     tmp_out_build = "tmp.run_gramtools.fail.out.build"
     tmp_out_quasimap = "tmp.run_gramtools.fail.out.quasimap"
     # Remove anything left behind by an earlier run of this test.
     if os.path.exists(tmp_out_build):
         shutil.rmtree(tmp_out_build)
     if os.path.exists(tmp_out_quasimap):
         shutil.rmtree(tmp_out_quasimap)
     vcf_file = os.path.join(data_dir, "run_gramtools.empty.vcf")
     ref_file = os.path.join(data_dir, "run_gramtools.ref.fa")
     reads_file = os.path.join(data_dir, "run_gramtools.reads.fq")
     with self.assertRaises(Exception):
         gramtools.run_gramtools(
             tmp_out_build,
             tmp_out_quasimap,
             vcf_file,
             ref_file,
             reads_file,
             150,  # presumably the max read length - confirm against run_gramtools
             kmer_size=5,
         )
     shutil.rmtree(tmp_out_build)
     # The quasimap directory may have been created before the failure was
     # detected; remove it as well so the test leaves nothing behind.
     if os.path.exists(tmp_out_quasimap):
         shutil.rmtree(tmp_out_quasimap)
示例#2
0
    def _run_gramtools_not_split_vcf(self):
        """Run gramtools once on the whole clustered VCF and write the output.

        Resolves the kmer size, runs gramtools, loads the VCF records and
        allele coverage it produced, writes the unfiltered and final VCF
        files, then adds the GT_CONF percentile to the final VCF. When
        ``self.clean`` is true the gramtools report files are moved into
        ``self.outdir`` and the gramtools directories are deleted (the build
        directory only if it was not supplied by the user).
        """
        resolved_kmer_size = Adjudicator._get_gramtools_kmer_size(
            self.gramtools_build_dir, self.gramtools_kmer_size)
        self.gramtools_kmer_size = resolved_kmer_size

        # The reports are returned by run_gramtools but not used here.
        _build_report, _quasimap_report = gramtools.run_gramtools(
            self.gramtools_build_dir,
            self.gramtools_quasimap_dir,
            self.clustered_vcf,
            self.ref_fasta,
            self.reads_files,
            self.max_read_length,
            kmer_size=self.gramtools_kmer_size,
        )

        logging.info("Loading gramtools quasimap output files " +
                     self.gramtools_quasimap_dir)
        (
            mean_depth,
            depth_variance,
            vcf_header,
            vcf_records,
            allele_coverage,
            allele_groups,
        ) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
            self.perl_generated_vcf, self.gramtools_quasimap_dir)
        logging.info("Finished loading gramtools files")

        # Fall back to the name recorded in the VCF header when the user
        # did not supply one.
        sample_name = self.sample_name
        if sample_name is None:
            sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
                vcf_header)
        assert sample_name is not None

        logging.info("Writing VCf output file " + self.final_vcf)
        gramtools.write_vcf_annotated_using_coverage_from_gramtools(
            mean_depth,
            vcf_records,
            allele_coverage,
            allele_groups,
            self.read_error_rate,
            self.unfiltered_vcf_file,
            self.gramtools_kmer_size,
            sample_name=sample_name,
            max_read_length=self.max_read_length,
            filtered_outfile=self.final_vcf,
        )

        logging.info(
            f"Adding GT_CONF_PERCENTLE to final VCF file {self.final_vcf}, "
            f"using mean depth {mean_depth}, depth variance {depth_variance}, "
            f"error rate {self.read_error_rate}, "
            f"and {self.genotype_simulation_iterations} simulation iterations"
        )
        Adjudicator._add_gt_conf_percentile_to_vcf_file(
            self.final_vcf,
            mean_depth,
            depth_variance,
            self.read_error_rate,
            self.genotype_simulation_iterations,
        )

        if not self.clean:
            return

        # Keep the quasimap report, then drop the rest of the directory.
        quasimap_report_src = os.path.join(self.gramtools_quasimap_dir,
                                           "report.json")
        quasimap_report_dest = os.path.join(self.outdir,
                                            "gramtools.quasimap.report.json")
        os.rename(quasimap_report_src, quasimap_report_dest)
        shutil.rmtree(self.gramtools_quasimap_dir)

        # A build directory supplied by the user is left untouched.
        if self.user_supplied_gramtools_build_dir:
            return

        build_report_src = os.path.join(self.gramtools_build_dir,
                                        "build_report.json")
        build_report_dest = os.path.join(self.outdir,
                                         "gramtools.build.report.json")
        os.rename(build_report_src, build_report_dest)
        shutil.rmtree(self.gramtools_build_dir)
示例#3
0
    def run_adjudicate(self, build_dir, quasimap_dir, vcf, reads_files,
                       final_vcf, debug_vcf):
        """Run gramtools on ``vcf`` and write coverage-annotated VCF files.

        Runs gramtools with ``build_dir`` and ``quasimap_dir`` as its
        directories, loads ``build.vcf`` from the build directory together
        with the quasimap output, and stores the loaded depth statistics on
        ``self.mean_depth`` and ``self.variance_depth``. When ``self.clean``
        is true, the gramtools reports are moved to
        ``self.gramtools_quasimap_json`` / ``self.gramtools_build_json`` and
        the directories are deleted (the build directory only if it was not
        user-supplied). Finally the annotated VCF is written to ``debug_vcf``
        with the filtered output in ``final_vcf``.
        """
        # The reports are returned by run_gramtools but not used here.
        _build_report, _quasimap_report = gramtools.run_gramtools(
            build_dir,
            quasimap_dir,
            vcf,
            self.ref_fasta,
            reads_files,
            kmer_size=self.gramtools_kmer_size,
        )

        gramtools_vcf = os.path.join(build_dir, "build.vcf")

        logging.info("Loading gramtools quasimap output files " + quasimap_dir)
        loaded = gramtools.load_gramtools_vcf_and_allele_coverage_files(
            gramtools_vcf, quasimap_dir)
        (
            self.mean_depth,
            self.variance_depth,
            _vcf_header,
            records,
            coverage,
            groups,
        ) = loaded

        logging.info("Finished loading gramtools files")

        if self.clean:
            # Save the quasimap report before deleting its directory.
            quasimap_report_src = os.path.join(
                quasimap_dir, "quasimap_outputs", "quasimap_report.json")
            os.rename(quasimap_report_src, self.gramtools_quasimap_json)
            shutil.rmtree(quasimap_dir)

            # A build directory supplied by the user is left untouched.
            if not self.user_supplied_gramtools_build_dir:
                build_report_src = os.path.join(build_dir, "build_report.json")
                os.rename(build_report_src, self.gramtools_build_json)
                shutil.rmtree(build_dir)

        gramtools.write_vcf_annotated_using_coverage_from_gramtools(
            self.mean_depth,
            self.variance_depth,
            records,
            coverage,
            groups,
            self.read_error_rate,
            debug_vcf,
            sample_name=self.sample_name,
            filtered_outfile=final_vcf,
            ref_seq_lengths=self.ref_seq_lengths,
            call_hets=self.call_hets,
        )
示例#4
0
    def run_adjudicate(self, build_dir, quasimap_dir, vcf, reads_files,
                       final_vcf, debug_vcf):
        """Run gramtools on ``vcf`` and write coverage-annotated VCF files.

        Runs gramtools with ``build_dir`` and ``quasimap_dir`` as its
        directories, loads ``build.vcf`` from the build directory together
        with the quasimap output, and appends the loaded mean depth and depth
        variance to the class-level lists ``Adjudicator.mean_depths`` and
        ``Adjudicator.variance_depths``.

        When ``self.clean`` is true, the gramtools reports are moved next to
        their directories (``<dir>.report.json``) and the directories are
        deleted; the build directory is only touched if it was not supplied
        by the user.

        Finally the annotated VCF is written to ``debug_vcf`` with the
        filtered output in ``final_vcf``. The sample name is taken from
        ``self.sample_name``, falling back to the loaded VCF header.
        """
        # The reports are returned by run_gramtools but not used here.
        build_report, quasimap_report = gramtools.run_gramtools(
            build_dir,
            quasimap_dir,
            vcf,
            self.ref_fasta,
            reads_files,
            self.max_read_length,
            kmer_size=self.gramtools_kmer_size,
        )

        build_vcf = os.path.join(build_dir, "build.vcf")

        logging.info("Loading gramtools quasimap output files " + quasimap_dir)
        (
            mean_depth,
            variance_depth,
            vcf_header,
            vcf_records,
            allele_coverage,
            allele_groups,
        ) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
            build_vcf, quasimap_dir)
        Adjudicator.mean_depths.append(mean_depth)
        Adjudicator.variance_depths.append(variance_depth)

        logging.info("Finished loading gramtools files")

        if self.clean:
            os.rename(
                os.path.join(quasimap_dir, "quasimap_outputs",
                             "quasimap_report.json"),
                quasimap_dir + ".report.json",
            )
            shutil.rmtree(quasimap_dir)

            if not self.user_supplied_gramtools_build_dir:
                # Move the report OUT of build_dir before deleting it.
                # Renaming it to a path inside build_dir would delete the
                # report together with the directory.
                os.rename(
                    os.path.join(build_dir, "build_report.json"),
                    build_dir + ".report.json",
                )
                shutil.rmtree(build_dir)

        if self.sample_name is None:
            sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
                vcf_header)
        else:
            sample_name = self.sample_name

        gramtools.write_vcf_annotated_using_coverage_from_gramtools(
            mean_depth,
            vcf_records,
            allele_coverage,
            allele_groups,
            self.read_error_rate,
            debug_vcf,
            sample_name=sample_name,
            max_read_length=self.max_read_length,
            filtered_outfile=final_vcf,
        )
示例#5
0
 def test_run_gramtools_two_reads_files(self):
     """Check run_gramtools runs when given a list of two reads files."""
     build_dir = "tmp.run_gramtools.2files.out.build"
     quasimap_dir = "tmp.run_gramtools.2files.out.quasimap"
     # Remove anything left behind by an earlier run of this test.
     for stale_dir in (build_dir, quasimap_dir):
         if os.path.exists(stale_dir):
             shutil.rmtree(stale_dir)

     calls_vcf = os.path.join(data_dir, "run_gramtools.calls.vcf")
     ref_fasta = os.path.join(data_dir, "run_gramtools.ref.fa")
     reads_files = [
         os.path.join(data_dir, "run_gramtools.reads_1.fq"),
         os.path.join(data_dir, "run_gramtools.reads_2.fq"),
     ]
     gramtools.run_gramtools(
         build_dir,
         quasimap_dir,
         calls_vcf,
         ref_fasta,
         reads_files,
         150,
         kmer_size=5,
     )

     # We trust the gramtools output for this test. The point here is to
     # check that gramtools can run; parsing its output is checked elsewhere.
     quasimap_outputs = os.path.join(quasimap_dir, "quasimap_outputs")
     expected_paths = (
         build_dir,
         quasimap_dir,
         os.path.join(quasimap_outputs, "allele_base_coverage.json"),
         os.path.join(quasimap_outputs,
                      "grouped_allele_counts_coverage.json"),
     )
     for expected_path in expected_paths:
         self.assertTrue(os.path.exists(expected_path))

     for output_dir in (build_dir, quasimap_dir):
         shutil.rmtree(output_dir)
示例#6
0
    def _run_quasimap_one_split(self, split_file, unmapped_reads_file=None):
        """Run gramtools on one split VCF file and return its read coverage.

        Extracts the reads for the split's region (chrom, start, end) from
        ``self.reads_files[0]`` into a per-split BAM file, runs gramtools on
        it (together with ``unmapped_reads_file`` when
        ``self.use_unmapped_reads`` is true), and collects the read coverage
        for the split. The per-split BAM is deleted afterwards when
        ``self.clean`` is true.

        Returns:
            A tuple ``(read_cov, build_report, quasimap_report)``.
        """
        logging.info(f"Start quasimap on split file {split_file.filename}")

        region_bam = os.path.join(
            self.split_output_dir,
            f"split.{split_file.file_number}.reads.bam",
        )
        bam_read_extract.get_region(
            self.reads_files[0],
            split_file.chrom,
            split_file.chrom_start,
            split_file.chrom_end,
            region_bam,
        )

        quasimap_dir = os.path.join(
            self.split_output_dir,
            f"split.{split_file.file_number}.gramtools.quasimap",
        )
        # The unmapped reads, when used, go first in the list.
        reads_files = (
            [unmapped_reads_file, region_bam]
            if self.use_unmapped_reads
            else [region_bam]
        )

        build_report, quasimap_report = gramtools.run_gramtools(
            split_file.gramtools_build_dir,
            quasimap_dir,
            split_file.filename,
            self.ref_fasta,
            reads_files,
            kmer_size=self.gramtools_kmer_size,
        )
        read_cov = self._get_read_coverage_one_split(split_file, quasimap_dir)

        if self.clean:
            os.unlink(region_bam)

        logging.info(f"Finish quasimap on split file {split_file.filename}")
        return read_cov, build_report, quasimap_report
示例#7
0
    def _run_gramtools_with_split_vcf(self):
        """Run gramtools chunk-by-chunk on a split VCF and merge the results.

        The clustered VCF is split into chunks with
        ``vcf_chunker.VcfChunker``. For each chunk, the reads for the chunk's
        region (chrom, start, end) are extracted from ``self.reads_files[0]``,
        gramtools is run on those reads plus the unmapped reads, and a
        filtered and an unfiltered VCF file are written. The per-chunk VCF
        files are then merged into ``self.final_vcf`` and
        ``self.unfiltered_vcf_file``, and the GT_CONF percentile is added to
        the final VCF using the mean, over all chunks, of the per-chunk mean
        depths and depth variances.

        When ``self.clean`` is true, per-chunk gramtools directories and
        temporary files are deleted as each chunk finishes (gramtools reports
        are first moved to ``<dir>.report.json``), and the per-chunk VCF
        files and the unmapped reads file are deleted at the end.

        Raises:
            Error: if the split output directory cannot be created.
        """
        logging.info('Splitting VCF files into chunks (if not already done)')
        chunker = vcf_chunker.VcfChunker(
            self.split_input_dir,
            vcf_infile=self.clustered_vcf,
            ref_fasta=self.ref_fasta,
            variants_per_split=self.variants_per_split,
            alleles_per_split=self.alleles_per_split,
            max_read_length=self.max_read_length,
            total_splits=self.total_splits,
            flank_length=self.max_read_length,
            gramtools_kmer_size=self.gramtools_kmer_size,
        )
        chunker.make_split_files()
        # Adopt the kmer size held by the chunker after the split files exist.
        self.gramtools_kmer_size = chunker.gramtools_kmer_size

        logging.info('VCF file split into ' + str(chunker.total_split_files) +
                     ' chunks')
        try:
            os.mkdir(self.split_output_dir)
        except OSError as error:
            # Only filesystem failures are translated; a bare except would
            # also swallow KeyboardInterrupt/SystemExit. The original cause
            # is chained so it is not lost.
            raise Error('Error making output split directory ' +
                        self.split_output_dir) from error

        unmapped_reads_file = os.path.join(self.split_output_dir,
                                           'unmapped_reads.bam')
        bam_read_extract.get_unmapped_reads(self.reads_files[0],
                                            unmapped_reads_file)
        # Per-reference lists of the VCF files written for each chunk.
        split_vcf_outfiles = {}
        split_vcf_outfiles_unfiltered = {}
        # Depth statistics from every chunk, averaged after the loop.
        mean_depths = []
        depth_variances = []

        for ref_name, split_file_list in chunker.vcf_split_files.items():
            split_vcf_outfiles[ref_name] = []
            split_vcf_outfiles_unfiltered[ref_name] = []
            for split_file in split_file_list:
                logging.info(
                    '===== Start analysing variants in VCF split file ' +
                    split_file.filename + ' =====')
                split_reads_file = os.path.join(
                    self.split_output_dir,
                    'split.' + str(split_file.file_number) + '.reads.bam')
                bam_read_extract.get_region(
                    self.reads_files[0],
                    split_file.chrom,
                    split_file.chrom_start,
                    split_file.chrom_end,
                    split_reads_file,
                )

                gramtools_quasimap_dir = os.path.join(
                    self.split_output_dir, 'split.' +
                    str(split_file.file_number) + '.gramtools.quasimap')

                # The reports are returned by run_gramtools but not used here.
                build_report, quasimap_report = gramtools.run_gramtools(
                    split_file.gramtools_build_dir,
                    gramtools_quasimap_dir,
                    split_file.filename,
                    self.ref_fasta,
                    [unmapped_reads_file, split_reads_file],
                    self.max_read_length,
                    kmer_size=self.gramtools_kmer_size,
                )

                logging.info('Loading split gramtools quasimap output files ' +
                             gramtools_quasimap_dir)
                perl_generated_vcf = os.path.join(
                    split_file.gramtools_build_dir, 'perl_generated_vcf')
                (
                    mean_depth,
                    depth_variance,
                    vcf_header,
                    vcf_records,
                    allele_coverage,
                    allele_groups,
                ) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
                    perl_generated_vcf, gramtools_quasimap_dir)
                mean_depths.append(mean_depth)
                depth_variances.append(depth_variance)
                logging.info('Finished loading gramtools files')
                # Fall back to the name recorded in the VCF header when the
                # user did not supply one.
                if self.sample_name is None:
                    sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
                        vcf_header)
                else:
                    sample_name = self.sample_name
                assert sample_name is not None
                split_vcf_out = os.path.join(
                    self.split_output_dir,
                    'split.' + str(split_file.file_number) + '.out.vcf')
                unfiltered_vcf_out = os.path.join(
                    self.split_output_dir,
                    'split.' + str(split_file.file_number) +
                    '.out.debug.calls_with_zero_cov_alleles.vcf')
                logging.info('Writing VCf output file ' + split_vcf_out +
                             ' for split VCF file ' + split_file.filename)
                gramtools.write_vcf_annotated_using_coverage_from_gramtools(
                    mean_depth,
                    vcf_records,
                    allele_coverage,
                    allele_groups,
                    self.read_error_rate,
                    unfiltered_vcf_out,
                    self.gramtools_kmer_size,
                    sample_name=sample_name,
                    max_read_length=self.max_read_length,
                    filtered_outfile=split_vcf_out,
                )
                split_vcf_outfiles[ref_name].append(split_vcf_out)
                split_vcf_outfiles_unfiltered[ref_name].append(
                    unfiltered_vcf_out)

                if self.clean:
                    logging.info(
                        'Cleaning gramtools files from split VCF file ' +
                        split_file.filename)
                    # A build directory supplied by the user (and the split
                    # VCF file) is left untouched.
                    if not self.user_supplied_gramtools_build_dir:
                        # Move the report out of the directory first so that
                        # it survives the rmtree.
                        os.rename(
                            os.path.join(split_file.gramtools_build_dir,
                                         'build_report.json'),
                            split_file.gramtools_build_dir + '.report.json')
                        shutil.rmtree(split_file.gramtools_build_dir)
                        os.unlink(split_file.filename)

                    os.rename(
                        os.path.join(gramtools_quasimap_dir, 'report.json'),
                        gramtools_quasimap_dir + '.report.json')
                    shutil.rmtree(gramtools_quasimap_dir)
                    os.unlink(split_reads_file)

                logging.info(
                    '===== Finish analysing variants in VCF split file ' +
                    split_file.filename + ' =====')

        logging.info('Merging VCF files into one output file ' +
                     self.final_vcf)
        chunker.merge_files(split_vcf_outfiles, self.final_vcf)
        chunker.merge_files(split_vcf_outfiles_unfiltered,
                            self.unfiltered_vcf_file)

        # Summarise depth across chunks as the mean of the per-chunk values.
        mean_depth = statistics.mean(mean_depths)
        depth_variance = statistics.mean(depth_variances)
        logging.info(
            f'Adding GT_CONF_PERCENTLE to final VCF file {self.final_vcf}, using mean depth {mean_depth}, depth variance {depth_variance}, error rate {self.read_error_rate}, and {self.genotype_simulation_iterations} simulation iterations'
        )
        Adjudicator._add_gt_conf_percentile_to_vcf_file(
            self.final_vcf, mean_depth, depth_variance, self.read_error_rate,
            self.genotype_simulation_iterations)

        if self.clean:
            logging.info('Deleting temp split VCF files')
            for d in split_vcf_outfiles, split_vcf_outfiles_unfiltered:
                for file_list in d.values():
                    for filename in file_list:
                        os.unlink(filename)
            os.unlink(unmapped_reads_file)