Пример #1
0
    def test_make_split_files_2(self):
        """Check make_split_files on records that once exposed a minos bug.

        The input is test data from the multi_sample_pipeline tests. With
        these records the last record was not being used when merging,
        because the index was wrong. The same expectations are checked twice:
        once without a ``threads`` argument and once with ``threads=2``.
        """
        vcf_path = os.path.join(data_dir, "make_split_files2.in.vcf")
        fasta_path = os.path.join(data_dir, "make_split_files2.in.ref.fa")
        out_dir = "tmp.vcf_chunker.make_split_files2"
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)

        # First pass leaves the thread count unspecified, second asks for two.
        for extra_opts in ({}, {"threads": 2}):
            splitter = vcf_chunker.VcfChunker(
                out_dir,
                vcf_infile=vcf_path,
                ref_fasta=fasta_path,
                variants_per_split=2,
                flank_length=200,
                gramtools_kmer_size=5,
                **extra_opts
            )
            splitter.make_split_files()
            self.assertTrue(os.path.exists(splitter.metadata_pickle))

            # A second chunker built from the output directory alone must
            # report the expected split layout.
            reloaded = vcf_chunker.VcfChunker(out_dir, gramtools_kmer_size=5)
            self.assertEqual(1, len(reloaded.vcf_split_files))
            ref0_splits = reloaded.vcf_split_files["ref.0"]
            self.assertEqual(3, len(ref0_splits))
            self.assertEqual(4, ref0_splits[-1].use_end_index)
            shutil.rmtree(out_dir)
Пример #2
0
def run(options):
    """Split a VCF file into chunks using the settings held in *options*.

    Builds a ``vcf_chunker.VcfChunker`` from the attributes ``outdir``,
    ``vcf_file``, ``ref_fasta``, ``variants_per_split``,
    ``alleles_per_split``, ``total_splits``, ``gramtools_kmer_size`` and
    ``threads`` of ``options``, with the flank length fixed at 200, and then
    calls its ``make_split_files`` method. Nothing is returned.
    """
    out_dir = options.outdir
    chunker_settings = {
        "vcf_infile": options.vcf_file,
        "ref_fasta": options.ref_fasta,
        "variants_per_split": options.variants_per_split,
        "alleles_per_split": options.alleles_per_split,
        "total_splits": options.total_splits,
        "flank_length": 200,
        "gramtools_kmer_size": options.gramtools_kmer_size,
        "threads": options.threads,
    }
    splitter = vcf_chunker.VcfChunker(out_dir, **chunker_settings)
    splitter.make_split_files()
Пример #3
0
 def test_merge_files(self):
     '''Check that splitting a VCF and merging the pieces restores it.

     The input VCF is split with make_split_files, the file names of all
     the split files are collected per reference name, and merge_files is
     asked to join them again. The merged file must be identical, byte for
     byte, to the original input.
     '''
     vcf_to_split = os.path.join(data_dir, 'merge_files.in.vcf')
     ref_fasta = os.path.join(data_dir, 'merge_files.in.ref.fa')
     tmp_outdir = 'tmp.vcf_chunker.merge_files'
     tmp_vcf_out = 'tmp.vcf_chunker.merge_files.out.vcf'

     # An earlier run that failed part way leaves the split directory
     # behind; remove it so this run starts from an empty output directory.
     if os.path.exists(tmp_outdir):
         shutil.rmtree(tmp_outdir)

     chunker = vcf_chunker.VcfChunker(
         tmp_outdir,
         vcf_infile=vcf_to_split,
         ref_fasta=ref_fasta,
         variants_per_split=4,
         flank_length=3,
         gramtools_kmer_size=5,
     )
     chunker.make_split_files()

     # merge_files is given, for each reference name, the list of file names
     # of that reference's split files.
     to_merge = {
         ref: [x.filename for x in split_list]
         for ref, split_list in chunker.vcf_split_files.items()
     }
     chunker.merge_files(to_merge, tmp_vcf_out)

     # shallow=False forces a comparison of the file contents.
     self.assertTrue(filecmp.cmp(vcf_to_split, tmp_vcf_out, shallow=False))
     os.unlink(tmp_vcf_out)
     shutil.rmtree(tmp_outdir)
Пример #4
0
    def test_make_split_files(self):
        '''Check the split VCF files written by make_split_files.

        The same input is split twice, with different variants_per_split and
        flank_length settings. Each time the header and records of every
        split file are compared with the expected ones, and the file numbered
        one past the last expected split must not exist. After the second run
        a chunker made from the output directory alone must have the same
        settings and split files as the chunker that wrote it.
        '''
        infile = os.path.join(data_dir, 'make_split_files.in.vcf')
        ref_fa = os.path.join(data_dir, 'make_split_files.in.ref.fa')
        tmp_out = 'tmp.vcf_chunker.make_split_files'
        if os.path.exists(tmp_out):
            shutil.rmtree(tmp_out)

        record_lines = (
            'ref1\t1\t.\tG\tT\t.\tPASS\t.\t.\t.',
            'ref1\t2\t.\tC\tT\t.\tPASS\t.\t.\t.',
            'ref1\t3\t.\tT\tA\t.\tPASS\t.\t.\t.',
            'ref1\t5\t.\tAGAGTCACGTA\tG\t.\tPASS\t.\t.\t.',
            'ref1\t18\t.\tA\tG\t.\tPASS\t.\t.\t.',
            'ref1\t21\t.\tG\tT\t.\tPASS\t.\t.\t.',
            'ref2\t42\t.\tC\tG\t.\tPASS\t.\t.\t.',
        )
        vcf1, vcf2, vcf3, vcf4, vcf5, vcf6, vcf7 = [
            cluster_vcf_records.vcf_record.VcfRecord(line)
            for line in record_lines
        ]
        header_lines = [
            '##header1',
            '##header2',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_name',
        ]

        def split_path(number):
            # Path of the split VCF file with the given number.
            return os.path.join(tmp_out, 'split.' + str(number) + '.in.vcf')

        def check_split_files(expected_per_file):
            # Split file i must hold the shared header and the i-th expected
            # record list; no further split file may exist.
            for number, want_records in enumerate(expected_per_file):
                reader = cluster_vcf_records.vcf_file_read.vcf_file_to_list
                got_header, got_records = reader(split_path(number))
                self.assertEqual(header_lines, got_header)
                self.assertEqual(want_records, got_records)
            self.assertFalse(
                os.path.exists(split_path(len(expected_per_file))))

        # First run: two variants per split, flank length one.
        chunker = vcf_chunker.VcfChunker(
            tmp_out,
            vcf_infile=infile,
            ref_fasta=ref_fa,
            variants_per_split=2,
            flank_length=1,
            gramtools_kmer_size=5,
        )
        chunker.make_split_files()
        self.assertTrue(os.path.exists(chunker.metadata_pickle))
        check_split_files([
            [vcf1, vcf2, vcf3],
            [vcf2, vcf3, vcf4],
            [vcf5, vcf6],
            [vcf7],
        ])
        shutil.rmtree(tmp_out)

        # Second run: four variants per split, flank length three.
        chunker = vcf_chunker.VcfChunker(
            tmp_out,
            vcf_infile=infile,
            ref_fasta=ref_fa,
            variants_per_split=4,
            flank_length=3,
            gramtools_kmer_size=5,
        )
        chunker.make_split_files()
        self.assertTrue(os.path.exists(chunker.metadata_pickle))
        check_split_files([
            [vcf1, vcf2, vcf3, vcf4, vcf5],
            [vcf4, vcf5, vcf6],
            [vcf7],
        ])

        # A chunker built from the output directory alone must agree with the
        # chunker that wrote that directory.
        chunker2 = vcf_chunker.VcfChunker(tmp_out, gramtools_kmer_size=5)
        compared_attributes = (
            'vcf_infile',
            'ref_fasta',
            'variants_per_split',
            'total_splits',
            'flank_length',
            'gramtools_kmer_size',
            'max_read_length',
            'total_split_files',
            'vcf_split_files',
        )
        for attribute in compared_attributes:
            self.assertEqual(getattr(chunker, attribute),
                             getattr(chunker2, attribute))
        shutil.rmtree(tmp_out)
Пример #5
0
    def _run_gramtools_with_split_vcf(self):
        '''Run gramtools on each chunk of the clustered VCF and merge the calls.

        Steps, in order:

        1. Split self.clustered_vcf into chunks under self.split_input_dir
           with vcf_chunker.VcfChunker.
        2. Create self.split_output_dir and extract the unmapped reads of the
           first reads file into it.
        3. For every split file: extract the reads of its region, run
           gramtools, load the gramtools output, and write a filtered and an
           unfiltered annotated VCF. With self.clean set, the intermediate
           gramtools files of the split are removed, keeping only the JSON
           reports.
        4. Merge the per-split VCFs into self.final_vcf and
           self.unfiltered_vcf_file.
        5. Add the GT_CONF percentile to self.final_vcf, using the mean of the
           per-split mean depths and the mean of the per-split depth
           variances.
        6. With self.clean set, delete the per-split VCFs and the unmapped
           reads file.

        Raises:
            Error: if self.split_output_dir cannot be created.
        '''
        logging.info('Splitting VCF files into chunks (if not already done)')
        chunker = vcf_chunker.VcfChunker(
            self.split_input_dir,
            vcf_infile=self.clustered_vcf,
            ref_fasta=self.ref_fasta,
            variants_per_split=self.variants_per_split,
            alleles_per_split=self.alleles_per_split,
            max_read_length=self.max_read_length,
            total_splits=self.total_splits,
            flank_length=self.max_read_length,
            gramtools_kmer_size=self.gramtools_kmer_size,
        )
        chunker.make_split_files()
        # From here on use the kmer size held by the chunker.
        self.gramtools_kmer_size = chunker.gramtools_kmer_size

        logging.info('VCF file split into ' + str(chunker.total_split_files) +
                     ' chunks')
        try:
            os.mkdir(self.split_output_dir)
        except OSError as err:
            # Only filesystem failures are translated; a bare except would
            # also turn KeyboardInterrupt/SystemExit into Error. The original
            # cause is kept through exception chaining.
            raise Error('Error making output split directory ' +
                        self.split_output_dir) from err

        unmapped_reads_file = os.path.join(self.split_output_dir,
                                           'unmapped_reads.bam')
        bam_read_extract.get_unmapped_reads(self.reads_files[0],
                                            unmapped_reads_file)
        # Per reference name: the VCF files written for each of its splits.
        split_vcf_outfiles = {}
        split_vcf_outfiles_unfiltered = {}
        # One entry per split, averaged after the loop.
        mean_depths = []
        depth_variances = []

        for ref_name, split_file_list in chunker.vcf_split_files.items():
            split_vcf_outfiles[ref_name] = []
            split_vcf_outfiles_unfiltered[ref_name] = []
            for split_file in split_file_list:
                logging.info(
                    '===== Start analysing variants in VCF split file ' +
                    split_file.filename + ' =====')
                split_reads_file = os.path.join(
                    self.split_output_dir,
                    'split.' + str(split_file.file_number) + '.reads.bam')
                bam_read_extract.get_region(
                    self.reads_files[0],
                    split_file.chrom,
                    split_file.chrom_start,
                    split_file.chrom_end,
                    split_reads_file,
                )

                gramtools_quasimap_dir = os.path.join(
                    self.split_output_dir, 'split.' +
                    str(split_file.file_number) + '.gramtools.quasimap')

                # gramtools is given the unmapped reads as well as the reads
                # of this split's region. The returned reports are not used
                # further in this method.
                build_report, quasimap_report = gramtools.run_gramtools(
                    split_file.gramtools_build_dir,
                    gramtools_quasimap_dir,
                    split_file.filename,
                    self.ref_fasta,
                    [unmapped_reads_file, split_reads_file],
                    self.max_read_length,
                    kmer_size=self.gramtools_kmer_size,
                )

                logging.info('Loading split gramtools quasimap output files ' +
                             gramtools_quasimap_dir)
                perl_generated_vcf = os.path.join(
                    split_file.gramtools_build_dir, 'perl_generated_vcf')
                mean_depth, depth_variance, vcf_header, vcf_records, allele_coverage, allele_groups = gramtools.load_gramtools_vcf_and_allele_coverage_files(
                    perl_generated_vcf, gramtools_quasimap_dir)
                mean_depths.append(mean_depth)
                depth_variances.append(depth_variance)
                logging.info('Finished loading gramtools files')
                # Prefer the user-supplied sample name; otherwise take it
                # from the VCF header.
                if self.sample_name is None:
                    sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
                        vcf_header)
                else:
                    sample_name = self.sample_name
                assert sample_name is not None
                split_vcf_out = os.path.join(
                    self.split_output_dir,
                    'split.' + str(split_file.file_number) + '.out.vcf')
                unfiltered_vcf_out = os.path.join(
                    self.split_output_dir,
                    'split.' + str(split_file.file_number) +
                    '.out.debug.calls_with_zero_cov_alleles.vcf')
                logging.info('Writing VCf output file ' + split_vcf_out +
                             ' for split VCF file ' + split_file.filename)
                # Uses this split's own mean depth, not a global one.
                gramtools.write_vcf_annotated_using_coverage_from_gramtools(
                    mean_depth,
                    vcf_records,
                    allele_coverage,
                    allele_groups,
                    self.read_error_rate,
                    unfiltered_vcf_out,
                    self.gramtools_kmer_size,
                    sample_name=sample_name,
                    max_read_length=self.max_read_length,
                    filtered_outfile=split_vcf_out,
                )
                split_vcf_outfiles[ref_name].append(split_vcf_out)
                split_vcf_outfiles_unfiltered[ref_name].append(
                    unfiltered_vcf_out)

                if self.clean:
                    logging.info(
                        'Cleaning gramtools files from split VCF file ' +
                        split_file.filename)
                    # A build directory supplied by the user is left alone.
                    # Otherwise keep only its JSON report, moved next to the
                    # directory before the directory is deleted.
                    if not self.user_supplied_gramtools_build_dir:
                        os.rename(
                            os.path.join(split_file.gramtools_build_dir,
                                         'build_report.json'),
                            split_file.gramtools_build_dir + '.report.json')
                        shutil.rmtree(split_file.gramtools_build_dir)
                        os.unlink(split_file.filename)

                    # Same for the quasimap directory: keep the report only.
                    os.rename(
                        os.path.join(gramtools_quasimap_dir, 'report.json'),
                        gramtools_quasimap_dir + '.report.json')
                    shutil.rmtree(gramtools_quasimap_dir)
                    os.unlink(split_reads_file)

                logging.info(
                    '===== Finish analysing variants in VCF split file ' +
                    split_file.filename + ' =====')

        logging.info('Merging VCF files into one output file ' +
                     self.final_vcf)
        chunker.merge_files(split_vcf_outfiles, self.final_vcf)
        chunker.merge_files(split_vcf_outfiles_unfiltered,
                            self.unfiltered_vcf_file)

        # Both summary values are plain means over the per-split values.
        mean_depth = statistics.mean(mean_depths)
        depth_variance = statistics.mean(depth_variances)
        logging.info(
            f'Adding GT_CONF_PERCENTLE to final VCF file {self.final_vcf}, using mean depth {mean_depth}, depth variance {depth_variance}, error rate {self.read_error_rate}, and {self.genotype_simulation_iterations} simulation iterations'
        )
        Adjudicator._add_gt_conf_percentile_to_vcf_file(
            self.final_vcf, mean_depth, depth_variance, self.read_error_rate,
            self.genotype_simulation_iterations)

        if self.clean:
            logging.info('Deleting temp split VCF files')
            for d in split_vcf_outfiles, split_vcf_outfiles_unfiltered:
                for file_list in d.values():
                    for filename in file_list:
                        os.unlink(filename)
            os.unlink(unmapped_reads_file)
Пример #6
0
    def _run_gramtools_with_split_vcf(self):
        """Run gramtools on each chunk of the clustered VCF and merge the calls.

        Works in two passes over the split files made by
        vcf_chunker.VcfChunker:

        1. Quasimap pass: self._run_quasimap_one_split is run on every split.
           The read coverage values it returns are pooled over all splits,
           and the build and quasimap reports are collected per split file
           number and written as JSON (the build reports only when the
           gramtools build directory was not supplied by the user).
           self.mean_depth and self.variance_depth are then set from the
           pooled coverage, rounded to three decimal places.
        2. Genotyping pass: the gramtools output of every split is loaded and
           annotated VCFs (filtered and unfiltered) are written using the
           global depth and variance from the first pass.

        Finally the per-split VCFs are merged into self.final_vcf and
        self.unfiltered_vcf_file, self.run_gt_conf() is called, and, with
        self.clean set, the temporary per-split files are deleted.

        Raises:
            Exception: if self.split_output_dir cannot be created.
        """
        logging.info("Splitting VCF files into chunks (if not already done)")
        chunker = vcf_chunker.VcfChunker(
            self.split_input_dir,
            vcf_infile=self.clustered_vcf,
            ref_fasta=self.ref_fasta,
            variants_per_split=self.variants_per_split,
            alleles_per_split=self.alleles_per_split,
            total_splits=self.total_splits,
            gramtools_kmer_size=self.gramtools_kmer_size,
        )
        chunker.make_split_files()
        # From here on use the kmer size held by the chunker.
        self.gramtools_kmer_size = chunker.gramtools_kmer_size

        logging.info("VCF file split into " + str(chunker.total_split_files) +
                     " chunks")
        try:
            os.mkdir(self.split_output_dir)
        except OSError as err:
            # Only filesystem failures are translated; a bare except would
            # also turn KeyboardInterrupt/SystemExit into Exception. The
            # original cause is kept through exception chaining.
            raise Exception("Error making output split directory " +
                            self.split_output_dir) from err

        if self.use_unmapped_reads:
            unmapped_reads_file = os.path.join(self.split_output_dir,
                                               "unmapped_reads.bam")
            bam_read_extract.get_unmapped_reads(self.reads_files[0],
                                                unmapped_reads_file)
        else:
            unmapped_reads_file = None

        read_coverage = []
        # Both report dictionaries are keyed by split file number.
        build_reports = {}
        quasimap_reports = {}

        # Run gramtools quasimap on each split. Get back the read depth
        # from each split, which we need to get the global read depth and
        # variance, to then use for genotyping
        for ref_name, split_file_list in chunker.vcf_split_files.items():
            for split_file in split_file_list:
                read_cov, build_report, quasimap_report = self._run_quasimap_one_split(
                    split_file, unmapped_reads_file)
                read_coverage.extend(read_cov)
                build_reports[split_file.file_number] = build_report
                quasimap_reports[split_file.file_number] = quasimap_report

        with open(self.gramtools_quasimap_json, "w") as f:
            json.dump(quasimap_reports, f, indent=2, sort_keys=True)
        if not self.user_supplied_gramtools_build_dir:
            with open(self.gramtools_build_json, "w") as f:
                json.dump(build_reports, f, indent=2, sort_keys=True)

        # statistics.variance is the sample variance of the pooled values.
        self.mean_depth = round(statistics.mean(read_coverage), 3)
        self.variance_depth = round(statistics.variance(read_coverage), 3)

        # Can now genotype each split VCF, using the global mean depth and variance
        split_vcf_outfiles = {}
        split_vcf_outfiles_unfiltered = {}
        for ref_name, split_file_list in chunker.vcf_split_files.items():
            split_vcf_outfiles[ref_name] = []
            split_vcf_outfiles_unfiltered[ref_name] = []
            for split_file in split_file_list:
                build_vcf = os.path.join(split_file.gramtools_build_dir,
                                         "build.vcf")
                quasimap_dir = os.path.join(
                    self.split_output_dir,
                    f"split.{split_file.file_number}.gramtools.quasimap",
                )
                logging.info("Loading gramtools quasimap output files " +
                             quasimap_dir)
                (
                    _,  # mean depth for this split, which we don't want
                    _,  # depth variance for this split, which we don't want
                    vcf_header,
                    vcf_records,
                    allele_coverage,
                    allele_groups,
                ) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
                    build_vcf, quasimap_dir)
                logging.info("Finished loading gramtools files")

                # Everything needed from the quasimap directory is in memory
                # now, so it can go before the VCFs are written.
                if self.clean:
                    shutil.rmtree(quasimap_dir)

                vcf_prefix = os.path.join(
                    self.split_output_dir,
                    f"split.{split_file.file_number}.out",
                )
                split_vcf_out = f"{vcf_prefix}.vcf"
                unfiltered_vcf_out = (
                    f"{vcf_prefix}.debug.calls_with_zero_cov_alleles.vcf")

                gramtools.write_vcf_annotated_using_coverage_from_gramtools(
                    self.mean_depth,
                    self.variance_depth,
                    vcf_records,
                    allele_coverage,
                    allele_groups,
                    self.read_error_rate,
                    unfiltered_vcf_out,
                    sample_name=self.sample_name,
                    filtered_outfile=split_vcf_out,
                    ref_seq_lengths=self.ref_seq_lengths,
                    call_hets=self.call_hets,
                )

                split_vcf_outfiles[ref_name].append(split_vcf_out)
                split_vcf_outfiles_unfiltered[ref_name].append(
                    unfiltered_vcf_out)

        # We now have minos run on each split VCF. Merge into one VCF, then can
        # add gt conf and gcp to the merged VCF.
        logging.info("Merging VCF files into one output file " +
                     self.final_vcf)
        chunker.merge_files(split_vcf_outfiles, self.final_vcf)
        chunker.merge_files(split_vcf_outfiles_unfiltered,
                            self.unfiltered_vcf_file)
        self.run_gt_conf()

        if self.clean:
            logging.info("Deleting temp split VCF files")
            for d in split_vcf_outfiles, split_vcf_outfiles_unfiltered:
                for file_list in d.values():
                    for filename in file_list:
                        os.unlink(filename)
            # The unmapped reads file only exists when it was asked for.
            if self.use_unmapped_reads:
                os.unlink(unmapped_reads_file)
Пример #7
0
    def _run_gramtools_with_split_vcf(self):
        """Adjudicate each chunk of the clustered VCF and merge the calls.

        Steps, in order:

        1. Split self.clustered_vcf into chunks under self.split_input_dir
           with vcf_chunker.VcfChunker.
        2. Create self.split_output_dir and, when self.use_unmapped_reads is
           set, extract the unmapped reads of the first reads file into it.
        3. For every split file: extract the reads of its region and call
           self.run_adjudicate with the split's build directory, quasimap
           directory, VCF, reads files and the two output VCF names. With
           self.clean set, the split's reads file is deleted, and so is the
           split VCF unless the gramtools build directory was supplied by the
           user.
        4. Merge the per-split VCFs into self.final_vcf and
           self.unfiltered_vcf_file, then call self.run_gt_conf().
        5. With self.clean set, delete the per-split VCFs and the unmapped
           reads file (if one was made).

        Raises:
            Exception: if self.split_output_dir cannot be created.
        """
        logging.info("Splitting VCF files into chunks (if not already done)")
        chunker = vcf_chunker.VcfChunker(
            self.split_input_dir,
            vcf_infile=self.clustered_vcf,
            ref_fasta=self.ref_fasta,
            variants_per_split=self.variants_per_split,
            alleles_per_split=self.alleles_per_split,
            max_read_length=self.max_read_length,
            total_splits=self.total_splits,
            flank_length=self.max_read_length,
            gramtools_kmer_size=self.gramtools_kmer_size,
        )
        chunker.make_split_files()
        # From here on use the kmer size held by the chunker.
        self.gramtools_kmer_size = chunker.gramtools_kmer_size

        logging.info("VCF file split into " + str(chunker.total_split_files) +
                     " chunks")
        try:
            os.mkdir(self.split_output_dir)
        except OSError as err:
            # Only filesystem failures are translated; a bare except would
            # also turn KeyboardInterrupt/SystemExit into Exception. The
            # original cause is kept through exception chaining.
            raise Exception("Error making output split directory " +
                            self.split_output_dir) from err

        # unmapped_reads_file is only bound when unmapped reads are in use;
        # every later use sits behind the same self.use_unmapped_reads check.
        if self.use_unmapped_reads:
            unmapped_reads_file = os.path.join(self.split_output_dir,
                                               "unmapped_reads.bam")
            bam_read_extract.get_unmapped_reads(self.reads_files[0],
                                                unmapped_reads_file)

        # Per reference name: the VCF files written for each of its splits.
        split_vcf_outfiles = {}
        split_vcf_outfiles_unfiltered = {}

        for ref_name, split_file_list in chunker.vcf_split_files.items():
            split_vcf_outfiles[ref_name] = []
            split_vcf_outfiles_unfiltered[ref_name] = []
            for split_file in split_file_list:
                logging.info(
                    "===== Start analysing variants in VCF split file " +
                    split_file.filename + " =====")
                split_reads_file = os.path.join(
                    self.split_output_dir,
                    "split." + str(split_file.file_number) + ".reads.bam",
                )
                bam_read_extract.get_region(
                    self.reads_files[0],
                    split_file.chrom,
                    split_file.chrom_start,
                    split_file.chrom_end,
                    split_reads_file,
                )

                gramtools_quasimap_dir = os.path.join(
                    self.split_output_dir,
                    "split." + str(split_file.file_number) +
                    ".gramtools.quasimap",
                )
                if self.use_unmapped_reads:
                    reads_files = [unmapped_reads_file, split_reads_file]
                else:
                    reads_files = [split_reads_file]

                split_vcf_out = os.path.join(
                    self.split_output_dir,
                    "split." + str(split_file.file_number) + ".out.vcf",
                )
                unfiltered_vcf_out = os.path.join(
                    self.split_output_dir,
                    "split." + str(split_file.file_number) +
                    ".out.debug.calls_with_zero_cov_alleles.vcf",
                )

                self.run_adjudicate(
                    split_file.gramtools_build_dir,
                    gramtools_quasimap_dir,
                    split_file.filename,
                    reads_files,
                    split_vcf_out,
                    unfiltered_vcf_out,
                )

                if self.clean:
                    os.unlink(split_reads_file)
                    # The split VCF is kept when the build directory was
                    # supplied by the user.
                    if not self.user_supplied_gramtools_build_dir:
                        os.unlink(split_file.filename)

                split_vcf_outfiles[ref_name].append(split_vcf_out)
                split_vcf_outfiles_unfiltered[ref_name].append(
                    unfiltered_vcf_out)

                logging.info(
                    "===== Finish analysing variants in VCF split file " +
                    split_file.filename + " =====")

        logging.info("Merging VCF files into one output file " +
                     self.final_vcf)
        chunker.merge_files(split_vcf_outfiles, self.final_vcf)
        chunker.merge_files(split_vcf_outfiles_unfiltered,
                            self.unfiltered_vcf_file)

        self.run_gt_conf()

        if self.clean:
            logging.info("Deleting temp split VCF files")
            for d in split_vcf_outfiles, split_vcf_outfiles_unfiltered:
                for file_list in d.values():
                    for filename in file_list:
                        os.unlink(filename)
            if self.use_unmapped_reads:
                os.unlink(unmapped_reads_file)