Пример #1
0
    def test_estimate_max_read_length_and_read_error_rate_from_qual_scores_fastq_file(
            self):
        """test estimate_max_read_length_and_read_error_rate_from_qual_scores fastq file

        Writes a two-read FASTQ file and checks the (length, error rate) pair
        returned for number_of_reads=1 and number_of_reads=2.
        """
        tmp_file = (
            "tmp.estimate_max_read_length_and_read_error_rate_from_qual_scores.fq"
        )
        with open(tmp_file, "w") as f:
            print("@1", "ACGT", "+", "IIHH", sep="\n", file=f)
            print("@2", "ACGTAG", "+", "IHGGFF", sep="\n", file=f)

        # (number_of_reads, quality used in 10^(-Q/10), expected length).
        # Decoded as Phred+33, "IIHH" averages 39.5 and the ten quality
        # characters of both reads together average 38.7.
        cases = [(1, 39.5, 4), (2, 38.7, 6)]

        # try/finally so the temporary file is removed even when an assertion
        # fails; otherwise a failing run leaves it in the working directory.
        try:
            for number_of_reads, mean_qual, expect_length in cases:
                (
                    got_length,
                    got_qual,
                ) = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
                    tmp_file, number_of_reads=number_of_reads)
                self.assertAlmostEqual(pow(10, -mean_qual / 10), got_qual)
                self.assertEqual(expect_length, got_length)
        finally:
            os.unlink(tmp_file)
Пример #2
0
    def test_estimate_max_read_length_and_read_error_rate_from_qual_scores_sam_file(
            self):
        """test estimate_max_read_length_and_read_error_rate_from_qual_scores sam file

        Writes a SAM file with one @SQ header line and two 4-base records that
        carry quality strings, then checks the (length, error rate) pair
        returned for number_of_reads=1 and number_of_reads=2.
        """
        tmp_file = (
            "tmp.estimate_max_read_length_and_read_error_rate_from_qual_scores.sam"
        )
        # One tuple per record; the last two fields are the sequence and its
        # quality string.
        records = [
            (1, 0, "ref", 42, 43, "4M", "*", 0, 0, "ACGT", "IIHH"),
            (2, 0, "ref", 42, 43, "4M", "*", 0, 0, "ACGT", "GGFF"),
        ]
        with open(tmp_file, "w") as f:
            print("@SQ\tSN:ref\tLN:1000", file=f)
            for fields in records:
                print(*fields, sep="\t", file=f)

        # (number_of_reads, quality used in 10^(-Q/10), expected length).
        # Decoded as Phred+33, "IIHH" averages 39.5 and "IIHH" + "GGFF"
        # together average 38.5.
        cases = [(1, 39.5, 4), (2, 38.5, 4)]

        # try/finally so the temporary file is removed even when an assertion
        # fails; otherwise a failing run leaves it in the working directory.
        try:
            for number_of_reads, mean_qual, expect_length in cases:
                (
                    got_length,
                    got_qual,
                ) = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
                    tmp_file, number_of_reads=number_of_reads)
                self.assertAlmostEqual(pow(10, -mean_qual / 10), got_qual)
                self.assertEqual(expect_length, got_length)
        finally:
            os.unlink(tmp_file)
Пример #3
0
    def test_estimate_max_read_length_and_read_error_rate_from_qual_scores_sam_file_no_quals(
            self):
        """test estimate_max_read_length_and_read_error_rate_from_qual_scores sam file with no quals

        Both records have "*" in the quality column, so the returned error
        rate is expected to be None while the length is still reported.
        """
        tmp_file = (
            "tmp.estimate_max_read_length_and_read_error_rate_from_qual_scores.sam"
        )
        # One tuple per record; the last two fields are the sequence and the
        # "*" placeholder in the quality column.
        records = [
            (1, 0, "ref", 42, 43, "4M", "*", 0, 0, "ACGT", "*"),
            (2, 0, "ref", 42, 43, "5M", "*", 0, 0, "ACGTA", "*"),
        ]
        with open(tmp_file, "w") as f:
            print("@SQ\tSN:ref\tLN:1000", file=f)
            for fields in records:
                print(*fields, sep="\t", file=f)

        # (number_of_reads, expected length): the second read is one base
        # longer than the first.
        cases = [(1, 4), (2, 5)]

        # try/finally so the temporary file is removed even when an assertion
        # fails; otherwise a failing run leaves it in the working directory.
        try:
            for number_of_reads, expect_length in cases:
                (
                    got_length,
                    got_qual,
                ) = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
                    tmp_file, number_of_reads=number_of_reads)
                self.assertIsNone(got_qual)
                self.assertEqual(expect_length, got_length)
        finally:
            os.unlink(tmp_file)
Пример #4
0
    def test_estimate_max_read_length_and_read_error_rate_from_qual_scores_sam_file(
            self):
        '''test estimate_max_read_length_and_read_error_rate_from_qual_scores sam file

        Writes a SAM file with one @SQ header line and two 4-base records that
        carry quality strings, then checks the (length, error rate) pair
        returned for number_of_reads=1 and number_of_reads=2.
        '''
        tmp_file = 'tmp.estimate_max_read_length_and_read_error_rate_from_qual_scores.sam'
        with open(tmp_file, 'w') as f:
            print('@SQ\tSN:ref\tLN:1000', file=f)
            print(1, 0, 'ref', 42, 43, '4M', '*', 0, 0, 'ACGT', 'IIHH',
                  sep='\t', file=f)
            print(2, 0, 'ref', 42, 43, '4M', '*', 0, 0, 'ACGT', 'GGFF',
                  sep='\t', file=f)

        # try/finally so the temporary file is removed even when an assertion
        # fails; otherwise a failing run leaves it in the working directory.
        try:
            # First read only: 'IIHH' averages 39.5 when decoded as Phred+33.
            got_length, got_qual = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
                tmp_file, number_of_reads=1)
            self.assertAlmostEqual(pow(10, -39.5 / 10), got_qual)
            self.assertEqual(4, got_length)

            # Both reads: 'IIHH' + 'GGFF' average 38.5.
            got_length, got_qual = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
                tmp_file, number_of_reads=2)
            self.assertAlmostEqual(pow(10, -38.5 / 10), got_qual)
            self.assertEqual(4, got_length)
        finally:
            os.unlink(tmp_file)
Пример #5
0
    def test_estimate_max_read_length_and_read_error_rate_from_qual_scores_fastq_file(
            self):
        '''test estimate_max_read_length_and_read_error_rate_from_qual_scores fastq file

        Writes a two-read FASTQ file and checks the (length, error rate) pair
        returned for number_of_reads=1 and number_of_reads=2.
        '''
        tmp_file = 'tmp.estimate_max_read_length_and_read_error_rate_from_qual_scores.fq'
        with open(tmp_file, 'w') as f:
            print('@1', 'ACGT', '+', 'IIHH', sep='\n', file=f)
            print('@2', 'ACGTAG', '+', 'IHGGFF', sep='\n', file=f)

        # try/finally so the temporary file is removed even when an assertion
        # fails; otherwise a failing run leaves it in the working directory.
        try:
            # First read only: 'IIHH' averages 39.5 when decoded as Phred+33.
            got_length, got_qual = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
                tmp_file, number_of_reads=1)
            self.assertAlmostEqual(pow(10, -39.5 / 10), got_qual)
            self.assertEqual(4, got_length)

            # Both reads: the ten quality characters average 38.7, and the
            # longer read has 6 bases.
            got_length, got_qual = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
                tmp_file, number_of_reads=2)
            self.assertAlmostEqual(pow(10, -38.7 / 10), got_qual)
            self.assertEqual(6, got_length)
        finally:
            os.unlink(tmp_file)
Пример #6
0
    def test_estimate_max_read_length_and_read_error_rate_from_qual_scores_sam_file_no_quals(
            self):
        '''test estimate_max_read_length_and_read_error_rate_from_qual_scores sam file with no quals

        Both records have '*' in the quality column, so the returned error
        rate is expected to be None while the length is still reported.
        '''
        tmp_file = 'tmp.estimate_max_read_length_and_read_error_rate_from_qual_scores.sam'
        with open(tmp_file, 'w') as f:
            print('@SQ\tSN:ref\tLN:1000', file=f)
            print(1, 0, 'ref', 42, 43, '4M', '*', 0, 0, 'ACGT', '*',
                  sep='\t', file=f)
            print(2, 0, 'ref', 42, 43, '5M', '*', 0, 0, 'ACGTA', '*',
                  sep='\t', file=f)

        # try/finally so the temporary file is removed even when an assertion
        # fails; otherwise a failing run leaves it in the working directory.
        try:
            got_length, got_qual = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
                tmp_file, number_of_reads=1)
            self.assertIsNone(got_qual)
            self.assertEqual(4, got_length)

            # The second record is one base longer than the first.
            got_length, got_qual = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
                tmp_file, number_of_reads=2)
            self.assertIsNone(got_qual)
            self.assertEqual(5, got_length)
        finally:
            os.unlink(tmp_file)
Пример #7
0
    def test_estimate_max_read_length_and_read_error_rate_from_qual_scores_fasta_file(
            self):
        '''test estimate_max_read_length_and_read_error_rate_from_qual_scores fasta file

        A FASTA file has no quality strings, so the returned error rate is
        expected to be None while the length is still reported.
        '''
        tmp_file = 'tmp.estimate_max_read_length_and_read_error_rate_from_qual_scores.fa'
        with open(tmp_file, 'w') as f:
            print('>1', 'ACGT', sep='\n', file=f)
            print('>2', 'ACGT', sep='\n', file=f)

        # try/finally so the temporary file is removed even when an assertion
        # fails; otherwise a failing run leaves it in the working directory.
        try:
            got_length, got_qual = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
                tmp_file)
            self.assertIsNone(got_qual)
            self.assertEqual(4, got_length)
        finally:
            os.unlink(tmp_file)
Пример #8
0
    def test_estimate_max_read_length_and_read_error_rate_from_qual_scores_fasta_file(
            self):
        """test estimate_max_read_length_and_read_error_rate_from_qual_scores fasta file

        A FASTA file has no quality strings, so the returned error rate is
        expected to be None while the length is still reported.
        """
        tmp_file = (
            "tmp.estimate_max_read_length_and_read_error_rate_from_qual_scores.fa"
        )
        with open(tmp_file, "w") as f:
            print(">1", "ACGT", sep="\n", file=f)
            print(">2", "ACGT", sep="\n", file=f)

        # try/finally so the temporary file is removed even when an assertion
        # fails; otherwise a failing run leaves it in the working directory.
        try:
            (
                got_length,
                got_qual,
            ) = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
                tmp_file)
            self.assertIsNone(got_qual)
            self.assertEqual(4, got_length)
        finally:
            os.unlink(tmp_file)
Пример #9
0
    def run(self):
        '''Run the pipeline end to end.

        Steps, in order: create the output directory (removing an existing
        one first when overwrite_outdir is set), attach a file handler for
        self.log_file to the root logger, check the gramtools dependency,
        fill in read_error_rate / max_read_length from the first reads file
        when either is None, cluster the input VCF file(s) unless a gramtools
        build dir was supplied, run gramtools on the split or unsplit VCF,
        and make plots from the final VCF.

        Raises:
            Error: if the output directory cannot be created, or if the
                clustered VCF contains no records.
        '''
        if os.path.exists(self.outdir) and self.overwrite_outdir:
            shutil.rmtree(self.outdir)

        try:
            os.mkdir(self.outdir)
        except OSError as e:
            # Translate only filesystem failures (a bare except would also
            # catch KeyboardInterrupt/SystemExit) and chain the cause so the
            # underlying reason is not lost.
            raise Error('Error making output directory ' + self.outdir) from e

        fh = logging.FileHandler(self.log_file, mode='w')
        log = logging.getLogger()
        formatter = logging.Formatter(
            '[minos %(asctime)s %(levelname)s] %(message)s',
            datefmt='%d-%m-%Y %H:%M:%S')
        fh.setFormatter(formatter)
        log.addHandler(fh)
        logging.info('Command run: ' + ' '.join(sys.argv))
        dependencies.check_and_report_dependencies(programs=['gramtools'])
        logging.info('Dependencies look OK')

        if self.read_error_rate is None or self.max_read_length is None:
            logging.info(
                'One or both of read_error_rate and max_read_length not known. Estimate from first 10,000 reads...'
            )
            estimated_read_length, estimated_read_error_rate = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
                self.reads_files[0])
            logging.info('Estimated max_read_length=' +
                         str(estimated_read_length) + ' and read_error_rate=' +
                         str(estimated_read_error_rate))

            # Only fill in the value(s) the user did not supply. Kept inside
            # this branch because the estimates only exist here.
            if self.read_error_rate is None:
                self.read_error_rate = estimated_read_error_rate
            if self.max_read_length is None:
                self.max_read_length = estimated_read_length

        logging.info('Using max_read_length=' + str(self.max_read_length) +
                     ' and read_error_rate=' + str(self.read_error_rate))

        if self.user_supplied_gramtools_build_dir:
            logging.info(
                'User supplied gramtools build dir. Assuming VCF already clustered, so skipping clustering'
            )
            assert len(self.vcf_files) == 1
            self.clustered_vcf = self.vcf_files[0]
        else:
            logging.info(
                'Clustering VCF file(s), to make one VCF input file for gramtools'
            )
            clusterer = vcf_clusterer.VcfClusterer(
                self.vcf_files,
                self.ref_fasta,
                self.clustered_vcf,
                max_distance_between_variants=1,
                max_alleles_per_cluster=self.max_alleles_per_cluster,
            )
            clusterer.run()

            logging.info('Finished clustering VCF file(s)')

        if not vcf_file_read.vcf_file_has_at_least_one_record(
                self.clustered_vcf):
            error_message = 'No VCF records. Cannot continue. Please check that the input VCF files contained at least one variant'
            logging.error(error_message)
            raise Error(error_message)

        # Use the split workflow when any split option is set, or when a
        # data.pickle file already exists in the split input directory.
        if self.total_splits is not None or self.variants_per_split is not None or self.alleles_per_split is not None or os.path.exists(
                os.path.join(self.split_input_dir, 'data.pickle')):
            self._run_gramtools_with_split_vcf()
        else:
            self._run_gramtools_not_split_vcf()

        logging.info('Making plots from final.vcf')
        plots.plots_from_minos_vcf(self.final_vcf, self.plots_prefix)

        logging.info('All done! Thank you for using minos :)')
Пример #10
0
    def run(self):
        """Run the pipeline end to end.

        Steps, in order: build the output directory, attach a file handler
        for self.log_file to the root logger, check the external program
        dependencies, write a processed copy of the reference to
        <outdir>/ref.fa, estimate read_error_rate from the first reads file
        when it is None, merge and cluster the input VCF file(s) unless that
        is skipped, and run gramtools on the split or unsplit VCF.

        Raises:
            Exception: if the clustered VCF contains no records.
        """
        self.build_output_dir()

        fh = logging.FileHandler(self.log_file, mode="w")
        log = logging.getLogger()
        formatter = logging.Formatter(
            "[minos %(asctime)s %(levelname)s] %(message)s",
            datefmt="%d-%m-%Y %H:%M:%S")
        fh.setFormatter(formatter)
        log.addHandler(fh)
        logging.info("Command run: " + " ".join(sys.argv))
        to_check = [
            "gramtools",
            "vcfbreakmulti",
            "vcfallelicprimitives",
            "vcfuniq",
            "vt",
        ]
        dependencies.check_and_report_dependencies(programs=to_check)
        logging.info("Dependencies look OK")

        self.ref_fasta = os.path.join(self.outdir, "ref.fa")
        utils.fasta_to_upper_and_ACGT_only(self.original_ref_fasta,
                                           self.ref_fasta)

        if self.read_error_rate is None:
            logging.info(
                "read_error_rate unknown. Estimate from first 10,000 reads...")
            # The estimated read length is not needed here, only the error
            # rate.
            (
                _,
                estimated_read_error_rate,
            ) = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
                self.reads_files[0])
            logging.info(
                f"Estimated read_error_rate={estimated_read_error_rate}")

            # read_error_rate is known to be None in this branch, so the
            # estimate is assigned unconditionally.
            self.read_error_rate = estimated_read_error_rate
            logging.info(f"Using read_error_rate={self.read_error_rate}")

        if self.user_supplied_gramtools_build_dir:
            logging.info(
                "User supplied gramtools build dir. Assuming VCF already clustered, so skipping clustering"
            )
            assert len(self.vcf_files) == 1
            self.clustered_vcf = self.vcf_files[0]
        elif not self.cluster_input_vcfs:
            # NOTE(review): self.clustered_vcf is not assigned in this branch;
            # presumably it is set elsewhere -- confirm.
            logging.info(
                "Skipping VCF clustering because user requested to skip")
        else:
            logging.info(
                "Clustering VCF file(s), to make one VCF input file for gramtools"
            )
            tracker = variant_tracking.VariantTracker(self.cluster_dir,
                                                      self.ref_fasta)
            tracker.merge_vcf_files(self.vcf_files)
            tracker.cluster(self.clustered_vcf_prefix,
                            float("Inf"),
                            max_alleles=5000)
            # Clustering intermediates are kept only in debug mode.
            if not self.debug:
                os.unlink(f"{self.clustered_vcf_prefix}.excluded.tsv")
                utils.rm_rf(self.cluster_dir)
            logging.info("Finished clustering VCF file(s)")

        if not vcf_file_read.vcf_file_has_at_least_one_record(
                self.clustered_vcf):
            error_message = "No VCF records. Cannot continue. Please check that the input VCF files contained at least one variant"
            logging.error(error_message)
            raise Exception(error_message)

        # Use the split workflow when any split option is set, or when a
        # data.pickle file already exists in the split input directory.
        if (self.total_splits is not None
                or self.variants_per_split is not None
                or self.alleles_per_split is not None or os.path.exists(
                    os.path.join(self.split_input_dir, "data.pickle"))):
            self._run_gramtools_with_split_vcf()
        else:
            self._run_gramtools_not_split_vcf()

        logging.info("All done! Thank you for using minos :)")
Пример #11
0
    def run(self):
        """Run the pipeline from output-directory setup through to plots.

        Builds the output directory, adds a file handler for self.log_file to
        the root logger, checks the gramtools dependency, fills in whichever
        of read_error_rate / max_read_length is None from the first reads
        file, clusters the input VCF file(s) unless a gramtools build dir was
        supplied, runs gramtools on the split or unsplit VCF, and finally
        makes plots from the final VCF.

        Raises:
            Exception: if the clustered VCF contains no records.
        """
        self.build_output_dir()

        # Send log records to the run's log file as well.
        file_handler = logging.FileHandler(self.log_file, mode="w")
        file_handler.setFormatter(
            logging.Formatter(
                "[minos %(asctime)s %(levelname)s] %(message)s",
                datefmt="%d-%m-%Y %H:%M:%S",
            ))
        logging.getLogger().addHandler(file_handler)

        logging.info("Command run: " + " ".join(sys.argv))
        dependencies.check_and_report_dependencies(programs=["gramtools"])
        logging.info("Dependencies look OK")

        if self.read_error_rate is None or self.max_read_length is None:
            logging.info(
                "One or both of read_error_rate and max_read_length not known. Estimate from first 10,000 reads..."
            )
            estimates = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
                self.reads_files[0])
            length_estimate, error_rate_estimate = estimates
            estimate_summary = "".join([
                "Estimated max_read_length=",
                str(length_estimate),
                " and read_error_rate=",
                str(error_rate_estimate),
            ])
            logging.info(estimate_summary)

            # Values supplied by the user take precedence over the estimates.
            if self.read_error_rate is None:
                self.read_error_rate = error_rate_estimate
            if self.max_read_length is None:
                self.max_read_length = length_estimate

        using_summary = "".join([
            "Using max_read_length=",
            str(self.max_read_length),
            " and read_error_rate=",
            str(self.read_error_rate),
        ])
        logging.info(using_summary)

        if not self.user_supplied_gramtools_build_dir:
            logging.info(
                "Clustering VCF file(s), to make one VCF input file for gramtools"
            )
            vcf_clusterer.VcfClusterer(
                self.vcf_files,
                self.ref_fasta,
                self.clustered_vcf,
                cluster_boundary_size=0,
                max_alleles_per_cluster=self.max_alleles_per_cluster,
            ).run()
            logging.info("Finished clustering VCF file(s)")
        else:
            logging.info(
                "User supplied gramtools build dir. Assuming VCF already clustered, so skipping clustering"
            )
            assert len(self.vcf_files) == 1
            self.clustered_vcf = self.vcf_files[0]

        has_records = vcf_file_read.vcf_file_has_at_least_one_record(
            self.clustered_vcf)
        if not has_records:
            error_message = "No VCF records. Cannot continue. Please check that the input VCF files contained at least one variant"
            logging.error(error_message)
            raise Exception(error_message)

        # The split workflow is used when any split option is set, or when a
        # data.pickle file already exists in the split input directory. The
        # file check only happens if no split option is set.
        split_option_given = (self.total_splits is not None
                              or self.variants_per_split is not None
                              or self.alleles_per_split is not None)
        if split_option_given or os.path.exists(
                os.path.join(self.split_input_dir, "data.pickle")):
            run_gramtools = self._run_gramtools_with_split_vcf
        else:
            run_gramtools = self._run_gramtools_not_split_vcf
        run_gramtools()

        logging.info("Making plots from final.vcf")
        plots.plots_from_minos_vcf(self.final_vcf, self.plots_prefix)

        logging.info("All done! Thank you for using minos :)")