def test_estimate_max_read_length_and_read_error_rate_from_qual_scores_fastq_file(
    self,
):
    """test estimate_max_read_length_and_read_error_rate_from_qual_scores fastq file"""
    tmp_file = (
        "tmp.estimate_max_read_length_and_read_error_rate_from_qual_scores.fq"
    )
    with open(tmp_file, "w") as f:
        print("@1", "ACGT", "+", "IIHH", sep="\n", file=f)
        print("@2", "ACGTAG", "+", "IHGGFF", sep="\n", file=f)

    (
        got_length,
        got_qual,
    ) = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
        tmp_file, number_of_reads=1
    )
    # "IIHH" is Phred+33 for 40,40,39,39, so the mean quality is 39.5
    expect_qual = pow(10, -39.5 / 10)
    self.assertAlmostEqual(expect_qual, got_qual)
    self.assertEqual(4, got_length)

    (
        got_length,
        got_qual,
    ) = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
        tmp_file, number_of_reads=2
    )
    # Mean quality over both reads ("IIHH" + "IHGGFF") is 38.7
    expect_qual = pow(10, -38.7 / 10)
    self.assertAlmostEqual(expect_qual, got_qual)
    self.assertEqual(6, got_length)
    os.unlink(tmp_file)
def test_estimate_max_read_length_and_read_error_rate_from_qual_scores_sam_file(
    self,
):
    """test estimate_max_read_length_and_read_error_rate_from_qual_scores sam file"""
    tmp_file = (
        "tmp.estimate_max_read_length_and_read_error_rate_from_qual_scores.sam"
    )
    with open(tmp_file, "w") as f:
        print("@SQ\tSN:ref\tLN:1000", file=f)
        print(1, 0, "ref", 42, 43, "4M", "*", 0, 0, "ACGT", "IIHH", sep="\t", file=f)
        print(2, 0, "ref", 42, 43, "4M", "*", 0, 0, "ACGT", "GGFF", sep="\t", file=f)

    (
        got_length,
        got_qual,
    ) = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
        tmp_file, number_of_reads=1
    )
    # First record only: "IIHH" decodes to 40,40,39,39, mean 39.5
    expect_qual = pow(10, -39.5 / 10)
    self.assertAlmostEqual(expect_qual, got_qual)
    self.assertEqual(4, got_length)

    (
        got_length,
        got_qual,
    ) = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
        tmp_file, number_of_reads=2
    )
    # Both records: "IIHH" + "GGFF" decode to 40,40,39,39,38,38,37,37, mean 38.5
    expect_qual = pow(10, -38.5 / 10)
    self.assertAlmostEqual(expect_qual, got_qual)
    self.assertEqual(4, got_length)
    os.unlink(tmp_file)
def test_estimate_max_read_length_and_read_error_rate_from_qual_scores_sam_file_no_quals(
    self,
):
    """test estimate_max_read_length_and_read_error_rate_from_qual_scores sam file with no quals"""
    tmp_file = (
        "tmp.estimate_max_read_length_and_read_error_rate_from_qual_scores.sam"
    )
    with open(tmp_file, "w") as f:
        print("@SQ\tSN:ref\tLN:1000", file=f)
        print(1, 0, "ref", 42, 43, "4M", "*", 0, 0, "ACGT", "*", sep="\t", file=f)
        print(2, 0, "ref", 42, 43, "5M", "*", 0, 0, "ACGTA", "*", sep="\t", file=f)

    (
        got_length,
        got_qual,
    ) = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
        tmp_file, number_of_reads=1
    )
    # Quality fields are "*", so no error rate can be estimated
    self.assertEqual(None, got_qual)
    self.assertEqual(4, got_length)

    (
        got_length,
        got_qual,
    ) = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
        tmp_file, number_of_reads=2
    )
    self.assertEqual(None, got_qual)
    self.assertEqual(5, got_length)
    os.unlink(tmp_file)
def test_estimate_max_read_length_and_read_error_rate_from_qual_scores_fasta_file(
    self,
):
    """test estimate_max_read_length_and_read_error_rate_from_qual_scores fasta file"""
    tmp_file = (
        "tmp.estimate_max_read_length_and_read_error_rate_from_qual_scores.fa"
    )
    with open(tmp_file, "w") as f:
        print(">1", "ACGT", sep="\n", file=f)
        print(">2", "ACGT", sep="\n", file=f)

    got_length, got_qual = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
        tmp_file
    )
    # FASTA has no quality scores, so only the max read length is estimated
    self.assertEqual(None, got_qual)
    self.assertEqual(4, got_length)
    os.unlink(tmp_file)
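# The expected values in the tests above follow from Phred+33 decoding of the
# quality strings and the conversion error_rate = 10 ** (-mean_quality / 10)
# (e.g. "IIHH" decodes to 40,40,39,39, mean 39.5, giving 10 ** -3.95). The
# sketch below only illustrates that arithmetic for FASTQ input; it is not the
# real utils.estimate_max_read_length_and_read_error_rate_from_qual_scores,
# which also accepts SAM and FASTA files, and its name is hypothetical.
def _sketch_estimate_from_fastq(filename, number_of_reads=10000):
    """Return (max read length, mean error rate) from the first reads of a FASTQ file."""
    max_length = 0
    quals = []
    with open(filename) as f:
        for i, line in enumerate(f):
            if i // 4 >= number_of_reads:
                break
            if i % 4 == 1:  # sequence line
                max_length = max(max_length, len(line.rstrip()))
            elif i % 4 == 3:  # quality line, Phred+33 encoded
                quals.extend(ord(c) - 33 for c in line.rstrip())
    error_rate = pow(10, -(sum(quals) / len(quals)) / 10) if quals else None
    return max_length, error_rate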
def run(self):
    if os.path.exists(self.outdir) and self.overwrite_outdir:
        shutil.rmtree(self.outdir)

    try:
        os.mkdir(self.outdir)
    except:
        raise Error('Error making output directory ' + self.outdir)

    fh = logging.FileHandler(self.log_file, mode='w')
    log = logging.getLogger()
    formatter = logging.Formatter(
        '[minos %(asctime)s %(levelname)s] %(message)s', datefmt='%d-%m-%Y %H:%M:%S'
    )
    fh.setFormatter(formatter)
    log.addHandler(fh)
    logging.info('Command run: ' + ' '.join(sys.argv))
    dependencies.check_and_report_dependencies(programs=['gramtools'])
    logging.info('Dependencies look OK')

    if self.read_error_rate is None or self.max_read_length is None:
        logging.info(
            'One or both of read_error_rate and max_read_length not known. Estimate from first 10,000 reads...'
        )
        estimated_read_length, estimated_read_error_rate = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
            self.reads_files[0]
        )
        logging.info(
            'Estimated max_read_length=' + str(estimated_read_length)
            + ' and read_error_rate=' + str(estimated_read_error_rate)
        )
        self.read_error_rate = estimated_read_error_rate if self.read_error_rate is None else self.read_error_rate
        self.max_read_length = estimated_read_length if self.max_read_length is None else self.max_read_length

    logging.info(
        'Using max_read_length=' + str(self.max_read_length)
        + ' and read_error_rate=' + str(self.read_error_rate)
    )

    if self.user_supplied_gramtools_build_dir:
        logging.info(
            'User supplied gramtools build dir. Assuming VCF already clustered, so skipping clustering'
        )
        assert len(self.vcf_files) == 1
        self.clustered_vcf = self.vcf_files[0]
    else:
        logging.info('Clustering VCF file(s), to make one VCF input file for gramtools')
        clusterer = vcf_clusterer.VcfClusterer(
            self.vcf_files,
            self.ref_fasta,
            self.clustered_vcf,
            max_distance_between_variants=1,
            max_alleles_per_cluster=self.max_alleles_per_cluster,
        )
        clusterer.run()
        logging.info('Finished clustering VCF file(s)')

    if not vcf_file_read.vcf_file_has_at_least_one_record(self.clustered_vcf):
        error_message = 'No VCF records. Cannot continue. Please check that the input VCF files contained at least one variant'
        logging.error(error_message)
        raise Error(error_message)

    if (
        self.total_splits is not None
        or self.variants_per_split is not None
        or self.alleles_per_split is not None
        or os.path.exists(os.path.join(self.split_input_dir, 'data.pickle'))
    ):
        self._run_gramtools_with_split_vcf()
    else:
        self._run_gramtools_not_split_vcf()

    logging.info('Making plots from final.vcf')
    plots.plots_from_minos_vcf(self.final_vcf, self.plots_prefix)
    logging.info('All done! Thank you for using minos :)')
def run(self):
    self.build_output_dir()

    # Log to a file in the output directory as well as any existing handlers
    fh = logging.FileHandler(self.log_file, mode="w")
    log = logging.getLogger()
    formatter = logging.Formatter(
        "[minos %(asctime)s %(levelname)s] %(message)s", datefmt="%d-%m-%Y %H:%M:%S"
    )
    fh.setFormatter(formatter)
    log.addHandler(fh)
    logging.info("Command run: " + " ".join(sys.argv))

    to_check = [
        "gramtools",
        "vcfbreakmulti",
        "vcfallelicprimitives",
        "vcfuniq",
        "vt",
    ]
    dependencies.check_and_report_dependencies(programs=to_check)
    logging.info("Dependencies look OK")

    # Work from a normalised copy of the reference FASTA in the output directory
    self.ref_fasta = os.path.join(self.outdir, "ref.fa")
    utils.fasta_to_upper_and_ACGT_only(self.original_ref_fasta, self.ref_fasta)

    if self.read_error_rate is None:
        logging.info("read_error_rate unknown. Estimate from first 10,000 reads...")
        (
            estimated_read_length,
            estimated_read_error_rate,
        ) = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
            self.reads_files[0]
        )
        logging.info(f"Estimated read_error_rate={estimated_read_error_rate}")
        self.read_error_rate = (
            estimated_read_error_rate
            if self.read_error_rate is None
            else self.read_error_rate
        )
    logging.info(f"Using read_error_rate={self.read_error_rate}")

    if self.user_supplied_gramtools_build_dir:
        logging.info(
            "User supplied gramtools build dir. Assuming VCF already clustered, so skipping clustering"
        )
        assert len(self.vcf_files) == 1
        self.clustered_vcf = self.vcf_files[0]
    elif not self.cluster_input_vcfs:
        logging.info("Skipping VCF clustering because user requested to skip")
    else:
        logging.info("Clustering VCF file(s), to make one VCF input file for gramtools")
        tracker = variant_tracking.VariantTracker(self.cluster_dir, self.ref_fasta)
        tracker.merge_vcf_files(self.vcf_files)
        tracker.cluster(self.clustered_vcf_prefix, float("Inf"), max_alleles=5000)
        if not self.debug:
            os.unlink(f"{self.clustered_vcf_prefix}.excluded.tsv")
            utils.rm_rf(self.cluster_dir)
        logging.info("Finished clustering VCF file(s)")

    if not vcf_file_read.vcf_file_has_at_least_one_record(self.clustered_vcf):
        error_message = "No VCF records. Cannot continue. Please check that the input VCF files contained at least one variant"
        logging.error(error_message)
        raise Exception(error_message)

    # Use the split-VCF code path if any splitting option was given, or if a
    # previous split run left its data.pickle behind
    if (
        self.total_splits is not None
        or self.variants_per_split is not None
        or self.alleles_per_split is not None
        or os.path.exists(os.path.join(self.split_input_dir, "data.pickle"))
    ):
        self._run_gramtools_with_split_vcf()
    else:
        self._run_gramtools_not_split_vcf()

    logging.info("All done! Thank you for using minos :)")
def run(self):
    self.build_output_dir()
    fh = logging.FileHandler(self.log_file, mode="w")
    log = logging.getLogger()
    formatter = logging.Formatter(
        "[minos %(asctime)s %(levelname)s] %(message)s", datefmt="%d-%m-%Y %H:%M:%S"
    )
    fh.setFormatter(formatter)
    log.addHandler(fh)
    logging.info("Command run: " + " ".join(sys.argv))
    dependencies.check_and_report_dependencies(programs=["gramtools"])
    logging.info("Dependencies look OK")

    if self.read_error_rate is None or self.max_read_length is None:
        logging.info(
            "One or both of read_error_rate and max_read_length not known. Estimate from first 10,000 reads..."
        )
        (
            estimated_read_length,
            estimated_read_error_rate,
        ) = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
            self.reads_files[0]
        )
        logging.info(
            "Estimated max_read_length=" + str(estimated_read_length)
            + " and read_error_rate=" + str(estimated_read_error_rate)
        )
        self.read_error_rate = (
            estimated_read_error_rate
            if self.read_error_rate is None
            else self.read_error_rate
        )
        self.max_read_length = (
            estimated_read_length
            if self.max_read_length is None
            else self.max_read_length
        )

    logging.info(
        "Using max_read_length=" + str(self.max_read_length)
        + " and read_error_rate=" + str(self.read_error_rate)
    )

    if self.user_supplied_gramtools_build_dir:
        logging.info(
            "User supplied gramtools build dir. Assuming VCF already clustered, so skipping clustering"
        )
        assert len(self.vcf_files) == 1
        self.clustered_vcf = self.vcf_files[0]
    else:
        logging.info(
            "Clustering VCF file(s), to make one VCF input file for gramtools"
        )
        clusterer = vcf_clusterer.VcfClusterer(
            self.vcf_files,
            self.ref_fasta,
            self.clustered_vcf,
            cluster_boundary_size=0,
            max_alleles_per_cluster=self.max_alleles_per_cluster,
        )
        clusterer.run()
        logging.info("Finished clustering VCF file(s)")

    if not vcf_file_read.vcf_file_has_at_least_one_record(self.clustered_vcf):
        error_message = "No VCF records. Cannot continue. Please check that the input VCF files contained at least one variant"
        logging.error(error_message)
        raise Exception(error_message)

    if (
        self.total_splits is not None
        or self.variants_per_split is not None
        or self.alleles_per_split is not None
        or os.path.exists(os.path.join(self.split_input_dir, "data.pickle"))
    ):
        self._run_gramtools_with_split_vcf()
    else:
        self._run_gramtools_not_split_vcf()

    logging.info("Making plots from final.vcf")
    plots.plots_from_minos_vcf(self.final_vcf, self.plots_prefix)
    logging.info("All done! Thank you for using minos :)")
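# Each run() method above aborts when the clustered VCF contains no variant
# records. The helper below is a minimal, hypothetical sketch of that kind of
# guard; it is not the real vcf_file_read.vcf_file_has_at_least_one_record,
# which may, for example, also handle gzipped VCFs.
def _sketch_vcf_has_at_least_one_record(vcf_filename):
    """Return True if the VCF file has at least one non-header, non-empty line."""
    with open(vcf_filename) as f:
        return any(line.strip() and not line.startswith("#") for line in f)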