def test_parse_manifest_file():
    """test parse_manifest_file"""
    manifest_tsv = "tmp.parse_manifest_file.in.tsv"
    with open(manifest_tsv, "w") as f:
        vcf_prefix = os.path.join(data_dir, "parse_manifest_file")
        vcf1 = f"{vcf_prefix}.1.vcf"
        vcf2 = f"{vcf_prefix}.2.vcf"
        vcf3 = f"{vcf_prefix}.3.vcf"
        print("name", "vcf", "reads", sep="\t", file=f)
        print("sample1", vcf1, "1.reads.fq", sep="\t", file=f)
        print("sample2", vcf2, "2.reads.1.fq 2.reads.2.fq", sep="\t", file=f)
        print("sample3", vcf3, "3.reads.1.fq 3.reads.2.fq", sep="\t", file=f)

    merge_out = "tmp.parse_manifest_file.merge.fofn"
    adjudicate_out = "tmp.parse_manifest_file.adjudicate.tsv"
    ref_fasta = os.path.join(data_dir, "parse_manifest_file.ref.fa")
    utils.rm_rf(merge_out, adjudicate_out)
    regeno_helper.parse_manifest_file(
        manifest_tsv, merge_out, adjudicate_out, ref_fasta
    )
    os.unlink(manifest_tsv)
    expect_adj = os.path.join(data_dir, "parse_manifest_file.out.tsv")
    assert filecmp.cmp(expect_adj, adjudicate_out, shallow=False)
    os.unlink(adjudicate_out)
    with open(merge_out) as f:
        got_lines = [x.rstrip() for x in f]
    assert got_lines == [vcf1, vcf3]
    os.unlink(merge_out)


def test_fasta_to_upper_and_ACGT_only(self):
    """test fasta_to_upper_and_ACGT_only"""
    infile = os.path.join(data_dir, "fasta_to_upper_and_ACGT_only.in.fa")
    tmp_file = "tmp.fasta_to_upper_and_ACGT_only.fa"
    utils.rm_rf(tmp_file)
    utils.fasta_to_upper_and_ACGT_only(infile, tmp_file)
    expect = os.path.join(data_dir, "fasta_to_upper_and_ACGT_only.expect.fa")
    self.assertTrue(filecmp.cmp(tmp_file, expect, shallow=False))
    os.unlink(tmp_file)


def test_compress_file():
    """test compress_file"""
    vcf_in = os.path.join(data_dir, "compress_file.vcf")
    vcf_out = "tmp.compress_file.vcf.gz"
    txt_in = os.path.join(data_dir, "compress_file.txt")
    txt_out = "tmp.compress_file.txt.gz"
    utils.rm_rf(vcf_out, txt_out)
    # compress_file takes a single (infile, outfile) tuple, so that it can be
    # passed directly to multiprocessing.Pool.map (see make_per_sample_vcfs_dir)
    regeno_helper.compress_file((vcf_in, vcf_out))
    regeno_helper.compress_file((txt_in, txt_out))
    assert os.path.exists(vcf_out)
    assert os.path.exists(txt_out)
    os.unlink(vcf_out)
    os.unlink(txt_out)


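# For reference, a minimal sketch of what compress_file plausibly does,
# assuming straightforward gzip compression. The single-tuple argument lets
# it be mapped over a list of (infile, outfile) pairs by a process pool.
# The name and body below are illustrative assumptions, not the verified
# regeno_helper implementation.
def _example_compress_file(infile_and_outfile):
    import gzip
    import shutil

    infile, outfile = infile_and_outfile
    with open(infile, "rb") as f_in, gzip.open(outfile, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

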
def test_distance_matrix_from_vcf_file():
    """test distance_matrix_from_vcf_file"""
    vcf_file = os.path.join(data_dir, "distance_matrix_from_vcf_file.vcf")
    outfile = "tmp.distance_matrix_from_vcf_file.out"
    utils.rm_rf(outfile)

    # Without a mask
    dist_matrix.distance_matrix_from_vcf_file(vcf_file, outfile)
    expect = os.path.join(data_dir, "distance_matrix_from_vcf_file.expect.no_mask")
    assert filecmp.cmp(outfile, expect, shallow=False)
    os.unlink(outfile)

    # With a BED file of masked positions
    mask_bed = os.path.join(data_dir, "distance_matrix_from_vcf_file.mask.bed")
    dist_matrix.distance_matrix_from_vcf_file(vcf_file, outfile, mask_bed_file=mask_bed)
    expect = os.path.join(data_dir, "distance_matrix_from_vcf_file.expect.mask")
    assert filecmp.cmp(outfile, expect, shallow=False)
    os.unlink(outfile)


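# For orientation, a minimal sketch of the kind of computation a
# VCF-to-distance-matrix function performs: for each pair of samples, count
# the sites where their genotype calls differ. Everything here (the name,
# treating the mask as a simple set of sites to skip) is an illustrative
# assumption, not dist_matrix's verified internals.
def _example_pairwise_distances(vcf_lines, masked_positions=frozenset()):
    # masked_positions: set of (chrom, 1-based position) tuples to skip
    samples, genotype_rows = [], []
    for line in vcf_lines:
        if line.startswith("#CHROM"):
            # Sample names start at column 10 of the VCF header line
            samples = line.rstrip().split("\t")[9:]
        elif not line.startswith("#"):
            fields = line.rstrip().split("\t")
            if (fields[0], int(fields[1])) in masked_positions:
                continue
            # Keep only the GT part of each sample's FORMAT fields
            genotype_rows.append([gt.split(":")[0] for gt in fields[9:]])
    return {
        (samples[i], samples[j]): sum(row[i] != row[j] for row in genotype_rows)
        for i in range(len(samples))
        for j in range(i + 1, len(samples))
    }

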
def test_regenotype_pipeline():
    """test the nextflow regenotype pipeline"""
    outdir = "tmp.nextflow_regeno_test.out"
    utils.rm_rf(outdir)
    os.mkdir(outdir)
    manifest = "tmp.nextflow_regeno_test.tsv"
    _write_manifest(os.path.join(outdir, manifest))
    regeno_nf = os.path.join(minos_dir, "nextflow", "regenotype.nf")
    regeno_config = os.path.join(minos_dir, "nextflow", "regenotype.config")
    dag = "tmp.nextflow_regeno_test.dag.pdf"
    ref_fasta = os.path.join(data_dir, "data.ref.fa")
    mask_bed = os.path.join(data_dir, "mask.bed")
    command = (
        f"nextflow run -c {regeno_config} -profile tiny -with-dag {dag} "
        f"{regeno_nf} --make_distance_matrix --mask_bed_file {mask_bed} "
        f"--max_variants_per_sample 10 --ref_fasta {ref_fasta} "
        f"--manifest {manifest} --outdir OUT"
    )
    utils.syscall(command, cwd=outdir)

    expect_failed_samples = os.path.join(data_dir, "expect.failed_samples.txt")
    got_failed_samples = os.path.join(outdir, "OUT", "failed_samples.txt")
    assert filecmp.cmp(got_failed_samples, expect_failed_samples, shallow=False)
    expect_dist_matrix = os.path.join(data_dir, "expect.distance_matrix.txt")
    got_dist_matrix = os.path.join(outdir, "OUT", "distance_matrix.txt")
    assert filecmp.cmp(got_dist_matrix, expect_dist_matrix, shallow=False)

    # We don't know the order of lines in the manifest tsv, or the filename
    # that will be given to each sample. Load each VCF and check that it
    # matches the sample name from the manifest. Also check that the info in
    # the json and tsv files matches.
    manifest_json = os.path.join(outdir, "OUT", "manifest.json")
    assert os.path.exists(manifest_json)
    manifest_tsv = os.path.join(outdir, "OUT", "manifest.tsv")
    with open(manifest_json) as f:
        manifest_data = json.load(f)
    with open(manifest_tsv) as f:
        reader = csv.DictReader(f, delimiter="\t")
        for d in reader:
            vcf = os.path.join(outdir, "OUT", d["vcf_file"])
            assert d["sample"] == vcf_file_read.get_sample_name_from_vcf_file(vcf)
            assert os.path.exists(os.path.join(outdir, "OUT", d["log_file"]))
            assert manifest_data[d["sample"]]["log_file"] == d["log_file"]
            assert manifest_data[d["sample"]]["vcf_file"] == d["vcf_file"]

    utils.rm_rf(outdir)


def test_make_per_sample_vcfs_dir():
    """test make_per_sample_vcfs_dir"""
    manifest_file = "tmp.make_per_sample_vcfs_dir.tsv"
    indir = os.path.join(data_dir, "make_per_sample_vcfs_dir")
    minos_indirs = {}
    with open(manifest_file, "w") as f:
        for i in range(1, 6):
            minos_dir = os.path.join(indir, f"minos.{i}")
            print(f"sample.{i}", minos_dir, sep="\t", file=f)
            minos_indirs[f"sample.{i}"] = minos_dir

    root_out = "tmp.make_per_sample_vcfs_dir.out"
    utils.rm_rf(root_out)
    regeno_helper.make_per_sample_vcfs_dir(
        manifest_file, root_out, samples_per_dir=2, cpus=2
    )
    os.unlink(manifest_file)
    expect_tsv = os.path.join(data_dir, "make_per_sample_vcfs_dir.expect.tsv")
    got_tsv = os.path.join(root_out, "manifest.tsv")
    assert filecmp.cmp(expect_tsv, got_tsv, shallow=False)
    expect_json = os.path.join(data_dir, "make_per_sample_vcfs_dir.expect.json")
    got_json = os.path.join(root_out, "manifest.json")
    assert filecmp.cmp(expect_json, got_json, shallow=False)
    with open(got_json) as f:
        json_data = json.load(f)
    for sample, minos_dir in minos_indirs.items():
        original_vcf = os.path.join(minos_dir, "debug.calls_with_zero_cov_alleles.vcf")
        original_log = os.path.join(minos_dir, "log.txt")
        new_vcf = os.path.join(root_out, json_data[sample]["vcf_file"])
        new_log = os.path.join(root_out, json_data[sample]["log_file"])
        assert _file_contents_the_same(original_vcf, new_vcf)
        assert _file_contents_the_same(original_log, new_log)
    utils.rm_rf(root_out)


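# The helper _file_contents_the_same (used above) is not shown in this
# section. A plausible sketch, assuming it only needs to compare a plain
# file against its gzipped copy by content; the name and body here are
# assumptions, not the module's actual helper.
def _example_file_contents_the_same(plain_file, gzipped_file):
    import gzip

    with open(plain_file, "rb") as f:
        plain_bytes = f.read()
    with gzip.open(gzipped_file, "rb") as f:
        gzipped_bytes = f.read()
    return plain_bytes == gzipped_bytes

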
def run(self):
    self.build_output_dir()

    fh = logging.FileHandler(self.log_file, mode="w")
    log = logging.getLogger()
    formatter = logging.Formatter(
        "[minos %(asctime)s %(levelname)s] %(message)s", datefmt="%d-%m-%Y %H:%M:%S"
    )
    fh.setFormatter(formatter)
    log.addHandler(fh)
    logging.info("Command run: " + " ".join(sys.argv))

    to_check = [
        "gramtools",
        "vcfbreakmulti",
        "vcfallelicprimitives",
        "vcfuniq",
        "vt",
    ]
    dependencies.check_and_report_dependencies(programs=to_check)
    logging.info("Dependencies look OK")

    self.ref_fasta = os.path.join(self.outdir, "ref.fa")
    utils.fasta_to_upper_and_ACGT_only(self.original_ref_fasta, self.ref_fasta)

    if self.read_error_rate is None:
        logging.info("read_error_rate unknown. Estimating from the first 10,000 reads...")
        (
            estimated_read_length,
            estimated_read_error_rate,
        ) = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
            self.reads_files[0]
        )
        logging.info(f"Estimated read_error_rate={estimated_read_error_rate}")
        self.read_error_rate = estimated_read_error_rate
    logging.info(f"Using read_error_rate={self.read_error_rate}")

    if self.user_supplied_gramtools_build_dir:
        logging.info(
            "User supplied gramtools build dir. Assuming VCF is already clustered, so skipping clustering"
        )
        assert len(self.vcf_files) == 1
        self.clustered_vcf = self.vcf_files[0]
    elif not self.cluster_input_vcfs:
        logging.info("Skipping VCF clustering because the user requested to skip it")
    else:
        logging.info(
            "Clustering VCF file(s), to make one VCF input file for gramtools"
        )
        tracker = variant_tracking.VariantTracker(self.cluster_dir, self.ref_fasta)
        tracker.merge_vcf_files(self.vcf_files)
        tracker.cluster(self.clustered_vcf_prefix, float("Inf"), max_alleles=5000)
        if not self.debug:
            os.unlink(f"{self.clustered_vcf_prefix}.excluded.tsv")
            utils.rm_rf(self.cluster_dir)
        logging.info("Finished clustering VCF file(s)")

    if not vcf_file_read.vcf_file_has_at_least_one_record(self.clustered_vcf):
        error_message = (
            "No VCF records. Cannot continue. Please check that the input "
            "VCF files contained at least one variant"
        )
        logging.error(error_message)
        raise Exception(error_message)

    if (
        self.total_splits is not None
        or self.variants_per_split is not None
        or self.alleles_per_split is not None
        or os.path.exists(os.path.join(self.split_input_dir, "data.pickle"))
    ):
        self._run_gramtools_with_split_vcf()
    else:
        self._run_gramtools_not_split_vcf()

    logging.info("All done! Thank you for using minos :)")


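# For context on the estimation step above: a Phred quality score Q encodes
# an error probability of 10^(-Q/10). A self-contained sketch of how a read
# error rate could be estimated from the first 10,000 reads of a FASTQ file
# (assuming the standard +33 quality offset; this is illustrative, not the
# actual utils.estimate_max_read_length_and_read_error_rate_from_qual_scores):
def _example_read_error_rate(fastq_path, max_reads=10000):
    total_error_prob = 0.0
    total_bases = 0
    with open(fastq_path) as f:
        for i, line in enumerate(f):
            if i // 4 >= max_reads:
                break
            if i % 4 == 3:  # the 4th line of each FASTQ record is the quality string
                for qual_char in line.rstrip():
                    total_error_prob += 10 ** (-(ord(qual_char) - 33) / 10)
                    total_bases += 1
    return total_error_prob / total_bases if total_bases else None

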
def make_per_sample_vcfs_dir(
    sample_data_tsv, root_outdir, original_manifest=None, samples_per_dir=1000, cpus=1
):
    vcf_root_out = "VCFs"
    logs_root_out = "Logs"
    if not os.path.exists(root_outdir):
        os.mkdir(root_outdir)
    # Clear any output left by a previous run inside root_outdir
    utils.rm_rf(os.path.join(root_outdir, vcf_root_out))
    utils.rm_rf(os.path.join(root_outdir, logs_root_out))
    os.mkdir(os.path.join(root_outdir, vcf_root_out))
    os.mkdir(os.path.join(root_outdir, logs_root_out))
    sample_number = 0
    tsv_out = os.path.join(root_outdir, "manifest.tsv")
    utils.rm_rf(tsv_out)
    json_out = os.path.join(root_outdir, "manifest.json")
    utils.rm_rf(json_out)
    data = {}
    parallel_jobs_data = []

    with open(sample_data_tsv) as f_in, open(tsv_out, "w") as f_out:
        print("sample", "vcf_file", "log_file", sep="\t", file=f_out)
        for line in f_in:
            # Start a new numbered subdirectory every samples_per_dir samples
            if sample_number % samples_per_dir == 0:
                outdir = str(sample_number // samples_per_dir)
                vcf_dir = os.path.join(vcf_root_out, outdir)
                vcf_dir_full = os.path.join(root_outdir, vcf_dir)
                os.mkdir(vcf_dir_full)
                log_dir = os.path.join(logs_root_out, outdir)
                log_dir_full = os.path.join(root_outdir, log_dir)
                os.mkdir(log_dir_full)

            sample_name, minos_indir = line.rstrip().split()
            vcf_in = os.path.join(minos_indir, "debug.calls_with_zero_cov_alleles.vcf")
            log_in = os.path.join(minos_indir, "log.txt")
            # Paths written to the manifest are relative to root_outdir
            vcf_out = os.path.join(vcf_dir, f"{sample_number}.vcf.gz")
            vcf_out_full = os.path.join(root_outdir, vcf_out)
            log_out = os.path.join(log_dir, f"{sample_number}.log.gz")
            log_out_full = os.path.join(root_outdir, log_out)
            parallel_jobs_data.append((vcf_in, vcf_out_full))
            parallel_jobs_data.append((log_in, log_out_full))
            sample_number += 1
            print(sample_name, vcf_out, log_out, sep="\t", file=f_out)
            data[sample_name] = {"vcf_file": vcf_out, "log_file": log_out}

    # Compress all the VCFs and logs in parallel
    with multiprocessing.Pool(cpus) as pool:
        pool.map(compress_file, parallel_jobs_data)

    with open(json_out, "w") as f:
        json.dump(data, f, indent=2, sort_keys=True)

    if original_manifest is None:
        return

    # Report any samples from the original manifest that produced no output
    expect_samples = manifest_to_set_of_sample_names(original_manifest)
    failed_samples = expect_samples.difference(data)
    if len(failed_samples) > 0:
        with open(os.path.join(root_outdir, "failed_samples.txt"), "w") as f:
            print(*sorted(failed_samples), sep="\n", file=f)
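

# Example usage (hypothetical paths): lay out per-sample VCFs and logs from a
# two-column TSV of "sample_name<TAB>minos_output_dir", 1000 samples per
# subdirectory, compressing with 4 worker processes:
#
#     make_per_sample_vcfs_dir(
#         "sample_dirs.tsv",
#         "per_sample_vcfs",
#         original_manifest="original_manifest.tsv",
#         samples_per_dir=1000,
#         cpus=4,
#     )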