def test_consensus_fasta(self):
    """Check contig names and FASTA contents produced by `synth_fasta`.

    `Variant.reference_sequence` is patched to return a fixed sequence, so
    the test also pins the region string the variant requests from it.
    """
    ref_seq = "GGCTGCGGGGAGGGGGGCGCGGGTCCGCAGTGGGGCTGTGGGAGGGGTCCGCGCGTCCGCAGTGGGGATGTG"

    with patch.object(Variant, "reference_sequence", return_value=ref_seq) as mock_ref:
        first_record = next(vcf.Reader(self.vcf_file))
        self.assertTrue(first_record.is_sv)

        variant = Variant.from_pyvcf(first_record, None)
        fasta_path, ref_contig, alt_contig = variant.synth_fasta(
            self.args, line_width=sys.maxsize
        )

        self.assertEqual(ref_contig, "1_899922_899993")
        self.assertEqual(alt_contig, "1_899922_899993_alt")
        mock_ref.assert_called_once_with(region="1:899922-899993")

        with open(fasta_path, "r") as handle:
            lines = [raw.strip() for raw in handle]

    # Two records are expected: the reference contig, then the alternate contig.
    self.assertEqual(
        lines,
        [">1_899922_899993", ref_seq, ">1_899922_899993_alt", "GG"],
    )
def test_consensus_fasta(self):
    """Check the FASTA written by `synth_fasta` against the returned contig names.

    `Variant.reference_sequence` is patched to return a fixed sequence, so
    the test also pins the region string the variant requests from it.
    """
    ref_seq = "GAACCTGGGAGGCAGAGCTTGCAGTGAGCAGAGATCATGCCACTGCACTCCAGCCTGGGCGACAGAGCAAGACTCCGTCTCAAAAAAAAAAAAAAAATTAGCCAGGCGTGGTGGCGGGCGCCTGTAGTCCCAGCTACTCAGGAGGCTGAGGCAGGAGAATGGCATGA"

    with patch.object(Variant, "reference_sequence", return_value=ref_seq) as mock_ref:
        first_record = next(vcf.Reader(self.vcf_file))
        self.assertTrue(first_record.is_sv)

        variant = Variant.from_pyvcf(first_record, None)
        self.assertIsNotNone(variant)

        fasta_path, ref_contig, alt_contig = variant.synth_fasta(
            self.args, line_width=sys.maxsize
        )
        mock_ref.assert_called_once_with(region="4:32197282-32197448")

        with open(fasta_path, "r") as handle:
            lines = [raw.strip() for raw in handle]

    # Two records are expected: the reference contig, then the alternate contig.
    self.assertEqual(
        lines,
        [">4_32197282_32197448", ref_seq, ">4_32197282_32197448_alt", "GA"],
    )
    # The headers in the file must agree with the names synth_fasta returned.
    self.assertEqual(lines[0], f">{ref_contig}")
    self.assertEqual(lines[2], f">{alt_contig}")
def test_consensus_fasta(self):
    """Check the FASTA written by `synth_fasta` for a variant with explicit alleles.

    `Variant.reference_sequence` is patched to return a fixed sequence, so
    the test also pins the region string the variant requests from it.
    """
    ref_seq = "GTATATATATATAGATCTATATATCTATATATAGATCTATATATAGATATATATCTATATATATAGATATATAGATATATAGATCTATATATAGATATATATATCTATATATAGATCTATATATAGATATAGATATCTATATAGATATCTATATCTATATATATGTAGATATATAGATATAGATATCTATATATCTATATATATAGATATCTATAGATATATATCTATATAGATATATCTATATCTATATATAGATATATATCTATATATAGATATATATCTATATATAGATAGATATATATCTATATATAGATATATCTATATCTATATATAGATATATATCTATATATAGATATATCTATATATAGATATATATCTATAGATATATCTATATATATCGATATATCTATATATATCGATATATAT"
    alt_seq = "GATATATATAGATATATCTATATATATCTATATAGATATATCTATATCTATATAGATATATCTATATATATATAGATATATCTATATCTATATAGATATATATCTATATATATATCTATATAGATATATCTATATAGATATAGATATATATCTATATATAGATATAGATATATCTATATAGATATATATCTATAGATATCTATATATATAGATATATAGATATCTATATCTATATT"

    with patch.object(Variant, "reference_sequence", return_value=ref_seq) as mock_ref:
        first_record = next(vcf.Reader(self.vcf_file))
        self.assertTrue(first_record.is_sv)

        variant = Variant.from_pyvcf(first_record, None)
        fasta_path, ref_contig, alt_contig = variant.synth_fasta(
            self.args, line_width=sys.maxsize
        )
        mock_ref.assert_called_once_with(region="4:20473845-20474270")

        with open(fasta_path, "r") as handle:
            lines = [raw.strip() for raw in handle]

    # Two records are expected: the reference contig, then the alternate contig.
    self.assertEqual(
        lines,
        [">4_20473845_20474270", ref_seq, ">4_20473845_20474270_alt", alt_seq],
    )
    # The headers in the file must agree with the names synth_fasta returned.
    self.assertEqual(lines[0], f">{ref_contig}")
    self.assertEqual(lines[2], f">{alt_contig}")
def test_pipeline_straddle_counting(self):
    """Count straddling read pairs around both breakpoints of each test variant.

    Reads are gathered from the bundled BAM file, and the resulting counts
    are compared against fixed expected values.
    """
    for record in vcf.Reader(self.vcf_file):
        self.assertTrue(record.is_sv)
        variant = Variant.from_pyvcf(record, None)
        flank = self.args.flank

        bam_path = os.path.join(FILE_DIR, "1_2073761_2073846_DEL_2.bam")
        sample = Sample.from_npsv(os.path.join(FILE_DIR, "stats.json"), bam_path)

        fragments = npsva.RealignedFragments(
            self.input_fasta,
            sample.mean_insert_size,
            sample.std_insert_size,
            sample.insert_size_density().as_dict(),
            bam_path,
        )
        fragments.gather_reads(variant.region_string(flank=flank))
        self.assertEqual(fragments.size(), 254)

        # One base on either side of each breakpoint.
        left_bp = variant.left_flank_region_string(left_flank=1, right_flank=1)
        right_bp = variant.right_flank_region_string(left_flank=1, right_flank=1)

        straddlers = fragments.count_pipeline_straddlers(
            left_bp,
            right_bp,
            flank,
            -variant.event_length,
            1.5,
            10,
        )

        upper_fraction = straddlers["insert_upper"] / straddlers["insert_count"]
        self.assertAlmostEqual(straddlers["alt_weighted_count"], 13.496, places=1)
        self.assertAlmostEqual(straddlers["insert_lower"], 0.0, places=2)
        self.assertAlmostEqual(upper_fraction, 0.166, places=2)
def test_realigned_read_counting(self):
    """Count realigned reads at the reference and alternate breakpoints.

    Reads are gathered from the bundled BAM file and realigned against the
    `1_2073761_2073846_DEL` reference/alternate contigs; the per-breakpoint
    counts and the number of returned read names are then checked.
    """
    for record in vcf.Reader(self.vcf_file):
        self.assertTrue(record.is_sv)
        variant = Variant.from_pyvcf(record, None)

        input_bam = os.path.join(FILE_DIR, "1_2073761_2073846_DEL_2.bam")
        sample = Sample.from_npsv(os.path.join(FILE_DIR, "stats.json"), input_bam)
        fragments = npsva.RealignedFragments(
            self.input_fasta,
            sample.mean_insert_size,
            sample.std_insert_size,
            sample.insert_size_density().as_dict(),
            input_bam,
        )
        fragments.gather_reads(variant.region_string(flank=self.args.flank))
        self.assertEqual(fragments.size(), 254)

        ref_contig = "1_2073761_2073846_DEL"
        alt_contig = "1_2073761_2073846_DEL_alt"

        # Breakpoints are expressed in contig coordinates, offset by the flank.
        rl_breakpoint = f"{ref_contig}:{self.args.flank}-{self.args.flank+1}"
        al_breakpoint = f"{alt_contig}:{self.args.flank}-{self.args.flank+1}"

        ref_length = variant.ref_length
        rr_breakpoint = f"{ref_contig}:{self.args.flank + ref_length - 1}-{self.args.flank + ref_length}"

        # The fourth ("ar") breakpoint is passed as an empty string.
        counts, read_names = fragments.count_realigned_reads(
            [(rl_breakpoint, rr_breakpoint, al_breakpoint, "")]
        )
        self.assertEqual(counts["al"], 18.0)
        self.assertEqual((counts["rl"] + counts["rr"]) / 2, 4.0)

        # Each breakpoint is checked once (the original listed "rl" twice).
        # NOTE(review): the duplicate may have been meant as "ar" - confirm
        # whether read names are reported for the empty breakpoint.
        for bp in ("rl", "rr", "al"):
            self.assertEqual(len(read_names[bp]), counts[bp])
def test_complex_breakpoint_arguments(self, mock_fragments):
    """Check the breakpoints passed on for a deletion with a non-trivial ALT.

    The record replaces REF `AACGGTT` with ALT `ACGT`; the test verifies the
    breakpoint tuple that `count_realigned_reads` forwards to the (mocked)
    fragments object.
    """
    vcf_text = """##fileformat=VCFv4.1
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
##ALT=<ID=DEL,Description="Deletion">
##contig=<ID=1,length=249250621>
#CHROM POS ID REF ALT QUAL FILTER INFO
1 100 . AACGGTT ACGT . PASS SVTYPE=DEL;END=106;SVLEN=-3
"""
    expected_breakpoints = [("ref:1-2", "ref:7-8", "alt:1-2", "alt:4-5")]

    for record in vcf.Reader(io.StringIO(vcf_text)):
        variant = Variant.from_pyvcf(record, None)
        self.assertIsNotNone(variant)

        # The mock reports zero reads for every breakpoint and no read names.
        zero_counts = {"rl": 0, "rr": 0, "al": 0, "ar": 0}
        mock_fragments.count_realigned_reads.return_value = (zero_counts, [])

        count_realigned_reads(self.args, mock_fragments, variant)
        mock_fragments.count_realigned_reads.assert_called_once_with(
            expected_breakpoints,
            count_straddle=True,
        )
def test_count_realigned_alleles(self):
    """Check the reference and alternate read counts from `count_realigned_reads`.

    Reads are gathered for each test variant and realigned against the
    `1_2073761_2073846_DEL` reference/alternate contigs.
    """
    for record in vcf.Reader(self.vcf_file):
        variant = Variant.from_pyvcf(record, None)

        fragments = npsva.RealignedFragments(
            self.input_fasta,
            self.sample.mean_insert_size,
            self.sample.std_insert_size,
            self.sample.insert_size_density().as_dict(),
            self.input_bam,
        )
        fragments.gather_reads(variant.region_string(flank=self.args.flank))
        self.assertGreater(fragments.size(), 0)

        hom_alt_ref, hom_alt_alt, _ = count_realigned_reads(
            self.args,
            fragments,
            variant,
            input_fasta=self.input_fasta,
            ref_contig="1_2073761_2073846_DEL",
            alt_contig="1_2073761_2073846_DEL_alt",
        )

        # assertTrue(value, 4.0) treats 4.0 as the failure message and only
        # checks truthiness; assertEqual actually compares the counts.
        self.assertEqual(hom_alt_ref, 4.0)
        self.assertEqual(hom_alt_alt, 18.0)
def test_consensus_fasta(self):
    """Check contig names and FASTA contents produced by `synth_fasta`.

    `Variant.reference_sequence` is patched to return a fixed sequence, so
    the test also pins the region string the variant requests from it.
    """
    ref_seq = "TCTCCATATGATGTCAGTGTCCTCCATATGATGTCAGTGTCCTCCATATGACATCAATATCCTCCATATGATATCAATATCCTCTGTATTGATATTGATATTGATATTTGGAGGATATCAATATCCTCCAAATGATGTCAGTGTCCTCCATATGATGTCAATGTCCTCCATATGATGTCAATATCCTCCGTATGATGTCAATATCCTCCGTATGATGTCAATATCCTCCATATGATGTCAGTGTCCTCTGTATGAC"
    alt_seq = "TCTCCATATGATGTCAGTGTCCTCCATATGATGTCAGTGTCCTCCATATGACATCAATATCCTCCATATGATATCAATATCCTCTGTATTGATATTGATATTGATATTTGGAGGATATCAATATCCTCCAAATGATGTCAGTGTCCTCCATATGATGTCAATGTCCTCCATATGATGTCAATATCCTCCGTATGATGTCAATATCCTCCGTATGATGTCAATATCCTCCATATGATGTCAGTGTCCTCTGTATGACTCCATATGATGTCAGTGTCCTCCATATGATGTCAGTGTCCTCCATATGACATCAATATCCTCCATATGATATCAATATCCTCTGTATTGATATTGATATTGATATTTGGAGGATATCAATATCCTCCAAATGATGTCAGTGTCCTCCATATGATGTCAATGTCCTCCATATGATGTCAATATCCTCCGTATGATGTCAATATCCTCCGTATGATGTCAATATCCTCCATATGATGTCAGTGTCCTCTGTATGAC"

    with patch.object(Variant, "reference_sequence", return_value=ref_seq) as mock_ref:
        first_record = next(vcf.Reader(self.vcf_file))
        variant = Variant.from_pyvcf(first_record, None)
        self.assertIsNotNone(variant)

        fasta_path, ref_contig, alt_contig = variant.synth_fasta(
            self.args, line_width=sys.maxsize
        )
        mock_ref.assert_called_once_with(region="1:4999478-4999733")
        self.assertEqual(ref_contig, "1_4999478_4999733")
        self.assertEqual(alt_contig, "1_4999478_4999733_alt")

        with open(fasta_path, "r") as handle:
            lines = [raw.strip() for raw in handle]

    # Two records are expected: the reference contig, then the alternate contig.
    self.assertEqual(
        lines,
        [">1_4999478_4999733", ref_seq, ">1_4999478_4999733_alt", alt_seq],
    )
def test_region_strings(self):
    """Check the variant region string and both asymmetric flank regions."""
    variant = Variant.from_pyvcf(next(vcf.Reader(self.vcf_file)), None)
    flanks = {"left_flank": 2, "right_flank": 5}

    self.assertEqual(variant.region_string(), "1:899923-899992")

    left_region = variant.left_flank_region_string(**flanks)
    self.assertEqual(left_region, "1:899921-899927")

    right_region = variant.right_flank_region_string(**flanks)
    self.assertEqual(right_region, "1:899991-899997")
def gnomad_coverage_profile(args, gnomad_coverage: str, input_vcf: str, output_file):
    """Write a gnomAD coverage profile for the first record of a VCF file.

    Args:
        args: Parsed arguments; `ref_contig` and `alt_contig` are read from it
            and `args` itself is forwarded to the variant.
        gnomad_coverage: Forwarded to `Variant.gnomad_coverage_profile`
            (presumably the path to a gnomAD coverage summary - confirm).
        input_vcf: Path to the VCF file; only its first record is used.
        output_file: Forwarded to `Variant.gnomad_coverage_profile`.

    Raises:
        ValueError: If the first record cannot be converted to a `Variant`.
    """
    record = next(vcf.Reader(filename=input_vcf))
    variant = Variant.from_pyvcf(record)
    # Other callers in this file treat a None result as an unsupported record;
    # fail with a clear message instead of an AttributeError on None.
    if variant is None:
        raise ValueError(f"Unsupported variant in first record of {input_vcf}")

    variant.gnomad_coverage_profile(
        args,
        gnomad_coverage,
        output_file,
        ref_contig=args.ref_contig,
        alt_contig=args.alt_contig,
    )
def test_feature_extraction(self):
    """Run `extract_features` on each structural variant in the test VCF.

    The returned features are not inspected; the test only requires the
    extraction to complete without raising.
    """
    contigs = {
        "ref_contig": "1_2073761_2073846_DEL",
        "alt_contig": "1_2073761_2073846_DEL_alt",
    }
    for record in vcf.Reader(self.vcf_file):
        self.assertTrue(record.is_sv)
        variant = Variant.from_pyvcf(record, None)

        extract_features(
            self.args,
            variant,
            self.input_bam,
            self.sample,
            input_fasta=self.input_fasta,
            **contigs,
        )
def test_breakpoints(self):
    """Check the flank regions and ref/alt breakpoints of the first variant."""
    variant = Variant.from_pyvcf(next(vcf.Reader(self.vcf_file)), None)
    one_base = {"left_flank": 1, "right_flank": 1}

    left_region = variant.left_flank_region_string(**one_base)
    self.assertEqual(left_region, "4:32197282-32197283")

    right_region = variant.right_flank_region_string(**one_base)
    self.assertEqual(right_region, "4:32197447-32197448")

    ref_bps = variant.ref_breakpoints(self.args.flank)
    self.assertEqual(ref_bps, ("4:1-2", "4:166-167"))

    # Only a single alternate breakpoint is expected for this variant.
    alt_bps = variant.alt_breakpoints(self.args.flank)
    self.assertEqual(alt_bps, ("4:1-2", None))
def test_breakpoints(self):
    """Check the flank regions and ref/alt breakpoints of the first variant."""
    variant = Variant.from_pyvcf(next(vcf.Reader(self.vcf_file)), None)
    one_base = {"left_flank": 1, "right_flank": 1}

    left_region = variant.left_flank_region_string(**one_base)
    self.assertEqual(left_region, "4:20473845-20473846")

    right_region = variant.right_flank_region_string(**one_base)
    self.assertEqual(right_region, "4:20474269-20474270")

    ref_bps = variant.ref_breakpoints(self.args.flank)
    self.assertEqual(ref_bps, ("4:1-2", "4:425-426"))

    # Two alternate breakpoints are expected for this variant.
    alt_bps = variant.alt_breakpoints(self.args.flank)
    self.assertEqual(alt_bps, ("4:1-2", "4:235-236"))
def test_breakpoints(self):
    """Check the flank regions and ref/alt breakpoints of the first variant."""
    variant = Variant.from_pyvcf(next(vcf.Reader(self.vcf_file)), None)
    one_base = {"left_flank": 1, "right_flank": 1}

    left_region = variant.left_flank_region_string(**one_base)
    self.assertEqual(left_region, "8:79683397-79683398")

    right_region = variant.right_flank_region_string(**one_base)
    self.assertEqual(right_region, "8:79683487-79683488")

    ref_bps = variant.ref_breakpoints(self.args.flank)
    self.assertEqual(ref_bps, ("8:1-2", "8:91-92"))

    # Two alternate breakpoints are expected for this variant.
    alt_bps = variant.alt_breakpoints(self.args.flank)
    self.assertEqual(alt_bps, ("8:1-2", "8:2-3"))
def test_breakpoints(self):
    """Check the flank regions and ref/alt breakpoints of the first variant."""
    variant = Variant.from_pyvcf(next(vcf.Reader(self.vcf_file)), None)
    self.assertIsNotNone(variant)
    one_base = {"left_flank": 1, "right_flank": 1}

    left_region = variant.left_flank_region_string(**one_base)
    self.assertEqual(left_region, "1:899922-899923")

    right_region = variant.right_flank_region_string(**one_base)
    self.assertEqual(right_region, "1:899992-899993")

    ref_bps = variant.ref_breakpoints(self.args.flank)
    self.assertEqual(ref_bps, ("1:1-2", "1:71-72"))

    # Only a single alternate breakpoint is expected for this variant.
    alt_bps = variant.alt_breakpoints(self.args.flank)
    self.assertEqual(alt_bps, ("1:1-2", None))
def test_breakpoints(self):
    """Check the flank regions and ref/alt breakpoints of the first variant."""
    variant = Variant.from_pyvcf(next(vcf.Reader(self.vcf_file)), None)
    self.assertIsNotNone(variant)
    one_base = {"left_flank": 1, "right_flank": 1}

    left_region = variant.left_flank_region_string(**one_base)
    self.assertEqual(left_region, "1:4999478-4999479")

    right_region = variant.right_flank_region_string(**one_base)
    self.assertEqual(right_region, "1:4999732-4999733")

    ref_bps = variant.ref_breakpoints(self.args.flank)
    self.assertEqual(ref_bps, ("1:1-2", "1:255-256"))

    # Only a single alternate breakpoint is expected for this variant.
    alt_bps = variant.alt_breakpoints(self.args.flank)
    self.assertEqual(alt_bps, ("1:255-256", None))
def test_read_gather_call(self):
    """Check that feature extraction gathers reads exactly once per variant.

    `RealignedFragments.gather_reads` is patched so that only the number of
    calls is observed.
    """
    contigs = {
        "ref_contig": "1_2073761_2073846_DEL",
        "alt_contig": "1_2073761_2073846_DEL_alt",
    }
    for record in vcf.Reader(self.vcf_file):
        gather_patch = patch.object(
            npsva.RealignedFragments, "gather_reads", return_value=0
        )
        with gather_patch as mock_gather:
            variant = Variant.from_pyvcf(record, None)
            extract_features(
                self.args,
                variant,
                self.input_bam,
                self.sample,
                input_fasta=self.input_fasta,
                **contigs,
            )

            # Since this is a small variant, there should be only one
            # gather_reads call.
            mock_gather.assert_called_once()
def test_variant_properties(self):
    """Check the type and the ref/alt lengths reported for the first variant.

    The expected lengths are taken from the literal sequences below rather
    than hard-coded numbers.
    """
    expected_ref = "TATATATATATAGATCTATATATCTATATATAGATCTATATATAGATATATATCTATATATATAGATATATAGATATATAGATCTATATATAGATATATATATCTATATATAGATCTATATATAGATATAGATATCTATATAGATATCTATATCTATATATATGTAGATATATAGATATAGATATCTATATATCTATATATATAGATATCTATAGATATATATCTATATAGATATATCTATATCTATATATAGATATATATCTATATATAGATATATATCTATATATAGATAGATATATATCTATATATAGATATATCTATATCTATATATAGATATATATCTATATATAGATATATCTATATATAGATATATATCTATAGATATATCTATATATATCGATATATCTATATATATCGATATATA"
    expected_alt = "ATATATATAGATATATCTATATATATCTATATAGATATATCTATATCTATATAGATATATCTATATATATATAGATATATCTATATCTATATAGATATATATCTATATATATATCTATATAGATATATCTATATAGATATAGATATATATCTATATATAGATATAGATATATCTATATAGATATATATCTATAGATATCTATATATATAGATATATAGATATCTATATCTATAT"

    first_record = next(vcf.Reader(self.vcf_file))
    self.assertTrue(first_record.is_sv)

    variant = Variant.from_pyvcf(first_record, None)
    self.assertIsNotNone(variant)
    self.assertTrue(variant.is_deletion)

    self.assertEqual(variant.ref_length, len(expected_ref))
    self.assertEqual(variant.alt_length, len(expected_alt))
def test_ci_for_precise_variants(self):
    """Check that a sequence-resolved deletion is precise with a zero CIPOS.

    The record carries explicit REF/ALT alleles and no CIPOS/CIEND fields.
    """
    vcf_text = """##fileformat=VCFv4.1
##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants">
##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
##ALT=<ID=DEL,Description="Deletion">
##contig=<ID=1,length=249250621>
#CHROM POS ID REF ALT QUAL FILTER INFO
1 2827694 rs2376870 CGTGGATGCGGGGAC C . PASS SVTYPE=DEL;END=2827708;SVLEN=-14
"""
    for record in vcf.Reader(io.StringIO(vcf_text)):
        self.assertTrue(record.is_sv)

        variant = Variant.from_pyvcf(record, None)
        self.assertTrue(variant.is_precise)

        # The default of 10 passed to get_ci must not be used here.
        cipos = variant.get_ci("CIPOS", 10)
        self.assertEqual(cipos, [0, 0])
def test_manta_vcf(self):
    """Check the alternate sequence built for a Manta-style `<INS>` record.

    The record uses a symbolic ALT allele and supplies the inserted bases
    through the SVINSSEQ INFO field.
    """
    vcf_text = """##fileformat=VCFv4.1
##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants">
##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
##INFO=<ID=SVINSLEN,Number=.,Type=Integer,Description="Length of insertion">
##INFO=<ID=SVINSSEQ,Number=.,Type=String,Description="Sequence of insertion">
##ALT=<ID=INS,Description="Insertion">
##contig=<ID=3,length=198022430>
#CHROM POS ID REF ALT QUAL FILTER INFO
3 72386664 MantaINS:1:5470:5470:0:0:0 G <INS> 999 MaxDepth END=72386664;SVTYPE=INS;SVLEN=10;CIPOS=0,9;CIEND=0,9;SVINSLEN=10;SVINSSEQ=GTGTGTGTGC
"""
    first_record = next(vcf.Reader(io.StringIO(vcf_text)))
    variant = Variant.from_pyvcf(first_record, None)

    # Expected result is the SVINSSEQ bases placed between "G" and "T".
    alt_sequence = variant._alt_seq(flank=1, ref_seq="GT")
    self.assertEqual(alt_sequence, "GGTGTGTGTGCT")
def test_ci_for_imprecise_variants(self):
    """Check confidence intervals for a symbolic deletion that has only CIPOS.

    CIPOS comes from the record; CIEND is absent, so the supplied default of
    10 is expected to produce a symmetric interval.
    """
    vcf_text = """##fileformat=VCFv4.1
##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants">
##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
##ALT=<ID=DEL,Description="Deletion">
##contig=<ID=2,length=243199373>
#CHROM POS ID REF ALT QUAL FILTER INFO
2 321682 . T <DEL> . PASS SVTYPE=DEL;END=321887;SVLEN=-205;CIPOS=-56,20
"""
    for record in vcf.Reader(io.StringIO(vcf_text)):
        self.assertTrue(record.is_sv)

        variant = Variant.from_pyvcf(record, None)
        self.assertFalse(variant.is_precise)

        cipos = variant.get_ci("CIPOS", 10)
        self.assertEqual(cipos, [-56, 20])

        ciend = variant.get_ci("CIEND", 10)
        self.assertEqual(ciend, [-10, 10])
def test_gnomad_coverage_profile(self):
    """Check the contig names and output of `Variant.gnomad_coverage_profile`.

    The profile is generated from the bundled gnomAD coverage summary file.
    """
    first_record = next(vcf.Reader(self.vcf_file))
    self.assertTrue(first_record.is_sv)
    variant = Variant.from_pyvcf(first_record, None)

    coverage_summary = os.path.join(
        FILE_DIR, "1_896922_903086.gnomad.genomes.coverage.summary.tsv.gz"
    )
    covg_path, ref_contig, alt_contig = variant.gnomad_coverage_profile(
        self.args,
        coverage_summary,
        line_width=sys.maxsize,
    )
    self.assertEqual(ref_contig, "1_899922_899993")
    self.assertEqual(alt_contig, "1_899922_899993_alt")

    with open(covg_path, "r") as handle:
        lines = [raw.strip() for raw in handle]

    # Only the second of the two expected lines has its content checked.
    self.assertEqual(len(lines), 2)
    self.assertEqual(lines[1], "1_899922_899993_alt\t=1")
def random_variants(
    variant_or_vcf_path,
    ref_path,
    genome_path,
    gap_path,
    output_file,
    n=1,
    use_X=False,
    only_sex=False,
    flank=0,
):
    """Write random variants for a single variant or for every record of a VCF.

    A header built from the loaded contigs is written to `output_file` first,
    then `random_variant` is called once per input variant.

    Args:
        variant_or_vcf_path: Either a `Variant` instance or a VCF path that
            `vcf.Reader` can open.
        ref_path: Reference FASTA path opened with `pysam.FastaFile`.
        genome_path: Forwarded to `load_contigs`.
        gap_path: Tabix-indexed file opened with `pysam.TabixFile`.
        output_file: Destination passed to `write_header` and `random_variant`.
        n: Forwarded to `random_variant` (presumably the number of random
            variants per input variant - confirm).
        use_X: Forwarded to `load_contigs`.
        only_sex: Forwarded to `load_contigs`.
        flank: Forwarded to `random_variant`.
    """
    contigs = load_contigs(genome_path, use_X=use_X, only_sex=only_sex)

    # Nested try/finally ensures both pysam handles are closed even if
    # header writing or variant generation raises.
    gaps = pysam.TabixFile(gap_path)  # pylint: disable=no-member
    try:
        ref_reader = pysam.FastaFile(ref_path)  # pylint: disable=no-member
        try:
            write_header(contigs, output_file)
            if isinstance(variant_or_vcf_path, Variant):
                random_variant(
                    variant_or_vcf_path,
                    ref_reader,
                    contigs,
                    gaps,
                    output_file,
                    n=n,
                    flank=flank,
                )
            else:
                for record in vcf.Reader(filename=variant_or_vcf_path):
                    variant = Variant.from_pyvcf(record)
                    # Records that cannot be converted to a Variant are skipped.
                    if variant is not None:
                        random_variant(
                            variant,
                            ref_reader,
                            contigs,
                            gaps,
                            output_file,
                            n=n,
                            flank=flank,
                        )
        finally:
            ref_reader.close()
    finally:
        gaps.close()
def main():
    """Command-line entry point: simulate, extract features and genotype variants.

    Steps, in order:
      1. Parse arguments, configure logging, create the output and temporary
         directories, and initialize ray.
      2. Build the `Sample` from an NPSV stats file or from explicit
         distribution parameters; exit with a usage error if neither is given.
      3. For each supported, non-duplicate record in the input VCF, write a
         single-variant indexed VCF (or reuse an existing one) and submit a
         remote `simulate_and_extract` task.
      4. Concatenate the per-variant results into `<prefix>.sim.tsv` and
         `<prefix>.real.tsv` in the output directory.
      5. Genotype the input VCF into `<prefix>.npsv.vcf`.
    """
    parser = make_argument_parser()
    args = parser.parse_args()

    logging.basicConfig(level=args.loglevel)

    # Create any directories that are needed
    logging.info(
        "Creating %s output and %s temporary directories if they don't exist",
        args.output,
        args.tempdir,
    )
    os.makedirs(args.output, exist_ok=True)
    os.makedirs(args.tempdir, exist_ok=True)

    # Initialize parallel computing setup
    ray.init(num_cpus=args.threads, _temp_dir=args.tempdir, include_dashboard=False)

    # TODO: If library is not specified compute statistics, i.e. mean insert size, etc.
    if args.stats_path is not None:
        logging.info("Extracting BAM stats from NPSV stats file")
        sample = Sample.from_npsv(
            args.stats_path, bam_path=args.bam, ped_path=args.ped_path
        )
    elif None not in (
        args.fragment_mean,
        args.fragment_sd,
        args.read_length,
        args.depth,
    ):
        logging.info("Using Normal distribution for BAM stats")
        sample = Sample.from_distribution(
            args.bam,
            args.fragment_mean,
            args.fragment_sd,
            args.read_length,
            mean_coverage=args.depth,
        )
    else:
        # ArgumentParser.error() prints the usage message and exits itself, so
        # no `raise` is needed (it never returns a value to raise).
        parser.error(
            "Library information needed. Either provide distribution parameters or run `npsvg preprocess` to generate stats file."
        )

    # Select directory for variant files
    variant_dir = args.output if args.keep_synth_bams else args.tempdir

    # For each variant generate synthetic bam file(s) and extract relevant evidence
    observed_variants = {}
    record_results = []
    vcf_reader = vcf.Reader(filename=args.input)
    for i, record in enumerate(tqdm(vcf_reader, desc="Preparing variants")):
        variant = Variant.from_pyvcf(record, args.reference)
        # npsv currently only supports deletions
        if variant is None:
            continue

        # NPSV currently does not support variants with duplicate start and end coordinates
        description = variant_descriptor(record)
        # setdefault returns the index of the first record with this
        # description, so any later record with the same description is skipped.
        if observed_variants.setdefault(description, i) != i:
            logging.warning(
                "Skipping variant with duplicate description %s", description
            )
            continue

        # Construct single variant VCF outside of worker so we don't need to pass the reader into the thread
        variant_vcf_path = os.path.join(variant_dir, description + ".vcf")
        if not args.reuse or not os.path.exists(variant_vcf_path + ".gz"):
            variant_vcf_path = write_record_to_indexed_vcf(
                record, vcf_reader, variant_vcf_path
            )
        else:
            # Variant file already exists, no need to recreate
            variant_vcf_path += ".gz"

        record_results.append(
            simulate_and_extract.remote(
                args, sample, variant, variant_vcf_path, description
            )
        )

    # Concatenate output files to create feature files
    sim_tsv_path = os.path.join(args.output, args.prefix + ".sim.tsv")
    real_tsv_path = os.path.join(args.output, args.prefix + ".real.tsv")
    logging.info(
        "Extracting features (to %s and %s)", sim_tsv_path, real_tsv_path
    )

    # Headers are written in text mode first; per-variant results are then
    # appended to the same files as raw bytes.
    with open(sim_tsv_path, "w") as file:
        Features.header(out_file=file, ac=True)
    with open(real_tsv_path, "w") as file:
        Features.header(out_file=file, ac=False)

    with open(sim_tsv_path, "ab") as sim_sink, open(real_tsv_path, "ab") as real_sink:
        for sim_result, real_result in tqdm(
            ray_iterator(record_results),
            total=len(record_results),
            desc="Extracting features",
        ):
            with open(sim_result, "rb") as source:
                shutil.copyfileobj(source, sim_sink)
            sim_sink.flush()
            with open(real_result, "rb") as source:
                shutil.copyfileobj(source, real_sink)
            real_sink.flush()

    # Perform genotyping
    with open(
        os.path.join(args.output, args.prefix + ".npsv.vcf"), "w"
    ) as gt_vcf_file:
        logging.info("Determining genotypes (output in %s)", gt_vcf_file.name)
        # Genotyping receives its own copy of the argument namespace.
        genotyping_args = argparse.Namespace(**vars(args))
        genotype_vcf(
            genotyping_args,
            args.input,
            sim_tsv_path,
            real_tsv_path,
            gt_vcf_file,
            samples=[sample.name],
        )
def test_region_string(self):
    """Check the region string of the first variant with a one-base flank."""
    first_record = next(vcf.Reader(self.vcf_file))
    variant = Variant.from_pyvcf(first_record, None)
    self.assertIsNotNone(variant)

    flanked_region = variant.region_string(flank=1)
    self.assertEqual(flanked_region, "1:4999478-4999733")
def test_region_string(self):
    """Check that `region_string` needs a flank for this variant.

    Without a flank the call must raise `ValueError`; with a one-base flank
    it must return the expected two-base region.
    """
    first_record = next(vcf.Reader(self.vcf_file))
    variant = Variant.from_pyvcf(first_record, None)

    self.assertRaises(ValueError, variant.region_string)

    flanked_region = variant.region_string(flank=1)
    self.assertEqual(flanked_region, "1:931634-931635")
def test_event_length(self):
    """Check the event length reported for the first variant in the VCF."""
    first_record = next(vcf.Reader(self.vcf_file))
    variant = Variant.from_pyvcf(first_record, None)

    expected_length = 70
    self.assertEqual(variant.event_length, expected_length)