예제 #1
0
    def test_can_build_with_one_seq(self):
        """Build a BAM from a single sequence and verify the reads in it.

        The sequence is added with ``n_fwd=2`` and ``n_rev=1``; the test
        expects three reads, each at position 0 with sequence
        ``TGTAAAAAAAT`` and CIGAR ``1M1D10M``, and expects both the BAM
        file and its ``.bai`` index to exist on disk.
        """
        ref = ReferenceChromosome("TCATAAAAAAAT")
        sequence_bank = SequenceBank(ref)
        sequence_bank.add_sequence(".*G.........",
                                   "            ",
                                   n_fwd=2,
                                   n_rev=1)

        builder = BAMBuilder(
            os.path.join(self.work_dir,
                         self.filestub + ".bam")).with_bam_contig_data(
                             self.chrom, self.chrom_length, self.sample_name,
                             sequence_bank)
        builder.build()

        bam_file = pysam.Samfile(builder.filename, "rb")
        try:
            reads = list(bam_file.fetch())
            self.assertEqual(len(reads), 3)

            for read in reads:
                self.assertEqual(read.pos, 0)
                self.assertEqual(read.seq, "TGTAAAAAAAT")
                self.assertEqual(read.cigarstring, "1M1D10M")

            self.assertTrue(os.path.isfile(bam_file.filename))
            self.assertTrue(
                os.path.isfile(bam_file.filename.decode() + ".bai"))
        finally:
            # Close the handle even when an assertion fails, so the test
            # does not leak an open file descriptor.
            bam_file.close()
예제 #2
0
 def test_should_be_able_to_add_snp_using_whitespace_dsl_syntax(self):
     """Check a whitespace-padded SNP sequence against the built reads.

     The first read built from the bank is expected to have ``pos == 2``
     and ``seq == 'ATG'``.
     """
     # Given
     reference = ReferenceChromosome("CC*AAGG")
     # When
     bank = SequenceBank(reference)
     bank.add_sequence("   .T. ")
     # Build the reads for every builder first, then flatten them in order.
     reads_per_builder = [
         read_builder.build_reads(0, {}) for read_builder in bank
     ]
     all_reads = []
     for built in reads_per_builder:
         all_reads.extend(built)
     # Then
     self.assertEqual(all_reads[0].pos, 2)
     self.assertEqual(all_reads[0].seq, 'ATG')
예제 #3
0
    def test_header_for_multisample_multicontig(self):
        """Check the header after registering two contigs for two samples.

        Nothing is built on disk; only ``builder.header`` is compared with
        the expected ``HD`` / ``SQ`` / ``RG`` dictionary.
        """
        empty_bank = SequenceBank(ReferenceChromosome(""))
        bam_path = os.path.join(self.work_dir, self.filestub + ".bam")
        builder = BAMBuilder(bam_path)
        builder.with_bam_contig_data("1", 10, "SAMPLE_ONE", empty_bank)
        builder.with_bam_contig_data("2", 20, "SAMPLE_TWO", empty_bank)

        # One SQ entry per contig and one RG entry per sample, in the
        # order they were registered above.
        sequence_entries = [
            {'LN': 10, 'SN': "1"},
            {'LN': 20, 'SN': "2"},
        ]
        read_group_entries = [
            {"ID": RG_ID + "_SAMPLE_ONE", "SM": "SAMPLE_ONE"},
            {"ID": RG_ID + "_SAMPLE_TWO", "SM": "SAMPLE_TWO"},
        ]
        expected_header = {
            'HD': {'VN': '1.0'},
            'SQ': sequence_entries,
            'RG': read_group_entries,
        }

        self.assertDictEqual(expected_header, builder.header)
예제 #4
0
 def test_should_fail_at_seq_with_different_length_to_reference(self):
     """Expect weCallException for a 2-char sequence on a 4-char reference."""
     # Given
     bank = SequenceBank(ReferenceChromosome("AAAA"))
     mismatched_seq = "CC"
     # Then
     with self.assertRaises(weCallException):
         bank.add_sequence(mismatched_seq)
예제 #5
0
 def add_sample_name(self, sample_name):
     """Register a new sample and return its freshly created SequenceBank.

     The bank is built on ``self.reference`` and stored under
     ``sample_name``.

     Raises:
         weCallException: if ``sample_name`` has already been registered.
     """
     name_taken = sample_name in self.__samples
     if name_taken:
         message = "Sample {} already exists in the SampleBank.".format(
             sample_name)
         raise weCallException(message)

     new_bank = SequenceBank(self.reference)
     self.__samples[sample_name] = new_bank
     return new_bank
예제 #6
0
    def test_should_use_sample_name_if_available(self):
        """Run the caller on BAM data keyed by 'sample' and check the VCF.

        The output VCF is expected to hold exactly one record and to list
        ``'sample'`` as its only sample.
        """
        chrom = '14'
        # The same reference string feeds both the sequence bank and the
        # driver, so bind it once.
        ref_seq = 'CGGCGGTCGAACGGAGCCCCAAGCGAAGCTCAAAACATGG'

        sequence_bank = SequenceBank(ReferenceChromosome(ref_seq, 0, chrom))
        sequence_bank.add_sequence(
            '      ...........A.............         ',
            n_fwd=10,
            n_rev=10)

        driver = SVCDriver(self)
        driver = driver.with_ref_sequence(ref_seq, chrom=chrom)
        driver = driver.with_bam_data(
            'pi.bam', {'sample': sequence_bank}, True)

        expect = driver.call()

        vcf_expectation = expect.with_output_vcf()
        vcf_expectation.record_count(1).with_samples(['sample'])
예제 #7
0
    def test_can_build_with_defined_quality(self):
        """Build a BAM from a sequence with an explicit quality string.

        One forward read is requested (``n_fwd=1``, ``n_rev=0``); the test
        expects a single read with sequence ``TGTAAAT`` and quality string
        ``{qgHH!!``.
        """
        ref = ReferenceChromosome("TCATAAAT")
        sequence_bank = SequenceBank(ref)
        sequence_bank.add_sequence(".*G.....", "9 87  00", n_fwd=1, n_rev=0)

        builder = BAMBuilder(
            os.path.join(self.work_dir,
                         self.filestub + ".bam")).with_bam_contig_data(
                             self.chrom, self.chrom_length, self.sample_name,
                             sequence_bank)
        builder.build()

        bam_file = pysam.Samfile(builder.filename, "rb")
        try:
            reads = list(bam_file.fetch())
            self.assertEqual(len(reads), 1)
            self.assertEqual(reads[0].seq, "TGTAAAT")

            # ascii: "0": "!", "1": "+", "2": "5", "3": "?", "4": "H",
            # "5": "S", "6": "]", "7": "g", "8": "q", "9": "{"
            expected_qual = "{qgHH!!"
            self.assertEqual(reads[0].qual, expected_qual)
        finally:
            # Close the handle even when an assertion fails, so the test
            # does not leak an open file descriptor.
            bam_file.close()
예제 #8
0
    def setParallelAndSerialVariantCallers(self, copies1, copies2):
        """Prepare the variant caller data for the test to run.

        Writes and indexes a two-chromosome FASTA, builds a BAM with one
        sequence bank per chromosome, builds a shared weCall configuration,
        and stores two ``VariantCallerWrapper`` objects on ``self``
        (``vc_wrapper_parallel`` and ``vc_wrapper_serial``). Also sets
        ``self.repeat_length1`` and ``self.repeat_length2``.

        Args:
            copies1: repeat count applied to the first chromosome's
                reference and sequence strings.
            copies2: repeat count applied to the second chromosome's
                reference and sequence strings.
        """
        stem = "vc_input"

        def in_work_dir(file_name):
            # Every artefact of this setup lives under the test work dir.
            return os.path.join(self.work_dir, file_name)

        fasta_builder = FastaFileBuilder(in_work_dir(stem + ".fa"))
        first_ref = fasta_builder.with_chrom(
            self.chrom1, self.ref_string1 * copies1)
        second_ref = fasta_builder.with_chrom(
            self.chrom2, self.ref_string2 * copies2)

        # NOTE(review): `/` is true division on Python 3, so these values
        # are floats there - confirm that is what callers expect.
        self.repeat_length1 = first_ref.length_minus_deletions() / copies1
        self.repeat_length2 = second_ref.length_minus_deletions() / copies2

        fasta_builder.build()
        fasta_builder.index()

        first_bank = SequenceBank(first_ref)
        first_bank.add_sequence(
            self.seq_string1 * copies1, n_fwd=10, n_rev=10)

        second_bank = SequenceBank(second_ref)
        for seq_string in (self.seq_string2, self.seq_string3):
            second_bank.add_sequence(
                seq_string * copies2, n_fwd=10, n_rev=10)

        bam_builder = BAMBuilder(in_work_dir(stem + ".bam"))
        bam_builder.with_bam_contig_data(
            first_ref.chrom, first_ref.length_minus_deletions(),
            self.sample_name1, first_bank)
        bam_builder.with_bam_contig_data(
            second_ref.chrom, second_ref.length_minus_deletions(),
            self.sample_name2, second_bank)
        bam_builder.build()

        input_data = WecallInputData(
            [bam_builder.filename], fasta_builder.filename)
        config_builder = WecallConfigBuilder(input_data, in_work_dir(stem))
        config_options = (
            ("maxBlockSize", self.block_size),
            ("noSimilarReadsFilter", False),
            ("maxClusterDist", 20),
        )
        for option_name, option_value in config_options:
            config_builder.with_configuration(option_name, option_value)
        shared_config = config_builder.build()

        # Both wrappers share one configuration; only the output stem differs.
        parallel_stem = in_work_dir(stem + "_parallel")
        serial_stem = in_work_dir(stem + "_serial")

        self.vc_wrapper_parallel = VariantCallerWrapper(
            parallel_stem, shared_config)
        self.vc_wrapper_serial = VariantCallerWrapper(
            serial_stem, shared_config)
예제 #9
0
    def test_can_build_two_chroms(self):
        """Build a BAM holding two contigs and fetch reads per region.

        Contig ``1`` and contig ``X`` each receive one sequence for sample
        ``SAMPLE``. The test expects one read per contig, two reads overall
        in contig order, a ``ValueError`` when fetching from the unknown
        contig ``2``, and both the BAM file and its ``.bai`` index on disk.
        """
        ref1 = ReferenceChromosome("TCATAAAAAAAT")
        sequence_bank1 = SequenceBank(ref1)
        sequence_bank1.add_sequence(".*G.........")

        ref2 = ReferenceChromosome("GGGG")
        sequence_bank2 = SequenceBank(ref2)
        sequence_bank2.add_sequence("..*.")

        builder = BAMBuilder(
            os.path.join(self.work_dir,
                         self.filestub + ".bam")).with_bam_contig_data(
                             "1", 100, "SAMPLE",
                             sequence_bank1).with_bam_contig_data(
                                 "X", 50, "SAMPLE", sequence_bank2)
        builder.build()

        # A single handle serves every fetch below; the original reopened
        # the file and orphaned the first handle without closing either.
        bam_file = pysam.Samfile(builder.filename, "rb")
        try:
            reads_chrom1 = list(bam_file.fetch(region="1:1-20"))
            self.assertEqual(len(reads_chrom1), 1)
            self.assertEqual(reads_chrom1[0].seq, "TGTAAAAAAAT")

            reads_chrom2 = list(bam_file.fetch(region="X:1-5"))
            self.assertEqual(len(reads_chrom2), 1)
            self.assertEqual(reads_chrom2[0].seq, "GGG")

            reads = list(bam_file.fetch())
            self.assertEqual(len(reads), 2)
            self.assertEqual(reads[0].seq, "TGTAAAAAAAT")
            self.assertEqual(reads[1].seq, "GGG")

            self.assertRaises(ValueError, bam_file.fetch, region="2:1-20")

            self.assertTrue(os.path.isfile(bam_file.filename))
            self.assertTrue(
                os.path.isfile(bam_file.filename.decode() + ".bai"))
        finally:
            # Close the handle even when an assertion fails, so the test
            # does not leak an open file descriptor.
            bam_file.close()