예제 #1
0
    def test_duplicate_position(self):
        """Run analyze_position with a duplicate-scan call of '1'.

        Nothing is asserted about the returned value; the test only requires
        that the call completes without raising.
        """

        def dash_position(call):
            # Coverage and proportion are both given as '-' in this test.
            return Position(call=call, coverage='-', proportion='-')

        self.analysis.analyze_position(
            dash_position('G'),  # reference position
            dash_position('1'),  # duplicate-scan position
            ((dash_position('G'), ), ))  # one group holding one sample
예제 #2
0
    def test_varscan_call_cannot_be_made(self):
        """
        VarScan may emit a record with an ALT value even when no call could
        be made. Such a record must still be reported as missing (X).

        TODO: Add See Also VarScan documentation
        """

        contig_name = 'gi|561108321|ref|NC_018143.2|'
        format_column = 'GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR'

        # Taken from a SRR011186 sample processed with bwamem and varscan.
        # The source data positions were 34072-34074.
        vcf_rows = (
            ('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
             'FORMAT', 'SRR011186'),
            (contig_name, '1', '.', 'GC', 'C', '.', 'PASS',
             'ADP=114;WT=0;HET=0;HOM=1;NC=0', format_column,
             '1/1:255:114:114:6:108:94.74%:1.6043E-58:40:38:2:4:46:62'),
            # The GT column is './.', so this record is expected to be missing.
            (contig_name, '2', '.', 'C', 'G', '.', 'PASS',
             'ADP=108;WT=1;HET=0;HOM=0;NC=0', format_column,
             './.:.:108'),
            (contig_name, '3', '.', 'A', '.', '.', 'PASS',
             'ADP=112;WT=1;HET=0;HOM=0;NC=0', format_column,
             '0/0:209:112:112:111:0:0%:1E0:38:0:47:64:0:0'),
        )
        # Tab-separated columns, one newline-terminated line per record.
        vcf_data = "".join("\t".join(row) + "\n" for row in vcf_rows)

        expected = (
            Position(call='G', simple_call='G', coverage=114,
                     proportion=0.05263157894736842),
            Position(call='X', simple_call='N', coverage=108.0,
                     proportion='-'),
            Position(call='A', simple_call='A', coverage=112,
                     proportion=0.9910714285714286),
        )

        with tempfile.NamedTemporaryFile('w+') as vcf_file:
            # Fill the temporary file with the records and rewind it.
            vcf_file.write(vcf_data)
            vcf_file.seek(0)

            # Look up the contig under test.
            vcf = Vcf(vcf_file.name, 'SRR011186', 'varscan', 'bwamem')
            contig = vcf.get_contig(contig_name)
            positions = contig.positions
            self.assertIsInstance(contig, VcfContig)

            # Compare position by position. `expected` comes first in zip()
            # so that no extra item is pulled from `positions` when the
            # expected values run out.
            compared = 0
            for compared, (expect, observe) in enumerate(
                    zip(expected, positions), start=1):
                self.assertEqual(expect, observe)

            # Every expected position was yielded.
            self.assertEqual(compared, len(expected))

            # Anything after the last record is the empty position.
            self.assertEqual(VcfContig.VCF_EMPTY_POSITION, next(positions))
예제 #3
0
    def test_sample_vcfs_return_infinite_positions(self):
        """A contig yields its positions, then empty positions indefinitely.

        NOTE(review): the aligner/snpcaller strings here appear in the
        opposite order to test_varscan_call_cannot_be_made ('varscan' then
        'bwamem') -- confirm which order Vcf expects.
        """

        def called(base, depth):
            # A called base with no proportion information.
            return Position(call=base, simple_call=base, coverage=depth,
                            proportion='-')

        def uncalled():
            # A position for which neither coverage nor proportion is known.
            return Position(call='X', simple_call='N', coverage='?',
                            proportion='?')

        vcf = Vcf(testdata.GATK_VCF, 'test_name', 'test_aliner',
                  'test_snpcaller')
        contig = vcf.get_contig('500WT1_test')
        positions = contig.positions

        expected = (
            called('C', 19049),
            called('C', 19049),
            called('T', 18824),
            called('G', 18804),
            uncalled(),
            uncalled(),
            called('G', 18895),
            called('A', 19005),
        )

        # Every contig position is yielded, in order. `expected` leads the
        # zip() so the generator is not advanced past the compared items.
        compared = 0
        for compared, (expect, observe) in enumerate(
                zip(expected, positions), start=1):
            self.assertEqual(expect, observe)
        self.assertEqual(len(expected), compared)

        # Once the contig is exhausted, empty positions keep coming.
        for _ in range(2):
            self.assertEqual(VcfContig.VCF_EMPTY_POSITION, next(positions))
예제 #4
0
    def test_call_normalizes_to_uppercase(self):
        """Position.call is the upper-cased input; '.' is left as it is."""
        cases = (
            ('g', 'G'),
            ('a', 'A'),
            ('t', 'T'),
            ('c', 'C'),
            ('d', 'D'),
            ('x', 'X'),
            ('.', '.'),
        )

        for raw_call, wanted in cases:
            position = Position(call=raw_call, coverage='-', proportion='-')
            self.assertEqual(wanted, position.call)
예제 #5
0
    def test_simple_call_normalizes_to_uppercase_and_masks_degeneracies_with_N(
            self):
        """Position.simple_call upper-cases g/a/t/c and reports N otherwise."""
        cases = (
            ('g', 'G'),
            ('a', 'A'),
            ('t', 'T'),
            ('c', 'C'),
            # Anything other than the four bases is expected to be masked.
            ('d', 'N'),
            ('x', 'N'),
            ('.', 'N'),
        )

        for raw_call, wanted in cases:
            position = Position(call=raw_call, coverage='-', proportion='-')
            self.assertEqual(wanted, position.simple_call)
예제 #6
0
    def test_unique_position(self):
        """Run analyze_position with a duplicate-scan call of '0'.

        Nothing is asserted about the returned value; the test only requires
        that the call completes without raising.
        """

        def dash_position(call):
            # Coverage and proportion are both given as '-' in this test.
            return Position(call=call, coverage='-', proportion='-')

        reference = dash_position('G')
        # Unique position
        dups = dash_position('0')
        sample_group = (
            # Single Nucleotide Monomorphism
            dash_position('G'),
            # Single Nucleotide Polymorphism
            dash_position('A'),
        )

        self.analysis.analyze_position(reference, dups, (sample_group, ))
예제 #7
0
    def test_no_duplicate_information_position(self):
        """
        Scenario: The reference was not scanned for duplicate positions
        (the duplicate-scan call is 'X').
        As a result, it is assumed all positions passed.

        The asserted result pins, among other fields:
        - is_reference_duplicated is False
        - is_missing_matrix is True
        - is_all_quality_breadth is False
        - top-level called_reference is 1 and called_snp is 1

        NOTE(review): this docstring previously stated that
        is_all_quality_breadth is True, which contradicts the asserted value
        below -- confirm which one is intended.
        """

        reference_position = Position(call='G', coverage='-', proportion='-')

        dups_position = Position(
            # No duplicate information
            call='X',
            coverage='-',
            proportion='-')

        samples = (
            (
                # Single Nucleotide Monomorphism
                Position(call='G', coverage='-', proportion='-'),
                # Single Nucleotide Polymorphism
                Position(call='A', coverage='-', proportion='-'),
            ), )

        # Only one expected value is built: an earlier, immediately
        # overwritten `expected` assignment was dead code and is gone.
        expected = PositionInfo(
            is_all_called=True,
            is_reference_clean=True,
            is_reference_duplicated=False,
            is_all_passed_coverage=True,
            is_all_passed_proportion=True,
            is_all_passed_consensus=False,
            is_all_quality_breadth=False,
            is_best_snp=False,
            all_sample_stats=[
                [
                    Counter({
                        'quality_breadth': 1,
                        'called_reference': 1,
                        'called_snp': 1,
                        'passed_coverage_filter': 1,
                        'passed_proportion_filter': 1,
                        'was_called': 1,
                        'called_degen': 0
                    }),
                    Counter({
                        'quality_breadth': 1,
                        'passed_coverage_filter': 1,
                        'passed_proportion_filter': 1,
                        'was_called': 1,
                        'called_reference': 0,
                        'called_snp': 0,
                        'called_degen': 0
                    }),
                ],
                [
                    Counter({
                        'quality_breadth': 1,
                        'called_reference': 1,
                        'called_snp': 1,
                        'passed_coverage_filter': 1,
                        'passed_proportion_filter': 1,
                        'was_called': 1,
                        'called_degen': 0
                    }),
                    Counter({
                        'quality_breadth': 1,
                        'passed_coverage_filter': 1,
                        'passed_proportion_filter': 1,
                        'was_called': 1,
                        'called_reference': 0,
                        'called_snp': 0,
                        'called_degen': 0
                    }),
                    Counter({
                        'quality_breadth': 1,
                        'called_reference': 1,
                        'passed_coverage_filter': 1,
                        'passed_proportion_filter': 1,
                        'was_called': 1,
                        'called_snp': 0,
                        'called_degen': 0
                    }),
                    Counter({
                        'quality_breadth': 1,
                        'called_snp': 1,
                        'passed_coverage_filter': 1,
                        'passed_proportion_filter': 1,
                        'was_called': 1,
                        'called_reference': 0,
                        'called_degen': 0
                    }),
                ],
            ],
            is_missing_matrix=True,
            called_reference=1,
            called_snp=1,
            passed_coverage_filter=2,
            passed_proportion_filter=2,
            num_A=1,
            num_C=0,
            num_G=1,
            num_T=0,
            num_N=0,
            call_str=['G', 'G', 'A'],
            masked_call_str=['G', 'G', 'A'],
            CallWasMade='YY',
            PassedDepthFilter='--',
            PassedProportionFilter='--',
            Pattern=['1', '1', '2'])
        self.assertEqual(
            expected,
            self.analysis.analyze_position(reference_position, dups_position,
                                           samples))
예제 #8
0
    def test_fasta_position(self):
        """Two samples that both match the reference at a '0' dups position.

        The expected result has every is_all_* flag set to True, no SNP
        calls, and the same per-sample statistics in every slot.
        """

        def dash_position(call):
            # Coverage and proportion are both given as '-' in this test.
            return Position(call=call, coverage='-', proportion='-')

        def matching_stats():
            # A fresh Counter per slot; every slot holds the same values.
            return Counter({
                'called_reference': 1,
                'passed_coverage_filter': 1,
                'quality_breadth': 1,
                'was_called': 1,
                'passed_proportion_filter': 1,
                'called_snp': 0,
                'called_degen': 0
            })

        reference = dash_position('G')
        dups = dash_position('0')
        sample_groups = ((dash_position('G'), dash_position('G')), )

        expected = PositionInfo(
            is_all_called=True,
            is_reference_clean=True,
            is_reference_duplicated=False,
            is_all_passed_coverage=True,
            is_all_passed_proportion=True,
            is_all_passed_consensus=True,
            is_all_quality_breadth=True,
            is_best_snp=False,
            all_sample_stats=[
                [matching_stats() for _ in range(2)],
                [matching_stats() for _ in range(4)],
            ],
            is_missing_matrix=False,
            called_reference=2,
            called_snp=0,
            passed_coverage_filter=2,
            passed_proportion_filter=2,
            num_A=0,
            num_C=0,
            num_G=2,
            num_T=0,
            num_N=0,
            call_str=['G', 'G', 'G'],
            masked_call_str=['G', 'G', 'G'],
            CallWasMade='YY',
            PassedDepthFilter='--',
            PassedProportionFilter='--',
            Pattern=['1', '1', '1'])

        observed = self.analysis.analyze_position(reference, dups,
                                                  sample_groups)
        self.assertEqual(expected, observed)
예제 #9
0
    def test_duplicate_information_position(self):
        """
        Scenario: The reference was scanned for duplicate positions and this
        position carries the duplicate-scan call '1'.

        The asserted result pins, among other fields:
        - is_reference_duplicated is True
        - is_missing_matrix is False
        - is_all_quality_breadth is False
        - is_all_passed_consensus is False
        - every per-sample Counter has quality_breadth, called_reference and
          called_snp at 0
        """

        def dash_position(call):
            # Coverage and proportion are both given as '-' in this test.
            return Position(call=call, coverage='-', proportion='-')

        def non_quality_stats():
            # A fresh Counter per slot; every slot holds the same values.
            return Counter({
                'passed_coverage_filter': 1,
                'was_called': 1,
                'passed_proportion_filter': 1,
                'called_reference': 0,
                'quality_breadth': 0,
                'called_snp': 0,
                'called_degen': 0
            })

        reference = dash_position('G')
        # Call '1': the expected result reports the reference as duplicated.
        dups = dash_position('1')
        sample_groups = (
            (
                # Single Nucleotide Monomorphism
                dash_position('G'),
                # Single Nucleotide Polymorphism
                dash_position('A'),
            ), )

        expected = PositionInfo(
            is_all_called=True,
            is_reference_clean=True,
            is_reference_duplicated=True,
            is_all_passed_coverage=True,
            is_all_passed_proportion=True,
            is_all_passed_consensus=False,
            is_all_quality_breadth=False,
            is_best_snp=False,
            all_sample_stats=[
                [non_quality_stats() for _ in range(2)],
                [non_quality_stats() for _ in range(4)],
            ],
            is_missing_matrix=False,
            called_reference=1,
            called_snp=0,
            passed_coverage_filter=2,
            passed_proportion_filter=2,
            num_A=1,
            num_C=0,
            num_G=1,
            num_T=0,
            num_N=0,
            call_str=['G', 'G', 'A'],
            masked_call_str=['G', 'G', 'A'],
            CallWasMade='YY',
            PassedDepthFilter='--',
            PassedProportionFilter='--',
            Pattern=['1', '1', '2'])

        observed = self.analysis.analyze_position(reference, dups,
                                                  sample_groups)
        self.assertEqual(expected, observed)
예제 #10
0
    def setUp(self):
        """Write a two-contig FASTA to a temp file and build contig fixtures.

        Sets self.contigs_expected, self.fasta_file, and four FastaContig
        objects: contig0/contig1 (is_reference=False) and
        ref_contig0/ref_contig1 (is_reference=True).
        """
        # Source fasta values. Both contigs carry the same two sequence lines.
        sequence_lines = "GATC\n" "GGAA\n"
        # No gap after the first contig.
        contig0_text = ">contig0\n" + sequence_lines
        # A blank line (gap) follows the second contig.
        contig1_text = ">contig1\n" + sequence_lines + "\n"
        # TODO: cover a contig without line breaks (">contig2\n" "GATCGGAA").
        # TODO: >80 characters contig?
        contigs = (contig0_text, contig1_text)

        def expected_positions():
            # One Position per base of the two sequence lines, in file order.
            return tuple(
                Position(call=base, coverage='-', proportion='-')
                for base in 'GATCGGAA')

        # Expected values. file_position is the offset just past the
        # contig's header line within the concatenated file content.
        self.contigs_expected = (
            {
                'name': 'contig0',
                'file_position': 0 + len('>contig0\n'),
                'positions': expected_positions(),
            },
            {
                'name': 'contig1',
                'file_position': len(contigs[0]) + len('>contig1\n'),
                'positions': expected_positions(),
            },
        )

        # Create a mock fasta on disk and rewind it.
        # NOTE(review): delete=False and no cleanup is visible in this
        # method -- confirm the file is removed elsewhere (e.g. tearDown).
        self.fasta_file = tempfile.NamedTemporaryFile(mode='w+', delete=False)
        self.fasta_file.write("".join(contigs))
        self.fasta_file.seek(0)

        def build_contig(spec, is_reference):
            # The contig length is the number of expected positions.
            return FastaContig(spec['name'],
                               len(spec['positions']),
                               spec['file_position'],
                               self.fasta_file.name,
                               is_reference=is_reference)

        first_spec, second_spec = self.contigs_expected

        # Instantiate test contigs: sample flavour first, then reference.
        self.contig0 = build_contig(first_spec, False)
        self.contig1 = build_contig(second_spec, False)
        self.ref_contig0 = build_contig(first_spec, True)
        self.ref_contig1 = build_contig(second_spec, True)
예제 #11
0
 def test_multi_base_call_raises_exception(self):
     """Constructing a Position from a multi-base call must raise."""
     # TODO: use appropriate exception
     self.assertRaises(Exception, Position,
                       call='gatc', coverage='-', proportion='-')