Пример #1
0
    def test_write_counts_file(self):
        """BarcodeCounter: write counts to a file

        Counts eight barcodes spread over three lanes, writes them out
        with ``BarcodeCounter.write`` and compares the resulting file
        with the expected tab-delimited listing.
        """
        # Populate a counter: (barcode, lane, number of reads)
        self._make_working_dir()
        bc = BarcodeCounter()
        bc.count_barcode("TATGCGCGGTA", lane=1, incr=285302)
        bc.count_barcode("TATGCGCGGTG", lane=1, incr=532)
        bc.count_barcode("ACCTACCGGTA", lane=1, incr=315)
        bc.count_barcode("CCCTTATGCGA", lane=1, incr=22)
        bc.count_barcode("ACCTAGCGGTA", lane=2, incr=477)
        bc.count_barcode("ACCTCTATGCT", lane=2, incr=368)
        bc.count_barcode("ACCCTNCGGTA", lane=3, incr=312)
        bc.count_barcode("ACCTTATGCGC", lane=3, incr=248)
        # Write the counts file
        counts_file = os.path.join(self.wd, "out.counts")
        bc.write(counts_file)
        # Expected content: a header line plus one line per barcode, every
        # line terminated by a newline. The tab separators are written as
        # explicit escapes so they cannot be silently turned into spaces
        # by an editor.
        expected_rows = (
            ("#Lane", "Rank", "Sequence", "Count"),
            ("1", "1", "TATGCGCGGTA", "285302"),
            ("1", "2", "TATGCGCGGTG", "532"),
            ("1", "3", "ACCTACCGGTA", "315"),
            ("1", "4", "CCCTTATGCGA", "22"),
            ("2", "1", "ACCTAGCGGTA", "477"),
            ("2", "2", "ACCTCTATGCT", "368"),
            ("3", "1", "ACCCTNCGGTA", "312"),
            ("3", "2", "ACCTTATGCGC", "248"),
        )
        expected_contents = "".join(
            ["\t".join(row) + "\n" for row in expected_rows])
        self.assertTrue(os.path.exists(counts_file))
        # Read inside a context manager so the handle is always closed,
        # even when the comparison below fails
        with open(counts_file, 'r') as fp:
            self.assertEqual(fp.read(), expected_contents)
Пример #2
0
 def test_count_fastq_sequences(self):
     """BarcodeCounter: count barcode sequences

     Registers reads one at a time and then checks the barcode list,
     the lanes, per-barcode counts, read totals and barcode lengths.
     """
     # Initialise counter object
     bc = BarcodeCounter()
     # Reads to register: (lane, barcode sequence, number of reads)
     reads = (
         (1, "AGGCAGAATCTTACGC", 102),
         (1, "TCCTGAGCTCTTACGC", 10),
         (1, "ACAGTGATTCTTTCCC", 3),
         (1, "ATGCTCGTCTCGCATC", 1),
         (2, "CGTACTAGTCTTACGC", 95),
         (2, "ATGTCAGATCTTTCCC", 29),
         (2, "AGGCAGAATCTTACGC", 12),
         (2, "CAGATCATTCTTTCCC", 6),
         (3, "GGACTCCTTCTTACGC", 75),
         (3, "ACCGATTCGCGCGTAG", 74),
         (3, "CCAGCAATATCGCGAG", 2),
         (3, "CCGCGTAAGCAATAGA", 1),
     )
     # Populate with sequences: each read is registered by its own call
     # (no 'incr' argument). 'range' is used rather than 'xrange' so the
     # loop runs unchanged on both Python 2 and Python 3.
     for lane, seq, nreads in reads:
         for _ in range(nreads):
             bc.count_barcode(seq, lane=lane)
     # Check contents
     self.assertEqual(bc.barcodes(), [
         "AGGCAGAATCTTACGC", "CGTACTAGTCTTACGC", "GGACTCCTTCTTACGC",
         "ACCGATTCGCGCGTAG", "ATGTCAGATCTTTCCC", "TCCTGAGCTCTTACGC",
         "CAGATCATTCTTTCCC", "ACAGTGATTCTTTCCC", "CCAGCAATATCGCGAG",
         "ATGCTCGTCTCGCATC", "CCGCGTAAGCAATAGA"
     ])
     # Lanes
     self.assertEqual(bc.lanes, [1, 2, 3])
     # Counts for individual barcodes (overall and per lane)
     self.assertEqual(bc.counts("AGGCAGAATCTTACGC"), 114)
     self.assertEqual(bc.counts("AGGCAGAATCTTACGC", lane=1), 102)
     self.assertEqual(bc.counts("AGGCAGAATCTTACGC", lane=2), 12)
     self.assertEqual(bc.counts("AGGCAGAATCTTACGC", lane=3), 0)
     self.assertEqual(bc.counts_all("AGGCAGAATCTTACGC"), 114)
     self.assertEqual(bc.counts("CCGCGTAAGCAATAGA"), 1)
     self.assertEqual(bc.counts("CCGCGTAAGCAATAGA", lane=1), 0)
     self.assertEqual(bc.counts("CCGCGTAAGCAATAGA", lane=2), 0)
     self.assertEqual(bc.counts("CCGCGTAAGCAATAGA", lane=3), 1)
     self.assertEqual(bc.counts_all("CCGCGTAAGCAATAGA"), 1)
     # Read counts (overall and per lane)
     self.assertEqual(bc.nreads(), 410)
     self.assertEqual(bc.nreads(1), 116)
     self.assertEqual(bc.nreads(2), 142)
     self.assertEqual(bc.nreads(3), 152)
     # Lengths (overall and per lane)
     self.assertEqual(bc.barcode_lengths(), [16])
     self.assertEqual(bc.barcode_lengths(1), [16])
     self.assertEqual(bc.barcode_lengths(2), [16])
     self.assertEqual(bc.barcode_lengths(3), [16])
Пример #3
0
    def test_analyse_groups_with_sample_sheet(self):
        """BarcodeCounter: perform analysis with grouping and samplesheet
        """
        # Create sample sheet
        sample_sheet_file = self._make_file(
            "SampleSheet.csv", """[Data]
Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,Sample_Project,Description
1,SMPL1,,,,A006,CATGCGCGGTA,,
1,SMPL2,,,,A012,GCTGCGCGGTC,,
2,SMPL3,,,,A005,ACAGTGCGGTA,,
2,SMPL4,,,,A019,GTGAAACGGTC,,
""")
        # Set up barcode counts
        bc = BarcodeCounter()
        bc.count_barcode("TATGCGCGGTA", lane=1, incr=285302)
        bc.count_barcode("CATGCGCGGTA", lane=1, incr=8532)
        bc.count_barcode("GATGCGCGGTA", lane=1, incr=5321)
        bc.count_barcode("GCTGCGCGGTA", lane=1, incr=7853)
        bc.count_barcode("GCTGCGCGGTC", lane=1, incr=325394)
        analysis = bc.analyse(lane=1,
                              mismatches=2,
                              sample_sheet=sample_sheet_file)
        ##"CATGCGCGGTA","TATGCGCGGTA","GATGCGCGGTA","GCTGCGCGGTA" = 307008
        ##"GCTGCGCGGTC" = 325394
        self.assertEqual(analysis.cutoff, None)
        self.assertEqual(analysis.mismatches, 2)
        self.assertEqual(analysis.total_reads, 632402)
        self.assertEqual(analysis.coverage, 632402)
        self.assertEqual(analysis.barcodes, ["GCTGCGCGGTC", "CATGCGCGGTA"])
        self.assertEqual(analysis.counts["GCTGCGCGGTC"].reads, 325394)
        self.assertEqual(analysis.counts["CATGCGCGGTA"].reads, 307008)
        self.assertEqual(analysis.counts["GCTGCGCGGTC"].sample, "SMPL2")
        self.assertEqual(analysis.counts["CATGCGCGGTA"].sample, "SMPL1")
        self.assertEqual(analysis.counts["GCTGCGCGGTC"].sequences, 1)
        self.assertEqual(analysis.counts["CATGCGCGGTA"].sequences, 4)
Пример #4
0
 def test_analyse_with_cutoff(self):
     """BarcodeCounter: perform analysis with cutoff

     Analyses lane 1 with a cutoff of 0.013 and checks the summary
     values plus the per-barcode reads, sample names and sequence
     counts of the barcodes that are reported.
     """
     # Barcode counts, all registered against lane 1
     lane1_reads = (
         ("TATGCGCGGTA", 285302),
         ("CATGCGCGGTA", 8532),
         ("GATGCGCGGTA", 5321),
         ("GCTGCGCGGTA", 7853),
         ("GCTGCGCGGTC", 325394),
     )
     bc = BarcodeCounter()
     for barcode, nreads in lane1_reads:
         bc.count_barcode(barcode, lane=1, incr=nreads)
     analysis = bc.analyse(lane=1, cutoff=0.013)
     # Barcodes expected in the analysis, in reported order
     reported = ["GCTGCGCGGTC", "TATGCGCGGTA", "CATGCGCGGTA"]
     # Summary values (the expected coverage, 619228, equals the summed
     # reads of the three reported barcodes)
     self.assertEqual(analysis.cutoff, 0.013)
     self.assertEqual(analysis.mismatches, 0)
     self.assertEqual(analysis.total_reads, 632402)
     self.assertEqual(analysis.coverage, 619228)
     self.assertEqual(analysis.barcodes, reported)
     # Per-barcode values, checked one attribute at a time; each tuple
     # of values lines up with 'reported'
     expected = (
         ("reads", (325394, 285302, 8532)),
         ("sample", (None, None, None)),
         ("sequences", (1, 1, 1)),
     )
     for attr, values in expected:
         for barcode, value in zip(reported, values):
             self.assertEqual(getattr(analysis.counts[barcode], attr),
                              value)
Пример #5
0
 def test_filter_barcodes(self):
     """BarcodeCounter: check filtering by lane and cutoff

     Checks ``filter_barcodes`` with no arguments, with a lane only,
     with a cutoff only, and with both a cutoff and a lane.
     """
     # Four barcodes in lane 1 and two in lane 2
     bc = BarcodeCounter()
     bc.count_barcode("TATGCGCGGTA", lane=1, incr=285302)
     bc.count_barcode("TATGCGCGGTG", lane=1, incr=532)
     bc.count_barcode("ACCTACCGGTA", lane=1, incr=315)
     bc.count_barcode("CCCTTATGCGA", lane=1, incr=22)
     bc.count_barcode("ACCTAGCGGTA", lane=2, incr=477)
     bc.count_barcode("ACCTCTATGCT", lane=2, incr=368)
     self.assertEqual(bc.barcodes(), [
         "TATGCGCGGTA", "TATGCGCGGTG", "ACCTAGCGGTA", "ACCTCTATGCT",
         "ACCTACCGGTA", "CCCTTATGCGA"
     ])
     # No filtering: same list as barcodes()
     self.assertEqual(bc.filter_barcodes(), [
         "TATGCGCGGTA", "TATGCGCGGTG", "ACCTAGCGGTA", "ACCTCTATGCT",
         "ACCTACCGGTA", "CCCTTATGCGA"
     ])
     # Filter by lane
     # (no trailing comma after the call: a comma here would turn the
     # statement into a throwaway one-element tuple)
     self.assertEqual(
         bc.filter_barcodes(lane=1),
         ["TATGCGCGGTA", "TATGCGCGGTG", "ACCTACCGGTA", "CCCTTATGCGA"])
     self.assertEqual(bc.filter_barcodes(lane=2),
                      ["ACCTAGCGGTA", "ACCTCTATGCT"])
     # Filter by cutoff, on its own and combined with a lane
     self.assertEqual(bc.filter_barcodes(cutoff=0.5), [
         "TATGCGCGGTA",
     ])
     self.assertEqual(bc.filter_barcodes(cutoff=0.0015, lane=1),
                      ["TATGCGCGGTA", "TATGCGCGGTG"])
     self.assertEqual(bc.filter_barcodes(cutoff=0.5, lane=2), [
         "ACCTAGCGGTA",
     ])
Пример #6
0
def count_barcodes(fastqs):
    """
    Count the barcodes from multiple fastqs

    Prints a short progress message (the number of files, then the
    base name of each file as it is processed) and registers the
    index sequence of every read against that read's flowcell lane.

    Arguments:
      fastqs: sized collection of paths to the Fastq files to read

    Returns:
      BarcodeCounter holding the counts from all the files.

    """
    nfastqs = len(fastqs)
    # A parenthesised single-argument print produces identical output
    # as a Python 2 print statement and is also valid Python 3
    print("Reading in %s fastq%s" % (nfastqs,
                                     ('' if nfastqs == 1 else 's')))
    counts = BarcodeCounter()
    for fq in fastqs:
        print("%s" % os.path.basename(fq))
        for r in FastqIterator(fq):
            seq = r.seqid.index_sequence
            # The lane is converted to an integer before being counted
            lane = int(r.seqid.flowcell_lane)
            counts.count_barcode(seq, lane)
    return counts
Пример #7
0
    def test_report_barcodes(self):
        """report_barcodes: check output for mismatches and sample sheet
        """
        # Create sample sheet
        sample_sheet_file = self._make_file(
            "SampleSheet.csv", """[Data]
Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,Sample_Project,Description
1,SMPL1,,,,A006,CATGCGCGGTA,,
1,SMPL2,,,,A012,GCTGCGCGGTC,,
""")
        # Set up barcode counts
        bc = BarcodeCounter()
        bc.count_barcode("TATGCGCGGTA", lane=1, incr=285302)
        bc.count_barcode("CATGCGCGGTA", lane=1, incr=8532)
        bc.count_barcode("GATGCGCGGTA", lane=1, incr=5321)
        bc.count_barcode("GCTGCGCGGTA", lane=1, incr=7853)
        bc.count_barcode("GCTGCGCGGTC", lane=1, incr=325394)
        analysis = bc.analyse(lane=1,
                              mismatches=2,
                              sample_sheet=sample_sheet_file)
        ##"CATGCGCGGTA","TATGCGCGGTA","GATGCGCGGTA","GCTGCGCGGTA" = 307008
        ##"GCTGCGCGGTC" = 325394
        self.assertEqual(analysis.cutoff, None)
        self.assertEqual(analysis.mismatches, 2)
        self.assertEqual(analysis.total_reads, 632402)
        self.assertEqual(analysis.coverage, 632402)
        self.assertEqual(analysis.barcodes, ["GCTGCGCGGTC", "CATGCGCGGTA"])
        self.assertEqual(analysis.counts["GCTGCGCGGTC"].reads, 325394)
        self.assertEqual(analysis.counts["CATGCGCGGTA"].reads, 307008)
        self.assertEqual(analysis.counts["GCTGCGCGGTC"].sample, "SMPL2")
        self.assertEqual(analysis.counts["CATGCGCGGTA"].sample, "SMPL1")
        self.assertEqual(analysis.counts["GCTGCGCGGTC"].sequences, 1)
        self.assertEqual(analysis.counts["CATGCGCGGTA"].sequences, 4)
        # Create report
        reporter = report_barcodes(bc,
                                   lane=1,
                                   mismatches=2,
                                   sample_sheet=sample_sheet_file)
        # Check content
        self.assertEqual(
            str(reporter), """Barcode analysis for lane #1
============================
 * Barcodes have been grouped by allowing 2 mismatches

#Rank	Index	Sample	N_seqs	N_reads	%reads	(%Total_reads)
    1	GCTGCGCGGTC	SMPL2	1	325394	51.5%	(51.5%)
    2	CATGCGCGGTA	SMPL1	4	307008	48.5%	(100.0%)""")
Пример #8
0
 def test_analyse_groups(self):
     """BarcodeCounter: perform analysis with grouping

     Analyses lane 1 with one mismatch allowed (no sample sheet) and
     checks the summary values plus the per-group reads, sample names
     and sequence counts.
     """
     # Barcode counts, all registered against lane 1
     lane1_reads = (
         ("TATGCGCGGTA", 285302),
         ("CATGCGCGGTA", 8532),
         ("GATGCGCGGTA", 5321),
         ("GCTGCGCGGTA", 7853),
         ("GCTGCGCGGTC", 325394),
     )
     bc = BarcodeCounter()
     for barcode, nreads in lane1_reads:
         bc.count_barcode(barcode, lane=1, incr=nreads)
     analysis = bc.analyse(lane=1, mismatches=1)
     # Expected grouping:
     #   TATGCGCGGTA + CATGCGCGGTA + GATGCGCGGTA = 299155
     #   GCTGCGCGGTC + GCTGCGCGGTA               = 333247
     top, second = "GCTGCGCGGTC", "TATGCGCGGTA"
     # Summary values
     self.assertEqual(analysis.cutoff, None)
     self.assertEqual(analysis.mismatches, 1)
     self.assertEqual(analysis.total_reads, 632402)
     self.assertEqual(analysis.coverage, 632402)
     self.assertEqual(analysis.barcodes, [top, second])
     # Per-barcode values, checked one attribute at a time
     expected = (
         ("reads", ((top, 333247), (second, 299155))),
         ("sample", ((top, None), (second, None))),
         ("sequences", ((top, 2), (second, 3))),
     )
     for attr, per_barcode in expected:
         for barcode, value in per_barcode:
             self.assertEqual(getattr(analysis.counts[barcode], attr),
                              value)
Пример #9
0
 def test_group(self):
     """BarcodeCounter: check grouping of barcode sequences

     Calls ``group`` four times in a row (all lanes with 2 mismatches,
     all lanes with 1 mismatch, lane 1 with 1 mismatch, then all lanes
     with 2 mismatches again) and checks the reference, member
     sequences and counts of every group returned.
     """
     # Barcode counts: (barcode, lane, number of reads)
     reads = (
         ("TATGCGCGGTA", 1, 285302),
         ("CATGCGCGGTA", 1, 8532),
         ("GATGCGCGGTA", 1, 5321),
         ("GCTGCGCGGTA", 1, 7853),
         ("GCTGCGCGGTC", 1, 325394),
         ("GTCACGCGGTA", 2, 296201),
         ("GTCACGCGGTT", 2, 2853),
         ("GTCACGCTGTT", 2, 278539),
     )
     bc = BarcodeCounter()
     for barcode, lane, nreads in reads:
         bc.count_barcode(barcode, lane=lane, incr=nreads)
     # Expected groups, each given as (reference, sequences, counts)
     # and listed in the order they should be returned
     all_lanes_two_mismatches = (
         ("GTCACGCGGTA",
          ["GTCACGCGGTA", "GTCACGCTGTT", "GTCACGCGGTT"], 577593),
         ("GCTGCGCGGTC",
          ["GCTGCGCGGTC", "GCTGCGCGGTA", "GATGCGCGGTA"], 338568),
         ("TATGCGCGGTA",
          ["TATGCGCGGTA", "CATGCGCGGTA"], 293834),
     )
     all_lanes_one_mismatch = (
         ("GCTGCGCGGTC",
          ["GCTGCGCGGTC", "GCTGCGCGGTA"], 333247),
         ("TATGCGCGGTA",
          ["TATGCGCGGTA", "CATGCGCGGTA", "GATGCGCGGTA"], 299155),
         ("GTCACGCGGTA",
          ["GTCACGCGGTA", "GTCACGCGGTT"], 299054),
         ("GTCACGCTGTT",
          ["GTCACGCTGTT"], 278539),
     )
     lane_one_one_mismatch = (
         ("GCTGCGCGGTC",
          ["GCTGCGCGGTC", "GCTGCGCGGTA"], 333247),
         ("TATGCGCGGTA",
          ["TATGCGCGGTA", "CATGCGCGGTA", "GATGCGCGGTA"], 299155),
     )
     # Scenarios run in sequence: (lane, mismatches, expected groups).
     # The first scenario is repeated at the end, after the other calls.
     scenarios = (
         (None, 2, all_lanes_two_mismatches),
         (None, 1, all_lanes_one_mismatch),
         (1, 1, lane_one_one_mismatch),
         (None, 2, all_lanes_two_mismatches),
     )
     for lane, mismatches, expected in scenarios:
         groups = bc.group(lane, mismatches=mismatches)
         self.assertEqual(len(groups), len(expected))
         for idx, (reference, sequences, counts) in enumerate(expected):
             self.assertEqual(groups[idx].reference, reference)
             self.assertEqual(groups[idx].sequences, sequences)
             self.assertEqual(groups[idx].counts, counts)