Пример #1
0
    def test_barcode_size_diff(self):
        """Extract reads whose 5' barcode regions have different lengths.

        Two reads are written to a FASTQ file: the first starts with a 6-nt
        barcode region, the second with an 8-nt one. With ``mismatches=1``
        they are expected to be assigned to ``self.barcodes5[0]`` and
        ``self.barcodes5[2]`` respectively, and the barcode region must be
        trimmed from both the sequence and the quality string.
        """
        barcodes = demultiplex.prepare_barcodes(self.barcodes5, None)

        # Quality strings are sized from the sequences so every record is a
        # well-formed FASTQ entry (sequence and quality of equal length).
        seq1 = 'TTAATT' + make_sequence(40)
        entry1 = FastqEntry(
            '@header1',
            seq1,
            '+',
            make_quality_scores(len(seq1)),
        )
        seq2 = 'GGACGAGG' + make_sequence(40)
        entry2 = FastqEntry(
            '@header2',
            seq2,
            '+',
            make_quality_scores(len(seq2)),
        )
        fq_fname = self.create_fq_file([entry1, entry2])

        handle = demultiplex._extract(fq_fname,
                                      barcodes,
                                      mismatches=1,
                                      minimum_length=15)

        # First read: 6-nt barcode region removed.
        read1, wbrc, randomer1 = next(handle)
        self.assertEqual(wbrc, self.barcodes5[0])
        self.assertEqual(randomer1, 'TTT')
        self.assertEqual(read1.id, entry1.id)
        self.assertEqual(read1.seq, entry1.seq[6:])
        self.assertEqual(read1.plus, entry1.plus)
        self.assertEqual(read1.qual, entry1.qual[6:])

        # Second read: 8-nt barcode region removed.
        read2, wbrc, randomer2 = next(handle)
        self.assertEqual(wbrc, self.barcodes5[2])
        self.assertEqual(randomer2, 'GGGG')
        self.assertEqual(read2.id, entry2.id)
        self.assertEqual(read2.seq, entry2.seq[8:])
        self.assertEqual(read2.plus, entry2.plus)
        self.assertEqual(read2.qual, entry2.qual[8:])
Пример #2
0
    def test_barcode_size_diff(self):
        """Check ``_extract`` with barcodes of unequal length (6 nt and 7 nt).

        Each read is expected to be assigned to the barcode at the matching
        index, with the barcode region cut from the sequence and the quality
        string and the randomer bases returned separately.
        """
        adapter = 'CCCCCC'
        barcodes = ['NAAANN', 'NNCCCNN']
        records = [
            (
                '@header1',
                'TACATT' + adapter + make_sequence(40, rnd_seed=0),
                '+',
                make_quality_scores(50, rnd_seed=0) + '!J',
            ),
            (
                '@header2',
                'AACCCTT' + adapter + make_sequence(39, rnd_seed=0),
                '+',
                make_quality_scores(50, rnd_seed=0) + '!J',
            ),
        ]

        # Write both records to a temporary FASTQ file.
        fq_fname = get_temp_file_name(extension='fq')
        fq_file = iCount.files.fastq.FastqFile(fq_fname, 'wt')
        for record in records:
            fq_file.write(iCount.files.fastq.FastqEntry(*record))
        fq_file.close()

        # (experiment index, randomer, length of the trimmed barcode region)
        expectations = [
            (0, 'TTT', 6),
            (1, 'AATT', 7),
        ]

        handle = demultiplex._extract(fq_fname, barcodes, mismatches=1)
        for record, (want_id, want_randomer, cut) in zip(records, expectations):
            header, sequence, _, quality = record
            read, exp_id, randomer = next(handle)
            self.assertEqual(exp_id, want_id)
            self.assertEqual(randomer, want_randomer)
            self.assertEqual(read.id, header)
            self.assertEqual(read.seq, sequence[cut:])
            self.assertEqual(read.plus, '+')
            self.assertEqual(read.qual, quality[cut:])
Пример #3
0
    def setUp(self):
        """Create the barcode sets and a three-read FASTQ file for the tests.

        Stores the output directory, adapter, 5'/3' barcode lists, the three
        ``FastqEntry`` fixtures and the FASTQ file (name and closed handle)
        on the test instance.
        """
        warnings.simplefilter("ignore", ResourceWarning)

        self.dir = get_temp_dir()
        self.adapter = 'AAAAAAAAAA'
        self.barcodes5 = ['NNAAAN', 'NGGGN', 'NGGGN']
        self.barcodes3 = ['.', 'NNGGG', 'NCCC']

        # Entry 1 - early-version Illumina header; barcode matches set #1
        # exactly.
        sequence1 = 'GGAAAG' + make_sequence(40) + self.adapter
        self.entry1 = FastqEntry('@header1/1', sequence1, '+', make_quality_scores(56))

        # Entry 2 - header with id and description; one mismatch on the
        # 5' end for barcode set #2.
        sequence2 = 'AGGTA' + make_sequence(40) + 'AAGGG' + self.adapter
        self.entry2 = FastqEntry('@header2 blah', sequence2, '+', make_quality_scores(60))

        # Entry 3 - simple header; one mismatch on the 3' end for barcode
        # set #3.
        sequence3 = 'TGGGT' + make_sequence(40) + 'TACC' + self.adapter
        self.entry3 = FastqEntry('@header3', sequence3, '+', make_quality_scores(59))

        # Persist all three entries, in order, to a temporary FASTQ file.
        self.fq_fname = get_temp_file_name(extension='fq')
        self.fq_file = iCount.files.fastq.FastqFile(self.fq_fname, 'wt')
        self.fq_file.write(self.entry1)
        self.fq_file.write(self.entry2)
        self.fq_file.write(self.entry3)
        self.fq_file.close()
Пример #4
0
    def test_barcode3(self):
        """Extract 3' barcodes from the end of two reads.

        The 3' barcode set registered under ``self.barcodes5[2]`` is passed
        to ``_extract``; each read must lose its last five bases (and the
        matching quality characters) and report the expected barcode and
        randomer.
        """
        prepared = demultiplex.prepare_barcodes(
            self.barcodes5 + [self.barcodes5[2]], self.barcodes3)
        barcodes3 = prepared[self.barcodes5[2]]['barcodes3']

        entries = [
            FastqEntry('@header1', make_sequence(40) + 'TTGGG', '+', make_quality_scores(45)),
            FastqEntry('@header2', make_sequence(40) + 'TCAAA', '+', make_quality_scores(45)),
        ]
        fq_fname = self.create_fq_file(entries)

        handle = demultiplex._extract(
            fq_fname, barcodes3, mismatches=1, minimum_length=15)

        # (index into self.barcodes3, expected randomer) for each read.
        expectations = [(2, 'TT'), (3, 'TC')]
        for entry, (barcode_idx, want_randomer) in zip(entries, expectations):
            read, wbrc, randomer = next(handle)
            self.assertEqual(wbrc, self.barcodes3[barcode_idx])
            self.assertEqual(randomer, want_randomer)
            self.assertEqual(read.id, entry.id)
            self.assertEqual(read.seq, entry.seq[:-5])
            self.assertEqual(read.plus, entry.plus)
            self.assertEqual(read.qual, entry.qual[:-5])
Пример #5
0
    def test_extract_mismatch(self):
        """A read with too many barcode mismatches is reported as 'nomatch'.

        With ``mismatches=0`` the read must be yielded unchanged (no
        trimming, empty randomer) and labelled ``'nomatch'``.
        """
        barcodes = demultiplex.prepare_barcodes(self.barcodes5, None)
        entry = FastqEntry(
            '@header1',
            'TTTTTT' + make_sequence(40),
            '+',
            make_quality_scores(46),
        )
        fq_fname = self.create_fq_file([entry])

        # Materialise the generator and check its length: asserting inside a
        # ``for`` loop would pass silently if nothing were yielded at all.
        extracted = list(demultiplex._extract(fq_fname,
                                              barcodes,
                                              mismatches=0,
                                              minimum_length=15))
        self.assertEqual(len(extracted), 1)

        read, wbrc, randomer = extracted[0]
        self.assertEqual(wbrc, 'nomatch')
        self.assertEqual(randomer, '')
        self.assertEqual(read.id, entry.id)
        self.assertEqual(read.seq, entry.seq)
        self.assertEqual(read.plus, entry.plus)
        self.assertEqual(read.qual, entry.qual)
Пример #6
0
    def test_extract_ok_2(self):
        """A read with one barcode mismatch is assigned to the second barcode.

        With ``mismatches=1`` the read is expected to match
        ``self.barcodes5[1]``; the 6-nt barcode region must be trimmed from
        the sequence and the quality string.
        """
        barcodes = demultiplex.prepare_barcodes(self.barcodes5, None)
        entry = FastqEntry(
            '@header1',
            'TTAGTT' + make_sequence(40),
            '+',
            make_quality_scores(46),
        )
        fq_fname = self.create_fq_file([entry])

        # Materialise the generator and check its length: asserting inside a
        # ``for`` loop would pass silently if nothing were yielded at all.
        extracted = list(demultiplex._extract(fq_fname,
                                              barcodes,
                                              mismatches=1,
                                              minimum_length=15))
        self.assertEqual(len(extracted), 1)

        read, wbrc, randomer = extracted[0]
        self.assertEqual(wbrc, self.barcodes5[1])
        self.assertEqual(randomer, 'TTT')
        self.assertEqual(read.id, entry.id)
        self.assertEqual(read.seq, entry.seq[6:])
        self.assertEqual(read.plus, entry.plus)
        self.assertEqual(read.qual, entry.qual[6:])
Пример #7
0
    def test_extract_mismatch(self):
        """A read with too many barcode mismatches gets experiment id -1.

        With ``mismatches=0`` the read must be yielded unchanged (no
        trimming, empty randomer) and with ``exp_id == -1``.
        """
        barcodes = ['NNAAAN', 'NNCCTN', 'NNACGN']
        adapter = 'CCCCCC'
        data = [
            '@header1', 'TTACTT' + adapter + make_sequence(40, rnd_seed=0),
            '+',
            make_quality_scores(50, rnd_seed=0) + '!J'
        ]
        fq_fname = get_temp_file_name(extension='fq')
        fq_file = iCount.files.fastq.FastqFile(fq_fname, 'wt')
        fq_file.write(iCount.files.fastq.FastqEntry(*data))
        fq_file.close()

        # Materialise the generator and check its length: asserting inside a
        # ``for`` loop would pass silently if nothing were yielded at all.
        extracted = list(demultiplex._extract(fq_fname,
                                              barcodes,
                                              mismatches=0))
        self.assertEqual(len(extracted), 1)

        read, exp_id, randomer = extracted[0]
        self.assertEqual(exp_id, -1)
        self.assertEqual(randomer, '')
        self.assertEqual(read.id, data[0])
        self.assertEqual(read.seq, data[1])
        self.assertEqual(read.plus, '+')
        self.assertEqual(read.qual, data[3])
Пример #8
0
    def setUp(self):
        """Build the genome, annotation and reads fixtures used by the tests.

        Creates an index (``.fai``) and a FASTA file for a single chromosome
        ``'1'`` of length 1000, a GTF annotation with three genes (two on the
        ``+`` strand, one on the ``-`` strand) and a FASTQ file holding reads
        cut out of the generated genome at the positions listed in
        ``self.read_data``, plus one "strange" read stitched together from
        two separate genome regions.
        """
        warnings.simplefilter("ignore", (ResourceWarning, ImportWarning))

        # Make fai and fasta file:
        self.fai_data = {'1': 1000}
        self.fai = make_file_from_list(list(self.fai_data.items()), bedtool=False, extension='fai')

        # NOTE(review): ``random`` must provide ``randint(high, size=...)``,
        # so it is presumably ``numpy.random`` - confirm against the imports.
        random.seed(0)  # pylint:disable=no-member
        # One seed per chromosome. The count is taken from ``self.fai_data``;
        # ``self.fai`` is whatever ``make_file_from_list`` returned (presumably
        # a file name), so its length is unrelated to the chromosome count.
        random_seeds = random.randint(10**5, size=len(self.fai_data))  # pylint:disable=no-member
        self.seqs = {name: make_sequence(size, rnd_seed=rseed) for (name, size), rseed in
                     zip(self.fai_data.items(), random_seeds)}
        self.fasta = make_fasta_file(sequences=self.seqs.values(), headers=self.seqs.keys())

        # Make annotation:
        self.gtf = make_file_from_list(extension='gtf', data=[
            # Gene #1, positive strand:
            ['1', '.', 'gene', '100', '499', '.', '+', '.', attrs(gid='G1')],
            # Transcript #1
            ['1', '.', 'transcript', '100', '249', '.', '+', '.', attrs(gid='G1', tid='T1')],
            ['1', '.', 'exon', '100', '149', '.', '+', '.', attrs(gid='G1', tid='T1', exn=1)],
            ['1', '.', 'exon', '200', '229', '.', '+', '.', attrs(gid='G1', tid='T1', exn=2)],
            ['1', '.', 'CDS', '200', '229', '.', '+', '.', attrs(gid='G1', tid='T1', exn=2)],
            ['1', '.', 'exon', '240', '249', '.', '+', '.', attrs(gid='G1', tid='T1', exn=3)],
            # Transcript #2
            ['1', '.', 'transcript', '240', '499', '.', '+', '.', attrs(gid='G1', tid='T2')],
            ['1', '.', 'exon', '240', '299', '.', '+', '.', attrs(gid='G1', tid='T2', exn=1)],
            ['1', '.', 'CDS', '240', '299', '.', '+', '.', attrs(gid='G1', tid='T2', exn=1)],
            ['1', '.', 'exon', '400', '499', '.', '+', '.', attrs(gid='G1', tid='T2', exn=2)],
            ['1', '.', 'CDS', '400', '499', '.', '+', '.', attrs(gid='G1', tid='T2', exn=2)],

            # Gene #2, positive strand:
            ['1', '.', 'gene', '600', '799', '.', '+', '.', attrs(gid='G2')],
            # Transcript #3
            ['1', '.', 'transcript', '600', '799', '.', '+', '.', attrs(gid='G2', tid='T3')],
            ['1', '.', 'exon', '600', '649', '.', '+', '.', attrs(gid='G2', tid='T3', exn=1)],
            ['1', '.', 'CDS', '600', '649', '.', '+', '.', attrs(gid='G2', tid='T3', exn=1)],
            ['1', '.', 'exon', '750', '799', '.', '+', '.', attrs(gid='G2', tid='T3', exn=2)],
            ['1', '.', 'CDS', '750', '799', '.', '+', '.', attrs(gid='G2', tid='T3', exn=2)],

            # Gene #3, negative strand:
            ['1', '.', 'gene', '800', '899', '.', '-', '.', 'gene_id "G3";'],
            # Transcript #4
            ['1', '.', 'transcript', '800', '899', '.', '-', '.', attrs(gid='G3', tid='T4')],
            ['1', '.', 'exon', '800', '899', '.', '-', '.', attrs(gid='G3', tid='T4', exn=1)],
            ['1', '.', 'CDS', '800', '899', '.', '-', '.', attrs(gid='G3', tid='T4', exn=1)],
        ])

        # Define positions of cross-links:
        self.read_data = [
            # (name, chromosome, sequence_len, xlink_position, strand)
            ('name01:rbc:CCCC', '1', 50, 80, '+'),
            ('name02:rbc:CCCC', '1', 50, 140, '+'),
            ('name03:rbc:AAAA', '1', 50, 142, '+'),
            ('name04:rbc:CCCC', '1', 50, 142, '+'),
            ('name05:rbc:CCCC', '1', 30, 160, '+'),
            ('name06:rbc:CCCC', '1', 30, 163, '+'),
            ('name07:rbc:GGGG', '1', 30, 163, '+'),
            ('name08:rbc:CCCC', '1', 20, 205, '+'),
            ('name09:rbc:CCCC', '1', 50, 235, '+'),
            ('name10:rbc:CCCC', '1', 50, 480, '+'),
            ('name11:rbc:CCCC', '1', 30, 530, '+'),
            ('name12:rbc:CCCC', '1', 30, 610, '+'),
            ('name13:rbc:CCCC', '1', 30, 549, '-'),
        ]
        # Make reads from genome, pointing to specific pre-determined cross-link positions:
        random_seeds = random.randint(10**5, size=len(self.read_data))  # pylint:disable=no-member
        self.reads = get_temp_file_name(extension='fastq')
        with open(self.reads, 'wt') as ofile:
            for (name, chrom, length, xlink, strand), rseed in zip(self.read_data, random_seeds):
                # Forward-strand reads start at the cross-link position ...
                seq = self.seqs[chrom][xlink: xlink + length]
                if strand == '-':
                    # ... reverse-strand reads end there and are written as
                    # the reverse complement of the genome slice.
                    seq = self.seqs[chrom][xlink - length: xlink]
                    compl_bases = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
                    seq = ''.join([compl_bases[nuc] for nuc in seq[::-1]])

                ofile.write('@' + name + '\n')
                ofile.write(seq + '\n')
                ofile.write('+' + '\n')
                ofile.write(
                    make_quality_scores(len(seq), min_chr=65, max_chr=73, rnd_seed=rseed) + '\n')

            # Make a "strange" read: two genome regions joined with a gap in
            # between. ``chrom`` and ``rseed`` deliberately keep the values
            # from the last loop iteration.
            # pylint: disable=undefined-loop-variable
            seq = self.seqs[chrom][250: 295] + self.seqs[chrom][310: 380]
            ofile.write('@' + 'name_strange:rbc:GGGG' + '\n')
            ofile.write(seq + '\n')
            ofile.write('+' + '\n')
            ofile.write(
                make_quality_scores(len(seq), min_chr=70, max_chr=73, rnd_seed=rseed) + '\n')
Пример #9
0
    def test_run_ok_no_adapter(self):
        """Run demultiplexing without an adapter and check the output files.

        Three reads are written to a FASTQ file: the first matches the first
        barcode exactly, the second matches the second barcode with one
        mismatch and the third matches neither. After ``demultiplex.run`` the
        per-barcode files must contain the reads with the 6-nt barcode region
        removed and the randomer embedded in the header, while the unmatched
        read must appear unchanged in the ``nomatch`` file.
        """
        barcodes = ['NNAAAN', 'NNACTN']

        def make_record(header, barcode_region, seq_seed, qual_seed):
            """Return ``[header, sequence, '+', quality]`` for one read."""
            sequence = barcode_region + make_sequence(40, rnd_seed=seq_seed)
            # The quality string always ends in '!J'; the generated part is
            # sized so that quality and sequence have the same length, as a
            # well-formed FASTQ record requires.
            quality = make_quality_scores(
                len(sequence) - 2, min_chr=65, max_chr=73, rnd_seed=qual_seed) + '!J'
            return [header, sequence, '+', quality]

        data = [
            make_record('@header/1', 'GGAAAG', 99, 76),
            make_record('@header2 blah', 'TTCCTT', 47, 71),
            make_record('@header3', 'TTGGGT', 13, 12),
        ]
        fq_fname = get_temp_file_name(extension='fq')
        fq_file = iCount.files.fastq.FastqFile(fq_fname, 'wt')
        for line in data:
            fq_file.write(iCount.files.fastq.FastqEntry(*line))
        fq_file.close()

        demultiplex.run(fq_fname,
                        None,
                        barcodes,
                        mismatches=1,
                        out_dir=self.dir)

        # First barcode: randomer 'GGG' is inserted before the '/1' suffix.
        fq1_list = make_list_from_file('{}/demux_{}.fastq.gz'.format(
            self.dir, barcodes[0]))
        expected1 = [
            ['@header:rbc:GGG/1'],
            [data[0][1][6:]],
            ['+'],
            [data[0][3][6:]],
        ]
        self.assertEqual(fq1_list, expected1)

        # Second barcode: the header description ('blah') is not kept.
        fq2_list = make_list_from_file('{}/demux_{}.fastq.gz'.format(
            self.dir, barcodes[1]))
        expected2 = [
            ['@header2:rbc:TTT'],
            [data[1][1][6:]],
            ['+'],
            [data[1][3][6:]],
        ]
        self.assertEqual(fq2_list, expected2)

        # Unmatched read: written unchanged to the 'nomatch' file.
        fq3_list = make_list_from_file('{}/demux_{}.fastq.gz'.format(
            self.dir, 'nomatch'))
        expected3 = [
            ['@header3'],
            [data[2][1]],
            ['+'],
            [data[2][3]],
        ]
        self.assertEqual(fq3_list, expected3)