def test_barcode_size_diff(self):
    # Two reads whose 5' barcodes differ in size: 6 leading positions are
    # expected to be removed from the first read and 8 from the second.
    barcodes = demultiplex.prepare_barcodes(self.barcodes5, None)

    entry1 = FastqEntry(
        '@header1',
        'TTAATT' + make_sequence(40),
        '+',
        # One quality score per base: 6 barcode positions + 40 bases.
        # (The record previously carried only 44 scores for 46 bases.)
        make_quality_scores(46),
    )
    entry2 = FastqEntry(
        '@header2',
        'GGACGAGG' + make_sequence(40),
        '+',
        make_quality_scores(48),
    )
    fq_fname = self.create_fq_file([entry1, entry2])

    handle = demultiplex._extract(fq_fname, barcodes, mismatches=1, minimum_length=15)

    # First read: assigned to the first 5' barcode, 6 positions trimmed.
    read1, wbrc, randomer1 = next(handle)
    self.assertEqual(wbrc, self.barcodes5[0])
    self.assertEqual(randomer1, 'TTT')
    self.assertEqual(read1.id, entry1.id)
    self.assertEqual(read1.seq, entry1.seq[6:])
    self.assertEqual(read1.plus, entry1.plus)
    self.assertEqual(read1.qual, entry1.qual[6:])

    # Second read: assigned to the third 5' barcode, 8 positions trimmed.
    read2, wbrc, randomer2 = next(handle)
    self.assertEqual(wbrc, self.barcodes5[2])
    self.assertEqual(randomer2, 'GGGG')
    self.assertEqual(read2.id, entry2.id)
    self.assertEqual(read2.seq, entry2.seq[8:])
    self.assertEqual(read2.plus, entry2.plus)
    self.assertEqual(read2.qual, entry2.qual[8:])
def test_barcode_size_diff(self):
    # Barcodes of different sizes (6 and 7 positions), one mismatch allowed.
    barcodes = ['NAAANN', 'NNCCCNN']
    adapter = 'CCCCCC'

    # Each record is [header, sequence, separator, quality].
    record1 = [
        '@header1',
        'TACATT' + adapter + make_sequence(40, rnd_seed=0),
        '+',
        make_quality_scores(50, rnd_seed=0) + '!J',
    ]
    record2 = [
        '@header2',
        'AACCCTT' + adapter + make_sequence(39, rnd_seed=0),
        '+',
        make_quality_scores(50, rnd_seed=0) + '!J',
    ]

    fq_fname = get_temp_file_name(extension='fq')
    fq_file = iCount.files.fastq.FastqFile(fq_fname, 'wt')
    for record in (record1, record2):
        fq_file.write(iCount.files.fastq.FastqEntry(*record))
    fq_file.close()

    handle = demultiplex._extract(fq_fname, barcodes, mismatches=1)

    # (source record, expected experiment index, expected randomer, trimmed size)
    expectations = [
        (record1, 0, 'TTT', 6),
        (record2, 1, 'AATT', 7),
    ]
    for record, expected_id, expected_randomer, trim in expectations:
        read, exp_id, randomer = next(handle)
        header, sequence, _, quality = record
        self.assertEqual(exp_id, expected_id)
        self.assertEqual(randomer, expected_randomer)
        self.assertEqual(read.id, header)
        self.assertEqual(read.seq, sequence[trim:])
        self.assertEqual(read.plus, '+')
        self.assertEqual(read.qual, quality[trim:])
def setUp(self):
    # Keep ResourceWarning noise out of the test output.
    warnings.simplefilter("ignore", ResourceWarning)

    self.dir = get_temp_dir()
    self.adapter = 'AAAAAAAAAA'

    # Barcode sets are paired by position: set #k is (barcodes5[k-1], barcodes3[k-1]).
    self.barcodes5 = ['NNAAAN', 'NGGGN', 'NGGGN']
    self.barcodes3 = ['.', 'NNGGG', 'NCCC']

    # Record #1
    #   header:   early version Illumina header
    #   barcodes: exact match to barcode set #1
    sequence1 = 'GGAAAG' + make_sequence(40) + self.adapter
    self.entry1 = FastqEntry('@header1/1', sequence1, '+', make_quality_scores(56))

    # Record #2
    #   header:   contains id and description
    #   barcodes: one mismatch on 5' end for barcode set #2
    sequence2 = 'AGGTA' + make_sequence(40) + 'AAGGG' + self.adapter
    self.entry2 = FastqEntry('@header2 blah', sequence2, '+', make_quality_scores(60))

    # Record #3
    #   header:   simple header
    #   barcodes: one mismatch on 3' end for barcode set #3
    sequence3 = 'TGGGT' + make_sequence(40) + 'TACC' + self.adapter
    self.entry3 = FastqEntry('@header3', sequence3, '+', make_quality_scores(59))

    # Write all three records to a temporary FASTQ file shared by the tests.
    self.fq_fname = get_temp_file_name(extension='fq')
    self.fq_file = iCount.files.fastq.FastqFile(self.fq_fname, 'wt')
    for record in (self.entry1, self.entry2, self.entry3):
        self.fq_file.write(record)
    self.fq_file.close()
def test_barcode3(self):
    # Extraction with the 3' barcodes registered under the third 5' barcode,
    # one mismatch allowed. Both reads carry their barcode in the last 5
    # positions of the sequence.
    prepared = demultiplex.prepare_barcodes(
        self.barcodes5 + [self.barcodes5[2]], self.barcodes3)
    barcodes = prepared[self.barcodes5[2]]['barcodes3']

    first = FastqEntry(
        '@header1',
        make_sequence(40) + 'TTGGG',
        '+',
        make_quality_scores(45),
    )
    second = FastqEntry(
        '@header2',
        make_sequence(40) + 'TCAAA',
        '+',
        make_quality_scores(45),
    )
    fq_fname = self.create_fq_file([first, second])

    handle = demultiplex._extract(fq_fname, barcodes, mismatches=1, minimum_length=15)

    # (source entry, index of the expected 3' barcode, expected randomer).
    # The barcode is looked up inside the loop, right before it is compared.
    expectations = [
        (first, 2, 'TT'),
        (second, 3, 'TC'),
    ]
    for entry, barcode_index, expected_randomer in expectations:
        read, wbrc, randomer = next(handle)
        self.assertEqual(wbrc, self.barcodes3[barcode_index])
        self.assertEqual(randomer, expected_randomer)
        self.assertEqual(read.id, entry.id)
        self.assertEqual(read.seq, entry.seq[:-5])
        self.assertEqual(read.plus, entry.plus)
        self.assertEqual(read.qual, entry.qual[:-5])
def test_extract_mismatch(self):
    # Too many mismatches: with mismatches=0 the read must be reported as
    # 'nomatch' and returned untrimmed.
    barcodes = demultiplex.prepare_barcodes(self.barcodes5, None)
    entry = FastqEntry(
        '@header1',
        'TTTTTT' + make_sequence(40),
        '+',
        make_quality_scores(46),
    )
    fq_fname = self.create_fq_file([entry])

    records = list(
        demultiplex._extract(fq_fname, barcodes, mismatches=0, minimum_length=15))

    # Exactly one record went in, so exactly one must come out. Checking the
    # count keeps the test from passing vacuously when nothing is yielded.
    self.assertEqual(len(records), 1)

    read, wbrc, randomer = records[0]
    self.assertEqual(wbrc, 'nomatch')
    self.assertEqual(randomer, '')
    self.assertEqual(read.id, entry.id)
    self.assertEqual(read.seq, entry.seq)
    self.assertEqual(read.plus, entry.plus)
    self.assertEqual(read.qual, entry.qual)
def test_extract_ok_2(self):
    # One mismatch, second barcode: the read must be assigned to
    # self.barcodes5[1] and have its first 6 positions removed.
    barcodes = demultiplex.prepare_barcodes(self.barcodes5, None)
    entry = FastqEntry(
        '@header1',
        'TTAGTT' + make_sequence(40),
        '+',
        make_quality_scores(46),
    )
    fq_fname = self.create_fq_file([entry])

    records = list(
        demultiplex._extract(fq_fname, barcodes, mismatches=1, minimum_length=15))

    # Exactly one record went in, so exactly one must come out. Checking the
    # count keeps the test from passing vacuously when nothing is yielded.
    self.assertEqual(len(records), 1)

    read, wbrc, randomer = records[0]
    self.assertEqual(wbrc, self.barcodes5[1])
    self.assertEqual(randomer, 'TTT')
    self.assertEqual(read.id, entry.id)
    self.assertEqual(read.seq, entry.seq[6:])
    self.assertEqual(read.plus, entry.plus)
    self.assertEqual(read.qual, entry.qual[6:])
def test_extract_mismatch(self):
    # Too many mismatches: with mismatches=0 the read must get experiment
    # index -1 and be returned untrimmed.
    barcodes = ['NNAAAN', 'NNCCTN', 'NNACGN']
    adapter = 'CCCCCC'
    # [header, sequence, separator, quality]
    data = [
        '@header1',
        'TTACTT' + adapter + make_sequence(40, rnd_seed=0),
        '+',
        make_quality_scores(50, rnd_seed=0) + '!J',
    ]

    fq_fname = get_temp_file_name(extension='fq')
    fq_file = iCount.files.fastq.FastqFile(fq_fname, 'wt')
    fq_file.write(iCount.files.fastq.FastqEntry(*data))
    fq_file.close()

    records = list(demultiplex._extract(fq_fname, barcodes, mismatches=0))

    # Exactly one record went in, so exactly one must come out. Checking the
    # count keeps the test from passing vacuously when nothing is yielded.
    self.assertEqual(len(records), 1)

    read, exp_id, randomer = records[0]
    self.assertEqual(exp_id, -1)
    self.assertEqual(randomer, '')
    self.assertEqual(read.id, data[0])
    self.assertEqual(read.seq, data[1])
    self.assertEqual(read.plus, '+')
    self.assertEqual(read.qual, data[3])
def setUp(self):
    # Keep resource/import warning noise out of the test output.
    warnings.simplefilter("ignore", (ResourceWarning, ImportWarning))

    # --- Genome: fai and fasta files --------------------------------------
    self.fai_data = {'1': 1000}
    self.fai = make_file_from_list(
        list(self.fai_data.items()), bedtool=False, extension='fai')

    random.seed(0)  # pylint:disable=no-member
    # NOTE(review): the number of seeds drawn is len(self.fai), not
    # len(self.fai_data). If make_file_from_list returns a file name, this is
    # the length of that name - confirm. Left untouched because it determines
    # how far the random stream has advanced when the read seeds are drawn.
    random_seeds = random.randint(10**5, size=len(self.fai))  # pylint:disable=no-member

    # One random sequence per chromosome, each generated from its own seed.
    self.seqs = {}
    for (name, size), rseed in zip(self.fai_data.items(), random_seeds):
        self.seqs[name] = make_sequence(size, rnd_seed=rseed)
    self.fasta = make_fasta_file(sequences=self.seqs.values(), headers=self.seqs.keys())

    # --- Annotation -------------------------------------------------------
    def gtf_row(feature, start, end, strand, attributes):
        # All records are on chromosome '1'; source, score and frame are '.'.
        return ['1', '.', feature, start, end, '.', strand, '.', attributes]

    self.gtf = make_file_from_list(extension='gtf', data=[
        # Gene #1, positive strand:
        gtf_row('gene', '100', '499', '+', attrs(gid='G1')),
        # Transcript #1
        gtf_row('transcript', '100', '249', '+', attrs(gid='G1', tid='T1')),
        gtf_row('exon', '100', '149', '+', attrs(gid='G1', tid='T1', exn=1)),
        gtf_row('exon', '200', '229', '+', attrs(gid='G1', tid='T1', exn=2)),
        gtf_row('CDS', '200', '229', '+', attrs(gid='G1', tid='T1', exn=2)),
        gtf_row('exon', '240', '249', '+', attrs(gid='G1', tid='T1', exn=3)),
        # Transcript #2
        gtf_row('transcript', '240', '499', '+', attrs(gid='G1', tid='T2')),
        gtf_row('exon', '240', '299', '+', attrs(gid='G1', tid='T2', exn=1)),
        gtf_row('CDS', '240', '299', '+', attrs(gid='G1', tid='T2', exn=1)),
        gtf_row('exon', '400', '499', '+', attrs(gid='G1', tid='T2', exn=2)),
        gtf_row('CDS', '400', '499', '+', attrs(gid='G1', tid='T2', exn=2)),

        # Gene #2, positive strand:
        gtf_row('gene', '600', '799', '+', attrs(gid='G2')),
        # Transcript #3
        gtf_row('transcript', '600', '799', '+', attrs(gid='G2', tid='T3')),
        gtf_row('exon', '600', '649', '+', attrs(gid='G2', tid='T3', exn=1)),
        gtf_row('CDS', '600', '649', '+', attrs(gid='G2', tid='T3', exn=1)),
        gtf_row('exon', '750', '799', '+', attrs(gid='G2', tid='T3', exn=2)),
        gtf_row('CDS', '750', '799', '+', attrs(gid='G2', tid='T3', exn=2)),

        # Gene #3, negative strand:
        gtf_row('gene', '800', '899', '-', 'gene_id "G3";'),
        # Transcript #4
        gtf_row('transcript', '800', '899', '-', attrs(gid='G3', tid='T4')),
        gtf_row('exon', '800', '899', '-', attrs(gid='G3', tid='T4', exn=1)),
        gtf_row('CDS', '800', '899', '-', attrs(gid='G3', tid='T4', exn=1)),
    ])

    # --- Reads ------------------------------------------------------------
    # Define positions of cross-links:
    self.read_data = [
        # [name, chromosome, sequence_len, read_start, strand]
        ('name01:rbc:CCCC', '1', 50, 80, '+'),
        ('name02:rbc:CCCC', '1', 50, 140, '+'),
        ('name03:rbc:AAAA', '1', 50, 142, '+'),
        ('name04:rbc:CCCC', '1', 50, 142, '+'),
        ('name05:rbc:CCCC', '1', 30, 160, '+'),
        ('name06:rbc:CCCC', '1', 30, 163, '+'),
        ('name07:rbc:GGGG', '1', 30, 163, '+'),
        ('name08:rbc:CCCC', '1', 20, 205, '+'),
        ('name09:rbc:CCCC', '1', 50, 235, '+'),
        ('name10:rbc:CCCC', '1', 50, 480, '+'),
        ('name11:rbc:CCCC', '1', 30, 530, '+'),
        ('name12:rbc:CCCC', '1', 30, 610, '+'),
        ('name13:rbc:CCCC', '1', 30, 549, '-'),
    ]

    def fastq_record(name, seq, quality):
        # Four-line FASTQ record: header, sequence, separator, quality.
        return ''.join(['@', name, '\n', seq, '\n', '+', '\n', quality, '\n'])

    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

    # Make reads from genome, pointing to specific pre-determined cross-link positions:
    random_seeds = random.randint(10**5, size=len(self.read_data))  # pylint:disable=no-member
    self.reads = get_temp_file_name(extension='fastq')
    with open(self.reads, 'wt') as ofile:
        for (name, chrom, length, xlink, strand), rseed in zip(self.read_data, random_seeds):
            genome = self.seqs[chrom]
            if strand == '-':
                # Negative strand: take the bases that end at the given
                # position and reverse-complement them.
                template = genome[xlink - length: xlink]
                seq = ''.join([complement[base] for base in reversed(template)])
            else:
                seq = genome[xlink: xlink + length]
            quality = make_quality_scores(len(seq), min_chr=65, max_chr=73, rnd_seed=rseed)
            ofile.write(fastq_record(name, seq, quality))

        # Make a "strange" read: two genome segments, [250, 295) and
        # [310, 380), glued together. It deliberately reuses the chromosome
        # and seed left over from the last iteration of the loop above.
        # pylint: disable=undefined-loop-variable
        seq = self.seqs[chrom][250: 295] + self.seqs[chrom][310: 380]
        quality = make_quality_scores(len(seq), min_chr=70, max_chr=73, rnd_seed=rseed)
        ofile.write(fastq_record('name_strange:rbc:GGGG', seq, quality))
def test_run_ok_no_adapter(self):
    # Full demultiplex run without adapter removal (adapter argument is None),
    # one mismatch allowed. The first two reads are expected in the files of
    # the first and second barcode, the third one in the 'nomatch' file.
    barcodes = ['NNAAAN', 'NNACTN']

    # Every read is 6 barcode positions + 40 bases = 46 characters, so the
    # quality string is 44 random scores + '!J' = 46 characters as well.
    # (It used to be 50 + 2 = 52, a size that only fits the variant of this
    # test in which a 6-base adapter is part of the sequence.)
    data = [
        [
            '@header/1',
            'GGAAAG' + make_sequence(40, rnd_seed=99),
            '+',
            make_quality_scores(44, min_chr=65, max_chr=73, rnd_seed=76) + '!J',
        ],
        [
            '@header2 blah',
            'TTCCTT' + make_sequence(40, rnd_seed=47),
            '+',
            make_quality_scores(44, min_chr=65, max_chr=73, rnd_seed=71) + '!J',
        ],
        [
            '@header3',
            'TTGGGT' + make_sequence(40, rnd_seed=13),
            '+',
            make_quality_scores(44, min_chr=65, max_chr=73, rnd_seed=12) + '!J',
        ],
    ]

    fq_fname = get_temp_file_name(extension='fq')
    fq_file = iCount.files.fastq.FastqFile(fq_fname, 'wt')
    for line in data:
        fq_file.write(iCount.files.fastq.FastqEntry(*line))
    fq_file.close()

    demultiplex.run(fq_fname, None, barcodes, mismatches=1, out_dir=self.dir)

    # First barcode: randomer is inserted into the header before the '/1'
    # suffix and the 6 barcode positions are trimmed.
    fq1_list = make_list_from_file('{}/demux_{}.fastq.gz'.format(
        self.dir, barcodes[0]))
    expected1 = [
        ['@header:rbc:GGG/1'],
        [data[0][1][6:]],
        ['+'],
        [data[0][3][6:]],
    ]
    self.assertEqual(fq1_list, expected1)

    # Second barcode: the header description ('blah') is not part of the output.
    fq2_list = make_list_from_file('{}/demux_{}.fastq.gz'.format(
        self.dir, barcodes[1]))
    expected2 = [
        ['@header2:rbc:TTT'],
        [data[1][1][6:]],
        ['+'],
        [data[1][3][6:]],
    ]
    self.assertEqual(fq2_list, expected2)

    # No barcode matched: the record is written out unchanged.
    fq3_list = make_list_from_file('{}/demux_{}.fastq.gz'.format(
        self.dir, 'nomatch'))
    expected3 = [
        ['@header3'],
        [data[2][1]],
        ['+'],
        [data[2][3]],
    ]
    self.assertEqual(fq3_list, expected3)