def test_04_check_qual(self):
    """Exercise ``check_qual`` on the first read of the fixture file.

    The file pointer is rewound, the first read's decimal quality scores are
    taken, and ``check_qual`` is called three times with a 50-base window and
    different (threshold, allowed-bad-count) pairs.  Only the first read is
    examined; the loop is left after it.
    """
    # Each row: (label, 3rd argument to check_qual, 4th argument, expected
    # return value).  The labels are the original author's descriptions of
    # the scenario and are informational only.
    scenarios = (
        ("standard: qual treshold = 30, amount of allowed bad scores = 33% of first half",
         30, 17, 0),
        ("higher qual treshold = 40, huge amount of allowed bad scores = 100%",
         40, 101, 0),
        ("higher qual treshold = 40, low amount of allowed bad scores = 4",
         40, 4, 1),
    )

    self._fp.seek(0)
    for first_read in fastqReader(self._fp, format=self._format):
        read_scores = first_read.get_decimal_quality_scores()

        # Reset the state held on the filtering object before checking.
        # NOTE(review): the hard-coded list is stored on the object, while
        # check_qual below receives the scores read from the file -- confirm
        # the two are expected to be the same for the fixture's first read.
        filtering = self._illumina_filtering
        filtering.quality_list = [
            31, 31, 31, 37, 35, 35, 35, 35, 37, 39,
            37, 35, 39, 40, 38, 38, 40, 40, 40, 38,
            38, 40, 38, 36, 39, 40, 38, 41, 39, 39,
            40, 40, 41, 41, 41, 38, 38, 38, 38, 40,
            41, 41, 40, 40, 40, 41, 40, 37, 39, 37,
            35, 28, 33, 35, 35, 20, 22, 20, 22, 27,
            30, 34, 34, 35, 26, 33, 33, 33, 29, 34,
            31, 34, 35, 35, 34, 34, 34, 33, 35, 29,
            33, 33, 29, 35, 35, 35, 35, 33, 29, 33,
            27, 31, 34, 34, 35, 32, 31, 31, 31, 34,
            34,
        ]
        filtering.count_of_first50 = 0

        for _label, threshold, allowed_bad, expected in scenarios:
            observed = filtering.check_qual(read_scores, 50, threshold, allowed_bad)
            self.assertEqual(observed, expected)

        # Only the first read is needed for this test.
        break
def trim_by_quality(self, infile=None, format='sanger', wsize=1, wstep=1, trim_ends='53',
                    agg_action='min', exc_count=0, score_comp='>=', qual_score=0,
                    filter_first50=False, filter_Ns=False, filter_Nx=0, failed_fastq=False,
                    length=0, trim=0, clip=0, keep_zero_length=False):
    """Filter and quality-trim the reads of one FASTQ file.

    Reads ``infile`` (joined onto ``self.indir``), drops reads that fail the
    chastity / N / first-50-quality filters, trims each remaining read with a
    sliding quality window, applies the length / clip / trim steps and writes
    the survivors to ``<filebase>.filtered.fastq`` inside ``self.outdir``.

    Parameters:
        infile: input file name, relative to ``self.indir``.  Required.
        format: FASTQ variant handed to ``fastqReader`` / ``fastqWriter``.
        wsize: sliding-window size; must be >= 1.
        wstep: sliding-window step; must be >= 1.
        trim_ends: iterable of one-character end markers processed in order;
            ``'5'`` trims from the start of the read, any other character
            trims from the end.
        agg_action: key into ``ACTION_METHODS`` selecting the aggregation
            applied to each window.
        exc_count: how many window positions may be excluded from the
            aggregation (capped at the window size).
        score_comp, qual_score: comparison operator and score, passed through
            to ``self.exclude_and_compare``.
        filter_first50: when true, reject a read if 34 or more of its first
            50 quality scores are below 30.
        filter_Ns: when true, reject a read that has more than one ``'N'``,
            or exactly one ``'N'`` that is not at 1-based position
            ``filter_Nx``.
        filter_Nx: 1-based position at which a single ``'N'`` is tolerated.
        failed_fastq: when true, rejected reads are written to
            ``<output path>.failed``.
        length: reads shorter than this after window trimming are rejected
            (0 disables the check).
        trim: see the NOTE(review) at the trim step (0 disables it).
        clip: number of leading bases removed from every surviving read
            (0 disables it).
        keep_zero_length: when true, zero-length reads are still written.

    Returns:
        The output file name (not the full path).

    Raises:
        SystemExit: missing ``infile`` or non-positive window size / step.
        KeyError: ``agg_action`` is not a key of ``ACTION_METHODS``.
        IndexError: a read identifier has fewer than eight ':'-separated
            fields (the chastity flag is read from field index 7).
    """
    window_size = wsize
    window_step = wstep
    aggregation_action = agg_action
    score_comparison = score_comp
    quality_score = qual_score
    filter_length = length
    trim_length = trim
    clip_length = clip

    if not infile:
        sys.exit("illumina_fastq_trimmer: Need to specify an input file")
    if window_size < 1:
        sys.exit('illumina_fastq_trimmer: You must specify a strictly positive window size')
    if window_step < 1:
        sys.exit('illumina_fastq_trimmer: You must specify a strictly positive step size')

    print("\nRunning illumina Filtering")

    in_filepath = os.path.join(self.indir, infile)
    try:
        # "dir/name.ext" -> "name"
        filebase = infile.split('/')[1].split('.')[0]
    except IndexError:
        # No directory component in infile.
        filebase = infile.split('.')[0]

    out_filename = filebase + ".filtered.fastq"
    out_filepath = os.path.join(self.outdir, out_filename)

    # Determine an exhaustive list of window indexes that can be excluded
    # from aggregation: every sorted combination of 1..N window positions,
    # N = min(exc_count, window_size).
    exclude_window_indexes = []
    last_exclude_indexes = []
    for _ in range(min(exc_count, window_size)):
        if last_exclude_indexes:
            new_exclude_indexes = []
            for exclude_list in last_exclude_indexes:
                for window_index in range(window_size):
                    if window_index not in exclude_list:
                        new_exclude = sorted(exclude_list + [window_index])
                        if new_exclude not in exclude_window_indexes + new_exclude_indexes:
                            new_exclude_indexes.append(new_exclude)
            exclude_window_indexes += new_exclude_indexes
            last_exclude_indexes = new_exclude_indexes
        else:
            for window_index in range(window_size):
                last_exclude_indexes.append([window_index])
            exclude_window_indexes = list(last_exclude_indexes)

    # Resolve the aggregation first so an unknown agg_action fails before
    # any output file is created.
    action = ACTION_METHODS[aggregation_action]

    # Thresholds of the "first 50 bases" quality filter.
    first50 = 50
    first50_maxQ = 30
    first50_maxQ_count = 34

    num_reads = None
    num_reads_excluded = 0
    count_of_unchaste = 0
    count_of_trimmed = 0
    count_of_first50 = 0
    count_of_Ns = 0

    out = fastqWriter(open(out_filepath, 'wb'), format=format)
    fail = None
    fp = None
    try:
        if failed_fastq:
            fail = fastqWriter(open(out_filepath + '.failed', 'wb'), format=format)

        if self.runobj.compressed:
            import gzip
            try:
                logger.info("illumina_filtering: opening compressed file: " + in_filepath)
                fp = gzip.open(in_filepath)
            except (IOError, OSError):
                logger.info("illumina_filtering: opening uncompressed file: " + in_filepath)
                fp = open(in_filepath)
        else:
            logger.info("illumina_filtering: opening uncompressed file: " + in_filepath)
            fp = open(in_filepath)

        for num_reads, fastq_read in enumerate(fastqReader(fp, format=format)):
            ##### Chastity filter ##########################################
            seq = fastq_read.get_sequence()
            desc_items = fastq_read.identifier.split(':')
            # Field 7 of the ':'-split identifier equal to 'Y' marks the read
            # as having failed chastity.
            if desc_items[7] == 'Y':
                count_of_unchaste += 1
                if failed_fastq:
                    fail.write(fastq_read)
                continue

            # Filter reads with ambiguous bases
            if filter_Ns:
                countN = seq.count('N')
                if countN > 1 or (countN == 1 and seq[filter_Nx - 1:filter_Nx] != 'N'):
                    count_of_Ns += 1
                    if failed_fastq:
                        fail.write(fastq_read)
                    continue

            # Filter reads below first 50 base quality
            if filter_first50:
                quals = fastq_read.get_decimal_quality_scores()[:first50]
                count_lt30 = sum(1 for q in quals if q < first50_maxQ)
                if count_lt30 >= first50_maxQ_count:
                    if failed_fastq:
                        fail.write(fastq_read)
                    count_of_first50 += 1
                    continue

            ##### Sliding-window end trimming ##############################
            for trim_end in trim_ends:
                # Recompute per end: an earlier pass may already have
                # shortened the read, and both the scores and the slice
                # coordinates must refer to the read as it is now.
                quality_list = fastq_read.get_decimal_quality_scores()
                if trim_end == '5':
                    lwindow_position = 0  # left position of window
                    while True:
                        if lwindow_position >= len(quality_list):
                            # No acceptable window anywhere: empty the read.
                            fastq_read.sequence = ''
                            fastq_read.quality = ''
                            break
                        if self.exclude_and_compare(
                                action,
                                quality_list[lwindow_position:lwindow_position + window_size],
                                score_comparison, quality_score, exclude_window_indexes):
                            fastq_read = fastq_read.slice(lwindow_position, None)
                            break
                        lwindow_position += window_step
                else:
                    rwindow_position = len(quality_list)  # right position of window
                    while True:
                        lwindow_position = rwindow_position - window_size  # left position of window
                        if rwindow_position <= 0 or lwindow_position < 0:
                            # No acceptable window anywhere: empty the read.
                            fastq_read.sequence = ''
                            fastq_read.quality = ''
                            break
                        if self.exclude_and_compare(
                                action,
                                quality_list[lwindow_position:rwindow_position],
                                score_comparison, quality_score, exclude_window_indexes):
                            fastq_read = fastq_read.slice(None, rwindow_position)
                            break
                        rwindow_position -= window_step

            ##### Length / clip / trim #####################################
            quality_list = fastq_read.get_decimal_quality_scores()
            if filter_length:
                if len(quality_list) < filter_length:
                    # Rejected for length; not tallied in any counter.
                    print('failed length')
                    if failed_fastq:
                        fail.write(fastq_read)
                    continue

            # Trim initial bases -- remove first 10 bases from read 2
            if clip_length:
                # remove from the front:
                fastq_read = fastq_read.slice(clip_length, None)
                count_of_trimmed += 1

            # Trim to max length -- read 2 trim to 90.
            # NOTE(review): the comment above speaks of a maximum length, but
            # the slice below removes `trim` bases from the end, and the
            # length test uses the pre-clip quality_list.  Behavior is left
            # unchanged; confirm which meaning callers rely on.
            if trim_length:
                if len(quality_list) > trim_length:
                    # remove from the end:
                    fastq_read = fastq_read.slice(
                        None, len(fastq_read.get_sequence()) - trim_length)
                    count_of_trimmed += 1

            if keep_zero_length or len(fastq_read):
                out.write(fastq_read)
            else:
                num_reads_excluded += 1
    finally:
        # Release every handle even when a read raises mid-file.
        if fp is not None:
            fp.close()
        out.close()
        if fail is not None:
            fail.close()

    print("file:", infile)
    print('count_of_trimmed (for length):', count_of_trimmed)
    print('count_of_first50 (avg first50 quals < 34):', count_of_first50)
    print("count_of_unchaste ('Y' in id):", count_of_unchaste)
    print('count_of_Ns (reads with N):', count_of_Ns)
    if num_reads is None:
        print("No valid FASTQ reads could be processed.")
    else:
        print("%i FASTQ reads were processed." % (num_reads + 1))
    if num_reads_excluded:
        print("%i reads of zero length were excluded from the output." % num_reads_excluded)

    return out_filename
def trim_by_quality(self, infile=None, format='sanger', wsize=1, wstep=1, trim_ends='53',
                    agg_action='min', exc_count=0, score_comp='>=', qual_score=0,
                    filter_first50=False, filter_Ns=False, filter_Nx=0, failed_fastq=False,
                    length=0, trim=0, clip=0, keep_zero_length=False):
    """Filter and quality-trim the reads of one FASTQ file.

    Reads ``infile`` (joined onto ``self.indir``), drops reads that fail the
    chastity / N / first-50-quality filters, trims each remaining read with a
    sliding quality window, applies the length / clip / trim steps and writes
    the survivors to ``<filebase>.filtered.fastq`` inside ``self.outdir``.

    Parameters:
        infile: input file name, relative to ``self.indir``.  Required.
        format: FASTQ variant handed to ``fastqReader`` / ``fastqWriter``.
        wsize: sliding-window size; must be >= 1.
        wstep: sliding-window step; must be >= 1.
        trim_ends: iterable of one-character end markers processed in order;
            ``'5'`` trims from the start of the read, any other character
            trims from the end.
        agg_action: key into ``ACTION_METHODS`` selecting the aggregation
            applied to each window.
        exc_count: how many window positions may be excluded from the
            aggregation (capped at the window size).
        score_comp, qual_score: comparison operator and score, passed through
            to ``self.exclude_and_compare``.
        filter_first50: when true, reject a read if 34 or more of its first
            50 quality scores are below 30.
        filter_Ns: when true, reject a read that has more than one ``'N'``,
            or exactly one ``'N'`` that is not at 1-based position
            ``filter_Nx``.
        filter_Nx: 1-based position at which a single ``'N'`` is tolerated.
        failed_fastq: when true, rejected reads are written to
            ``<output path>.failed``.
        length: reads shorter than this after window trimming are rejected
            (0 disables the check).
        trim: see the NOTE(review) at the trim step (0 disables it).
        clip: number of leading bases removed from every surviving read
            (0 disables it).
        keep_zero_length: when true, zero-length reads are still written.

    Returns:
        The output file name (not the full path).

    Raises:
        SystemExit: missing ``infile`` or non-positive window size / step.
        KeyError: ``agg_action`` is not a key of ``ACTION_METHODS``.
        IndexError: a read identifier has fewer than eight ':'-separated
            fields (the chastity flag is read from field index 7).
    """
    window_size = wsize
    window_step = wstep
    aggregation_action = agg_action
    score_comparison = score_comp
    quality_score = qual_score
    filter_length = length
    trim_length = trim
    clip_length = clip

    if not infile:
        sys.exit("illumina_fastq_trimmer: Need to specify an input file")
    if window_size < 1:
        sys.exit('illumina_fastq_trimmer: You must specify a strictly positive window size')
    if window_step < 1:
        sys.exit('illumina_fastq_trimmer: You must specify a strictly positive step size')

    print("\nRunning illumina Filtering")

    in_filepath = os.path.join(self.indir, infile)
    try:
        # "dir/name.ext" -> "name"
        filebase = infile.split('/')[1].split('.')[0]
    except IndexError:
        # No directory component in infile.
        filebase = infile.split('.')[0]

    out_filename = filebase + ".filtered.fastq"
    out_filepath = os.path.join(self.outdir, out_filename)

    # Determine an exhaustive list of window indexes that can be excluded
    # from aggregation: every sorted combination of 1..N window positions,
    # N = min(exc_count, window_size).
    exclude_window_indexes = []
    last_exclude_indexes = []
    for _ in range(min(exc_count, window_size)):
        if last_exclude_indexes:
            new_exclude_indexes = []
            for exclude_list in last_exclude_indexes:
                for window_index in range(window_size):
                    if window_index not in exclude_list:
                        new_exclude = sorted(exclude_list + [window_index])
                        if new_exclude not in exclude_window_indexes + new_exclude_indexes:
                            new_exclude_indexes.append(new_exclude)
            exclude_window_indexes += new_exclude_indexes
            last_exclude_indexes = new_exclude_indexes
        else:
            for window_index in range(window_size):
                last_exclude_indexes.append([window_index])
            exclude_window_indexes = list(last_exclude_indexes)

    # Resolve the aggregation first so an unknown agg_action fails before
    # any output file is created.
    action = ACTION_METHODS[aggregation_action]

    # Thresholds of the "first 50 bases" quality filter.
    first50 = 50
    first50_maxQ = 30
    first50_maxQ_count = 34

    num_reads = None
    num_reads_excluded = 0
    count_of_unchaste = 0
    count_of_trimmed = 0
    count_of_first50 = 0
    count_of_Ns = 0

    out = fastqWriter(open(out_filepath, 'wb'), format=format)
    fail = None
    fp = None
    try:
        if failed_fastq:
            fail = fastqWriter(open(out_filepath + '.failed', 'wb'), format=format)

        if self.runobj.compressed:
            import gzip
            try:
                logger.info("illumina_filtering: opening compressed file: " + in_filepath)
                fp = gzip.open(in_filepath)
            except (IOError, OSError):
                logger.info("illumina_filtering: opening uncompressed file: " + in_filepath)
                fp = open(in_filepath)
        else:
            logger.info("illumina_filtering: opening uncompressed file: " + in_filepath)
            fp = open(in_filepath)

        for num_reads, fastq_read in enumerate(fastqReader(fp, format=format)):
            ##### Chastity filter ##########################################
            seq = fastq_read.get_sequence()
            desc_items = fastq_read.identifier.split(':')
            # Field 7 of the ':'-split identifier equal to 'Y' marks the read
            # as having failed chastity.
            if desc_items[7] == 'Y':
                count_of_unchaste += 1
                if failed_fastq:
                    fail.write(fastq_read)
                continue

            # Filter reads with ambiguous bases
            if filter_Ns:
                countN = seq.count('N')
                if countN > 1 or (countN == 1 and seq[filter_Nx - 1:filter_Nx] != 'N'):
                    count_of_Ns += 1
                    if failed_fastq:
                        fail.write(fastq_read)
                    continue

            # Filter reads below first 50 base quality
            if filter_first50:
                quals = fastq_read.get_decimal_quality_scores()[:first50]
                count_lt30 = sum(1 for q in quals if q < first50_maxQ)
                if count_lt30 >= first50_maxQ_count:
                    if failed_fastq:
                        fail.write(fastq_read)
                    count_of_first50 += 1
                    continue

            ##### Sliding-window end trimming ##############################
            for trim_end in trim_ends:
                # Recompute per end: an earlier pass may already have
                # shortened the read, and both the scores and the slice
                # coordinates must refer to the read as it is now.
                quality_list = fastq_read.get_decimal_quality_scores()
                if trim_end == '5':
                    lwindow_position = 0  # left position of window
                    while True:
                        if lwindow_position >= len(quality_list):
                            # No acceptable window anywhere: empty the read.
                            fastq_read.sequence = ''
                            fastq_read.quality = ''
                            break
                        if self.exclude_and_compare(
                                action,
                                quality_list[lwindow_position:lwindow_position + window_size],
                                score_comparison, quality_score, exclude_window_indexes):
                            fastq_read = fastq_read.slice(lwindow_position, None)
                            break
                        lwindow_position += window_step
                else:
                    rwindow_position = len(quality_list)  # right position of window
                    while True:
                        lwindow_position = rwindow_position - window_size  # left position of window
                        if rwindow_position <= 0 or lwindow_position < 0:
                            # No acceptable window anywhere: empty the read.
                            fastq_read.sequence = ''
                            fastq_read.quality = ''
                            break
                        if self.exclude_and_compare(
                                action,
                                quality_list[lwindow_position:rwindow_position],
                                score_comparison, quality_score, exclude_window_indexes):
                            fastq_read = fastq_read.slice(None, rwindow_position)
                            break
                        rwindow_position -= window_step

            ##### Length / clip / trim #####################################
            quality_list = fastq_read.get_decimal_quality_scores()
            if filter_length:
                if len(quality_list) < filter_length:
                    # Rejected for length; not tallied in any counter.
                    print('failed length')
                    if failed_fastq:
                        fail.write(fastq_read)
                    continue

            # Trim initial bases -- remove first 10 bases from read 2
            if clip_length:
                # remove from the front:
                fastq_read = fastq_read.slice(clip_length, None)
                count_of_trimmed += 1

            # Trim to max length -- read 2 trim to 90.
            # NOTE(review): the comment above speaks of a maximum length, but
            # the slice below removes `trim` bases from the end, and the
            # length test uses the pre-clip quality_list.  Behavior is left
            # unchanged; confirm which meaning callers rely on.
            if trim_length:
                if len(quality_list) > trim_length:
                    # remove from the end:
                    fastq_read = fastq_read.slice(
                        None, len(fastq_read.get_sequence()) - trim_length)
                    count_of_trimmed += 1

            if keep_zero_length or len(fastq_read):
                out.write(fastq_read)
            else:
                num_reads_excluded += 1
    finally:
        # Release every handle even when a read raises mid-file.
        if fp is not None:
            fp.close()
        out.close()
        if fail is not None:
            fail.close()

    print("file:", infile)
    print('count_of_trimmed (for length):', count_of_trimmed)
    print('count_of_first50 (avg first50 quals < 34):', count_of_first50)
    print("count_of_unchaste ('Y' in id):", count_of_unchaste)
    print('count_of_Ns (reads with N):', count_of_Ns)
    if num_reads is None:
        print("No valid FASTQ reads could be processed.")
    else:
        print("%i FASTQ reads were processed." % (num_reads + 1))
    if num_reads_excluded:
        print("%i reads of zero length were excluded from the output." % num_reads_excluded)

    return out_filename
def test_03_check_chastity(self):
    """Run ``check_chastity`` over every read of the fixture file.

    The unchaste counter on the filtering object must be 0 before any read
    is examined and 1 once all reads have been passed through
    ``check_chastity``.
    """
    self.assertEqual(self._illumina_filtering.count_of_unchaste, 0)

    reader = fastqReader(self._fp, format=self._format)
    for read in reader:
        # check_chastity receives the identifier split on ':'.
        identifier_fields = read.identifier.split(':')
        self._illumina_filtering.check_chastity(identifier_fields)

    # NOTE(review): the source's indentation was lost; this final check is
    # taken to run once, after the whole file has been processed -- confirm.
    self.assertEqual(self._illumina_filtering.count_of_unchaste, 1)