Exemplo n.º 1
0
    def test_04_check_qual(self):
        """Check ``check_qual`` on the first fixture read with three settings.

        The arguments passed to ``check_qual`` are the read's decimal quality
        scores followed by ``50`` and the two values described in the comment
        above each call. The value returned by each call is compared with the
        expected ``count_of_first50``.
        """
        self._fp.seek(0)

        # Only the first read is needed. Fetch it explicitly so the test fails
        # loudly instead of passing with zero assertions when the fixture
        # yields no reads.
        fastq_read = next(iter(fastqReader(self._fp, format=self._format)), None)
        self.assertIsNotNone(fastq_read, "fixture did not yield any FASTQ read")

        quality_list = fastq_read.get_decimal_quality_scores()
        # NOTE(review): the hard-coded scores are stored on the filter object,
        # while the scores parsed from the read are what is passed to
        # check_qual -- presumably both describe the same read; confirm.
        self._illumina_filtering.quality_list = [31, 31, 31, 37, 35, 35, 35, 35, 37, 39, 37, 35, 39, 40, 38, 38, 40, 40, 40, 38, 38, 40, 38, 36, 39, 40, 38, 41, 39, 39, 40, 40, 41, 41, 41, 38, 38, 38, 38, 40, 41, 41, 40, 40, 40, 41, 40, 37, 39, 37, 35, 28, 33, 35, 35, 20, 22, 20, 22, 27, 30, 34, 34, 35, 26, 33, 33, 33, 29, 34, 31, 34, 35, 35, 34, 34, 34, 33, 35, 29, 33, 33, 29, 35, 35, 35, 35, 33, 29, 33, 27, 31, 34, 34, 35, 32, 31, 31, 31, 34, 34]
        self._illumina_filtering.count_of_first50 = 0

        # standard: quality threshold = 30, amount of allowed bad scores = 33% of first half
        count_of_first50 = self._illumina_filtering.check_qual(quality_list, 50, 30, 17)
        self.assertEqual(count_of_first50, 0)

        # higher quality threshold = 40, huge amount of allowed bad scores = 100%
        count_of_first50 = self._illumina_filtering.check_qual(quality_list, 50, 40, 101)
        self.assertEqual(count_of_first50, 0)

        # higher quality threshold = 40, low amount of allowed bad scores = 4
        count_of_first50 = self._illumina_filtering.check_qual(quality_list, 50, 40, 4)
        self.assertEqual(count_of_first50, 1)
    def trim_by_quality(self, infile=None,
                        format='sanger',        wsize=1,        wstep=1,            trim_ends='53',
                        agg_action='min',       exc_count=0,    score_comp='>=',    qual_score=0,
                        filter_first50=False,   filter_Ns=False,filter_Nx=0,        failed_fastq=False,
                        length=0,               trim=0,         clip=0,             keep_zero_length=False):
        """Filter and quality-trim the reads of one FASTQ file.

        Reads come from ``os.path.join(self.indir, infile)`` and surviving
        reads are written to ``<filebase>.filtered.fastq`` in ``self.outdir``.
        Each read goes through, in order: the chastity filter, the optional
        ``N`` filter, the optional first-50 quality filter, sliding-window
        quality trimming, then the optional length filter, clip and trim.
        Summary counters are printed at the end.

        Args:
            infile: Input file name, relative to ``self.indir``.
            format: Passed as ``format`` to ``fastqReader`` and ``fastqWriter``.
            wsize: Sliding-window size; must be >= 1.
            wstep: Sliding-window step; must be >= 1.
            trim_ends: Iterable of end markers processed in order. ``'5'``
                trims from the start of the read, any other value trims from
                the end.
            agg_action: Key into ``ACTION_METHODS`` selecting the aggregation
                handed to ``self.exclude_and_compare``.
            exc_count: Maximum number of window positions that may be
                excluded; every index combination of size 1 up to
                ``min(exc_count, wsize)`` is handed to
                ``self.exclude_and_compare``.
            score_comp: Comparison operator forwarded to
                ``self.exclude_and_compare``.
            qual_score: Quality threshold forwarded to
                ``self.exclude_and_compare``.
            filter_first50: When true, reject reads in which at least 34 of
                the first 50 quality scores are below 30.
            filter_Ns: When true, reject reads containing more than one ``N``,
                or exactly one ``N`` that is not at 1-based position
                ``filter_Nx``.
            filter_Nx: 1-based position at which a single ``N`` is tolerated.
                With the default ``0`` no single ``N`` is tolerated.
            failed_fastq: When true, rejected reads are written to
                ``<output path>.failed``.
            length: When non-zero, reject reads shorter than this after
                quality trimming.
            trim: When non-zero and the read is longer than this, the read is
                sliced to ``len(read) - trim`` bases (see note in the body).
            clip: When non-zero, number of leading bases removed.
            keep_zero_length: When true, zero-length reads are still written.

        Returns:
            The output file name (without the ``self.outdir`` prefix).

        Raises:
            SystemExit: If ``infile`` is empty or ``wsize``/``wstep`` is < 1.
        """
        # Function-scope import, in the same style as ``import gzip`` below.
        from itertools import combinations

        window_size = wsize
        window_step = wstep
        filter_length = length
        trim_length = trim
        clip_length = clip

        if not infile:
            sys.exit("illumina_fastq_trimmer: Need to specify an input file")

        if window_size < 1:
            sys.exit('illumina_fastq_trimmer: You must specify a strictly positive window size')

        if window_step < 1:
            sys.exit('illumina_fastq_trimmer: You must specify a strictly positive step size')

        print("\nRunning illumina Filtering")

        in_filepath = os.path.join(self.indir, infile)
        # NOTE(review): this takes the *second* '/'-separated component of
        # infile, not the last one -- confirm that is intended for nested paths.
        try:
            filebase = infile.split('/')[1].split('.')[0]
        except IndexError:
            # No '/' in infile: use the bare file name.
            filebase = infile.split('.')[0]

        out_filename = filebase + ".filtered.fastq"
        out_filepath = os.path.join(self.outdir, out_filename)

        # Exhaustive list of window index sets that may be excluded from
        # aggregation: every combination of 1..min(exc_count, wsize) indexes,
        # smallest sets first, each as a sorted list.
        exclude_window_indexes = [
            list(index_set)
            for set_size in range(1, min(exc_count, window_size) + 1)
            for index_set in combinations(range(window_size), set_size)
        ]

        # Thresholds of the first-50 filter.
        first50 = 50
        first50_maxQ = 30
        first50_maxQ_count = 34

        num_reads = None
        num_reads_excluded = 0
        count_of_unchaste = 0
        count_of_trimmed = 0
        count_of_first50 = 0
        count_of_Ns = 0

        out = fastqWriter(open(out_filepath, 'wb'), format=format)
        fail = None
        fp = None
        try:
            action = ACTION_METHODS[agg_action]
            if failed_fastq:
                fail = fastqWriter(open(out_filepath + '.failed', 'wb'), format=format)

            if self.runobj.compressed:
                import gzip
                logger.info("illumina_filtering: opening compressed file: " + in_filepath)
                try:
                    fp = gzip.open(in_filepath)
                except (IOError, OSError):
                    logger.info("illumina_filtering: opening uncompressed file: " + in_filepath)
                    fp = open(in_filepath)
            else:
                logger.info("illumina_filtering: opening uncompressed file: " + in_filepath)
                fp = open(in_filepath)

            for num_reads, fastq_read in enumerate(fastqReader(fp, format=format)):
                seq = fastq_read.get_sequence()

                # Chastity filter: reject reads whose 8th ':'-separated
                # identifier field is 'Y'.
                desc_items = fastq_read.identifier.split(':')
                if desc_items[7] == 'Y':
                    count_of_unchaste += 1
                    if fail is not None:
                        fail.write(fastq_read)
                    continue

                # Filter reads with ambiguous bases.
                if filter_Ns:
                    countN = seq.count('N')
                    if countN > 1 or (countN == 1 and seq[filter_Nx - 1:filter_Nx] != 'N'):
                        count_of_Ns += 1
                        if fail is not None:
                            fail.write(fastq_read)
                        continue

                # Filter reads with too many low-quality scores in the first 50 bases.
                if filter_first50:
                    quals = fastq_read.get_decimal_quality_scores()[:first50]
                    count_lt30 = sum(1 for q in quals if q < first50_maxQ)
                    if count_lt30 >= first50_maxQ_count:
                        if fail is not None:
                            fail.write(fastq_read)
                        count_of_first50 += 1
                        continue

                # Sliding-window quality trimming of the requested ends.
                for trim_end in trim_ends:
                    # Re-read the scores for every end: once one end has been
                    # trimmed the read is shorter, so window positions must be
                    # computed on the current read, not on the original one.
                    quality_list = fastq_read.get_decimal_quality_scores()

                    if trim_end == '5':
                        lwindow_position = 0  # left position of window
                        while True:
                            if lwindow_position >= len(quality_list):
                                # No acceptable window found: empty the read.
                                fastq_read.sequence = ''
                                fastq_read.quality = ''
                                break
                            window = quality_list[lwindow_position:lwindow_position + window_size]
                            if self.exclude_and_compare(action, window, score_comp, qual_score, exclude_window_indexes):
                                fastq_read = fastq_read.slice(lwindow_position, None)
                                break
                            lwindow_position += window_step
                    else:
                        rwindow_position = len(quality_list)  # right position of window
                        while True:
                            lwindow_position = rwindow_position - window_size  # left position of window
                            if rwindow_position <= 0 or lwindow_position < 0:
                                # No acceptable window found: empty the read.
                                fastq_read.sequence = ''
                                fastq_read.quality = ''
                                break
                            window = quality_list[lwindow_position:rwindow_position]
                            if self.exclude_and_compare(action, window, score_comp, qual_score, exclude_window_indexes):
                                fastq_read = fastq_read.slice(None, rwindow_position)
                                break
                            rwindow_position -= window_step

                # Length / clip / trim on the quality-trimmed read.
                quality_list = fastq_read.get_decimal_quality_scores()

                if filter_length:
                    if len(quality_list) < filter_length:
                        print('failed length')
                        if fail is not None:
                            fail.write(fastq_read)
                        continue

                # Remove the first clip_length bases.
                if clip_length:
                    fastq_read = fastq_read.slice(clip_length, None)
                    count_of_trimmed += 1

                # NOTE(review): this removes trim_length bases from the end of
                # the read, although the original comment read "Trim to max
                # length -- read 2 trim to 90". The length test also uses the
                # pre-clip score list. Behaviour is kept as-is; confirm intent.
                if trim_length:
                    if len(quality_list) > trim_length:
                        fastq_read = fastq_read.slice(None, len(fastq_read.get_sequence()) - trim_length)
                        count_of_trimmed += 1

                if keep_zero_length or len(fastq_read):
                    out.write(fastq_read)
                else:
                    num_reads_excluded += 1
        finally:
            # Release every handle, including the input file, even on error.
            out.close()
            if fail is not None:
                fail.close()
            if fp is not None:
                fp.close()

        print("file:", infile)
        print('count_of_trimmed             (for length):', count_of_trimmed)
        print('count_of_first50 (avg first50 quals < 34):', count_of_first50)
        print("count_of_unchaste             ('Y' in id):", count_of_unchaste)
        print('count_of_Ns                (reads with N):', count_of_Ns)
        if num_reads is None:
            print("No valid FASTQ reads could be processed.")
        else:
            print("%i FASTQ reads were processed." % (num_reads + 1))
        if num_reads_excluded:
            print("%i reads of zero length were excluded from the output." % num_reads_excluded)

        return out_filename
Exemplo n.º 3
0
    def trim_by_quality(self,
                        infile=None,
                        format='sanger',
                        wsize=1,
                        wstep=1,
                        trim_ends='53',
                        agg_action='min',
                        exc_count=0,
                        score_comp='>=',
                        qual_score=0,
                        filter_first50=False,
                        filter_Ns=False,
                        filter_Nx=0,
                        failed_fastq=False,
                        length=0,
                        trim=0,
                        clip=0,
                        keep_zero_length=False):
        """Filter and quality-trim the reads of one FASTQ file.

        Reads come from ``os.path.join(self.indir, infile)`` and surviving
        reads are written to ``<filebase>.filtered.fastq`` in ``self.outdir``.
        Each read goes through, in order: the chastity filter, the optional
        ``N`` filter, the optional first-50 quality filter, sliding-window
        quality trimming, then the optional length filter, clip and trim.
        Summary counters are printed at the end.

        Args:
            infile: Input file name, relative to ``self.indir``.
            format: Passed as ``format`` to ``fastqReader`` and ``fastqWriter``.
            wsize: Sliding-window size; must be >= 1.
            wstep: Sliding-window step; must be >= 1.
            trim_ends: Iterable of end markers processed in order. ``'5'``
                trims from the start of the read, any other value trims from
                the end.
            agg_action: Key into ``ACTION_METHODS`` selecting the aggregation
                handed to ``self.exclude_and_compare``.
            exc_count: Maximum number of window positions that may be
                excluded; every index combination of size 1 up to
                ``min(exc_count, wsize)`` is handed to
                ``self.exclude_and_compare``.
            score_comp: Comparison operator forwarded to
                ``self.exclude_and_compare``.
            qual_score: Quality threshold forwarded to
                ``self.exclude_and_compare``.
            filter_first50: When true, reject reads in which at least 34 of
                the first 50 quality scores are below 30.
            filter_Ns: When true, reject reads containing more than one ``N``,
                or exactly one ``N`` that is not at 1-based position
                ``filter_Nx``.
            filter_Nx: 1-based position at which a single ``N`` is tolerated.
                With the default ``0`` no single ``N`` is tolerated.
            failed_fastq: When true, rejected reads are written to
                ``<output path>.failed``.
            length: When non-zero, reject reads shorter than this after
                quality trimming.
            trim: When non-zero and the read is longer than this, the read is
                sliced to ``len(read) - trim`` bases (see note in the body).
            clip: When non-zero, number of leading bases removed.
            keep_zero_length: When true, zero-length reads are still written.

        Returns:
            The output file name (without the ``self.outdir`` prefix).

        Raises:
            SystemExit: If ``infile`` is empty or ``wsize``/``wstep`` is < 1.
        """
        # Function-scope import, in the same style as ``import gzip`` below.
        from itertools import combinations

        window_size = wsize
        window_step = wstep
        filter_length = length
        trim_length = trim
        clip_length = clip

        if not infile:
            sys.exit("illumina_fastq_trimmer: Need to specify an input file")

        if window_size < 1:
            sys.exit(
                'illumina_fastq_trimmer: You must specify a strictly positive window size'
            )

        if window_step < 1:
            sys.exit(
                'illumina_fastq_trimmer: You must specify a strictly positive step size'
            )

        print("\nRunning illumina Filtering")

        in_filepath = os.path.join(self.indir, infile)
        # NOTE(review): this takes the *second* '/'-separated component of
        # infile, not the last one -- confirm that is intended for nested paths.
        try:
            filebase = infile.split('/')[1].split('.')[0]
        except IndexError:
            # No '/' in infile: use the bare file name.
            filebase = infile.split('.')[0]

        out_filename = filebase + ".filtered.fastq"
        out_filepath = os.path.join(self.outdir, out_filename)

        # Exhaustive list of window index sets that may be excluded from
        # aggregation: every combination of 1..min(exc_count, wsize) indexes,
        # smallest sets first, each as a sorted list.
        exclude_window_indexes = [
            list(index_set)
            for set_size in range(1, min(exc_count, window_size) + 1)
            for index_set in combinations(range(window_size), set_size)
        ]

        # Thresholds of the first-50 filter.
        first50 = 50
        first50_maxQ = 30
        first50_maxQ_count = 34

        num_reads = None
        num_reads_excluded = 0
        count_of_unchaste = 0
        count_of_trimmed = 0
        count_of_first50 = 0
        count_of_Ns = 0

        out = fastqWriter(open(out_filepath, 'wb'), format=format)
        fail = None
        fp = None
        try:
            action = ACTION_METHODS[agg_action]
            if failed_fastq:
                fail = fastqWriter(open(out_filepath + '.failed', 'wb'),
                                   format=format)

            if self.runobj.compressed:
                import gzip
                logger.info("illumina_filtering: opening compressed file: " +
                            in_filepath)
                try:
                    fp = gzip.open(in_filepath)
                except (IOError, OSError):
                    logger.info(
                        "illumina_filtering: opening uncompressed file: " +
                        in_filepath)
                    fp = open(in_filepath)
            else:
                logger.info("illumina_filtering: opening uncompressed file: " +
                            in_filepath)
                fp = open(in_filepath)

            for num_reads, fastq_read in enumerate(
                    fastqReader(fp, format=format)):
                seq = fastq_read.get_sequence()

                # Chastity filter: reject reads whose 8th ':'-separated
                # identifier field is 'Y'.
                desc_items = fastq_read.identifier.split(':')
                if desc_items[7] == 'Y':
                    count_of_unchaste += 1
                    if fail is not None:
                        fail.write(fastq_read)
                    continue

                # Filter reads with ambiguous bases.
                if filter_Ns:
                    countN = seq.count('N')
                    if countN > 1 or (countN == 1 and
                                      seq[filter_Nx - 1:filter_Nx] != 'N'):
                        count_of_Ns += 1
                        if fail is not None:
                            fail.write(fastq_read)
                        continue

                # Filter reads with too many low-quality scores in the first
                # 50 bases.
                if filter_first50:
                    quals = fastq_read.get_decimal_quality_scores()[:first50]
                    count_lt30 = sum(1 for q in quals if q < first50_maxQ)
                    if count_lt30 >= first50_maxQ_count:
                        if fail is not None:
                            fail.write(fastq_read)
                        count_of_first50 += 1
                        continue

                # Sliding-window quality trimming of the requested ends.
                for trim_end in trim_ends:
                    # Re-read the scores for every end: once one end has been
                    # trimmed the read is shorter, so window positions must be
                    # computed on the current read, not on the original one.
                    quality_list = fastq_read.get_decimal_quality_scores()

                    if trim_end == '5':
                        lwindow_position = 0  # left position of window
                        while True:
                            if lwindow_position >= len(quality_list):
                                # No acceptable window found: empty the read.
                                fastq_read.sequence = ''
                                fastq_read.quality = ''
                                break
                            window = quality_list[
                                lwindow_position:lwindow_position + window_size]
                            if self.exclude_and_compare(
                                    action, window, score_comp, qual_score,
                                    exclude_window_indexes):
                                fastq_read = fastq_read.slice(
                                    lwindow_position, None)
                                break
                            lwindow_position += window_step
                    else:
                        rwindow_position = len(quality_list)  # right position of window
                        while True:
                            lwindow_position = rwindow_position - window_size  # left position of window
                            if rwindow_position <= 0 or lwindow_position < 0:
                                # No acceptable window found: empty the read.
                                fastq_read.sequence = ''
                                fastq_read.quality = ''
                                break
                            window = quality_list[
                                lwindow_position:rwindow_position]
                            if self.exclude_and_compare(
                                    action, window, score_comp, qual_score,
                                    exclude_window_indexes):
                                fastq_read = fastq_read.slice(
                                    None, rwindow_position)
                                break
                            rwindow_position -= window_step

                # Length / clip / trim on the quality-trimmed read.
                quality_list = fastq_read.get_decimal_quality_scores()

                if filter_length:
                    if len(quality_list) < filter_length:
                        print('failed length')
                        if fail is not None:
                            fail.write(fastq_read)
                        continue

                # Remove the first clip_length bases.
                if clip_length:
                    fastq_read = fastq_read.slice(clip_length, None)
                    count_of_trimmed += 1

                # NOTE(review): this removes trim_length bases from the end of
                # the read, although the original comment read "Trim to max
                # length -- read 2 trim to 90". The length test also uses the
                # pre-clip score list. Behaviour is kept as-is; confirm intent.
                if trim_length:
                    if len(quality_list) > trim_length:
                        fastq_read = fastq_read.slice(
                            None,
                            len(fastq_read.get_sequence()) - trim_length)
                        count_of_trimmed += 1

                if keep_zero_length or len(fastq_read):
                    out.write(fastq_read)
                else:
                    num_reads_excluded += 1
        finally:
            # Release every handle, including the input file, even on error.
            out.close()
            if fail is not None:
                fail.close()
            if fp is not None:
                fp.close()

        print("file:", infile)
        print('count_of_trimmed             (for length):', count_of_trimmed)
        print('count_of_first50 (avg first50 quals < 34):', count_of_first50)
        print("count_of_unchaste             ('Y' in id):", count_of_unchaste)
        print('count_of_Ns                (reads with N):', count_of_Ns)
        if num_reads is None:
            print("No valid FASTQ reads could be processed.")
        else:
            print("%i FASTQ reads were processed." % (num_reads + 1))
        if num_reads_excluded:
            print("%i reads of zero length were excluded from the output." %
                  num_reads_excluded)

        return out_filename
Exemplo n.º 4
0
 def test_03_check_chastity(self):
     """Run ``check_chastity`` over every fixture read and expect one hit.

     ``count_of_unchaste`` must be 0 before any read is checked and 1 after
     all reads have been checked.
     """
     self.assertEqual(self._illumina_filtering.count_of_unchaste, 0)
     # Rewind first so the result does not depend on how much of the shared
     # file handle was consumed before this test ran.
     self._fp.seek(0)
     for fastq_read in fastqReader(self._fp, format=self._format):
         # check_chastity receives the ':'-separated identifier fields.
         desc_items = fastq_read.identifier.split(':')
         self._illumina_filtering.check_chastity(desc_items)
     self.assertEqual(self._illumina_filtering.count_of_unchaste, 1)