def calc_phredperbase_boxplot(infiles = None, filepattern = '', data_inpath = '', saveprefix = '', png_filename=''):
    ''' Tally the Phred scores seen at every base position and plot their stats.

    Every record produced by the SeqRecCycler is scanned and, for each base
    position, a Counter of {phred score: number of occurrences} is updated.
    The per-position counters are then summarised by getStatsFT, saved to
    disk and plotted.

    Counter dictionary may become standard way to store mass Phred/seq bases data.

    Arguments:
    infiles, filepattern, data_inpath -- forwarded unchanged to SeqRecCycler
                      to select the input files.  The last path component of
                      data_inpath is also used as the prefix of the output
                      file names.
    saveprefix     -- second component of the output file names.
    png_filename   -- passed to plotFTstats together with the stats.

    Outputs written:
    <dir>_<saveprefix>_phredCount      -- counter list, via pklsave
    <dir>_<saveprefix>_phredStats.npy  -- stats, via np.save

    Returns the tuple (stats, counter_list).  counter_list holds one Counter
    per base position: at least 101 entries, more if a longer read is met.
    '''
    from collections import Counter

    RecCycler = SeqRecCycler(infiles = infiles, filepattern = filepattern, data_inpath = data_inpath)

    print('\nCalculating Box plot stats of phred scores per base position.\n')

    # Define vars and outputs
    numfiles = RecCycler.numfiles
    toc = time.time()
    cum_t = 0

    # One Counter per base position.  101 positions are pre-allocated so the
    # result keeps its historical minimum length; longer reads extend the
    # list below instead of failing with an IndexError.
    counter_list = [Counter() for _ in range(101)]

    for seqrecgen in RecCycler.seqfilegen:

        # Read the cycler's bookkeeping before the records are consumed.
        filename = RecCycler.curfilename
        filenum = RecCycler.curfilenum

        for rec in seqrecgen:
            phreds = rec.letter_annotations['phred_quality']

            shortfall = len(phreds) - len(counter_list)
            if shortfall > 0:
                counter_list.extend(Counter() for _ in range(shortfall))

            for basenum, phred in enumerate(phreds):
                counter_list[basenum][phred] += 1

        # Time spent on this file alone: elapsed total minus the time
        # already attributed to the previous files.
        loop_t = time.time() - toc - cum_t
        cum_t += loop_t
        print('Finished {0} \nfile {1} of {2} after {3}'.format(filename, filenum, numfiles,
                                        time.strftime('%H:%M:%S', time.gmtime(loop_t))))

    # Summary statistics per base position, as computed by getStatsFT
    stats = getStatsFT(counter_list)

    total_t = time.time() - toc
    print('Processed all files in {0}'.format(time.strftime('%H:%M:%S', time.gmtime(total_t))))

    # Name the outputs after the last directory of data_inpath.  Trailing
    # slashes are dropped first, otherwise the last component is empty and
    # the file names start with a stray '_'.
    pklfilename = data_inpath.rstrip('/').split('/')[-1]
    pklsave(counter_list, '_'.join([pklfilename, saveprefix , 'phredCount']))
    np.save( '_'.join([pklfilename, saveprefix , 'phredStats.npy']) , stats)

    plotFTstats(stats, png_filename)

    return stats, counter_list
total_t = time.time() - toc print 'Finished in {0}'.format( time.strftime('%H:%M:%S', time.gmtime(total_t))) return TagCounter if __name__ == '__main__': #=========================================================================== ''' RUNS SCRIPT FOR ALLL READS IN LANE 6 ''' #=========================================================================== LANE = '6' # Set paths and file patterns data_inpath = '/space/musselle/datasets/gazellesAndZebras/lane' + LANE + '/L6_phredprop_filtered/' # data_inpath = '/home/musselle/data/lane' + LANE os.chdir(data_inpath) raw_files = glob.glob('*[0-9]-pass.fastq.bgzf') raw_files.sort() TagsCounter = tags_counter(infiles = raw_files, sl=(6,12)) pklsave(TagsCounter, 'L{0}_TagsCount-pass'.format(LANE))