def test_boxplot(self):
    '''It checks the boxplot drawing

    draw_boxplot is called twice with six lists of random numbers: once
    with the default options and once limiting the plotted boxes to five.
    Both times the start of the written plot file is checked for 'xml';
    the statistics text is checked after the first call only.
    '''
    #some data: five normal distributions and a uniform one
    mu = 10
    sigma = 3
    num_values = 1000
    lists = [[random.normalvariate(mu, sigma) for _ in range(num_values)]
             for _ in range(5)]
    lists.append([random.uniform(mu + sigma, mu - sigma)
                  for _ in range(num_values)])

    stats_fhand = StringIO()
    # the context managers close (and so remove) the temporary plot and
    # close the handle used to read it back even when an assertion fails
    with tempfile.NamedTemporaryFile(suffix='.svg') as plot_fhand:
        draw_boxplot(lists, xlabel='distributions', ylabel='distrib',
                     title='boxplot', fhand=plot_fhand,
                     stats_fhand=stats_fhand)
        with open(plot_fhand.name) as written_plot:
            assert 'xml' in written_plot.read(10)
    result = stats_fhand.getvalue()
    assert '09' in result
    assert 'median' in result

    #now asking not to draw all the boxes: six lists, five boxes at most
    with tempfile.NamedTemporaryFile(suffix='.svg') as plot_fhand:
        draw_boxplot(lists, xlabel='distributions', ylabel='distrib',
                     title='boxplot', fhand=plot_fhand,
                     stats_fhand=stats_fhand, max_plotted_boxes=5)
        with open(plot_fhand.name) as written_plot:
            assert 'xml' in written_plot.read(10)
def _do_seq_distrib_for_pair(self, pair):
    '''It does the distribution for a pair of cleaned and raw seqs

    pair is queried with 'raw' and 'cleaned'; every entry found must have
    the attributes last_version (path to the sequence file) and basename.

    For each kind of reads found in the pair these files are written in
    the stats directory of that kind:
        - the nucleotide frequency per position plot,
        - the length distribution (plot and .dat),
        - the quality distribution (plot and .dat) if there are quals,
        - an entry appended to the statistics file,
        - the quality boxplot (plot and .dat).
    When both kinds are found the .length.diff and .qual.diff
    distributions are also written in the cleaned stats directory.

    A plot that already exists is not redone. If the length plot for a
    kind of reads exists, its quality distribution and its statistics
    entry are skipped too, and no diff distribution is written because
    the numbers for that kind are not loaded.
    '''
    def get_stats_dir(seq_type):
        'It returns the stats directory for the given kind of reads'
        return os.path.join(self._get_project_path(),
                            BACKBONE_DIRECTORIES['%s_reads_stats' % seq_type])

    plot_ext = '.' + PLOT_FILE_FORMAT

    #the statistics for both clean and raw sequences
    lengths = {}
    quals = {}
    for seq_type in ('raw', 'cleaned'):
        if seq_type not in pair:
            continue
        stats_dir = get_stats_dir(seq_type)
        fpath = pair[seq_type].last_version
        basename = pair[seq_type].basename

        # nucleotide freq per position
        # the check is done on the plot, which is the file really written
        out_fpath = os.path.join(stats_dir, basename + '.freq_position')
        plot_fpath = out_fpath + plot_ext
        if not os.path.exists(plot_fpath):
            with open(fpath) as seq_fhand, \
                 open(plot_fpath, 'w') as plot_fhand:
                create_nucleotide_freq_histogram(seqs_in_file(seq_fhand),
                                                 fhand=plot_fhand)

        #the names for the output files
        out_fpath = os.path.join(stats_dir, basename + '.length')
        plot_fpath = out_fpath + plot_ext
        if os.path.exists(plot_fpath):
            continue
        lengths_, quals_ = self._get_lengths_quals_from_file(fpath)
        lengths[seq_type] = lengths_
        quals[seq_type] = quals_

        #the distributions for the lengths
        distrib = lengths_.calculate_distribution()
        with open(out_fpath + '.dat', 'w') as distrib_fhand, \
             open(plot_fpath, 'w') as plot_fhand:
            lengths_.draw_distribution(distrib,
                                       labels=PLOT_LABELS['seq_length'],
                                       distrib_fhand=distrib_fhand,
                                       plot_fhand=plot_fhand)

        #the distributions for the quals
        if quals_.count != 0:
            out_fpath = os.path.join(stats_dir, basename + '.qual')
            distrib = quals_.calculate_distribution()
            with open(out_fpath + plot_ext, 'w') as plot_fhand, \
                 open(out_fpath + '.dat', 'w') as distrib_fhand:
                quals_.draw_distribution(distrib,
                                         labels=PLOT_LABELS['seq_qual'],
                                         plot_fhand=plot_fhand,
                                         distrib_fhand=distrib_fhand)

        #the statistics for the statistics file
        # it is opened only when there is something to append
        stats_fpath = os.path.join(stats_dir,
                                   BACKBONE_BASENAMES['statistics_file'])
        with open(stats_fpath, 'a') as stats_fhand:
            self._write_statistics(stats_fhand, fpath, lengths_, quals_)

    #the statistics for the differences
    if 'raw' in pair and 'cleaned' in pair:
        basename = pair['cleaned'].basename
        stats_dir = get_stats_dir('cleaned')
        out_fpath = os.path.join(stats_dir, basename + '.length.diff')
        plot_fpath = out_fpath + plot_ext

        # lengths and quals are filled together, and only for the kinds of
        # reads processed above, so both kinds are required here
        both_loaded = 'raw' in lengths and 'cleaned' in lengths
        if both_loaded and not os.path.exists(plot_fpath):
            #the distributions for the lengths
            with open(plot_fpath, 'w') as plot_fhand, \
                 open(out_fpath + '.dat', 'w') as distrib_fhand:
                self._do_diff_distrib_for_numbers(
                                        (lengths['raw'], lengths['cleaned']),
                                        plot_fhand=plot_fhand,
                                        distrib_fhand=distrib_fhand,
                                        dist_type='seq_length')

            #the distributions for the quals
            out_fpath = os.path.join(stats_dir, basename + '.qual.diff')
            qual_pair = quals['raw'], quals['cleaned']
            if qual_pair[0].count != 0 and qual_pair[1].count != 0:
                with open(out_fpath + plot_ext, 'w') as plot_fhand, \
                     open(out_fpath + '.dat', 'w') as distrib_fhand:
                    self._do_diff_distrib_for_numbers(
                                                qual_pair,
                                                plot_fhand=plot_fhand,
                                                distrib_fhand=distrib_fhand,
                                                dist_type='seq_qual')
            del qual_pair

    # these references are not needed for the boxplots
    del lengths, quals

    for seq_type in ('raw', 'cleaned'):
        if seq_type not in pair:
            continue
        stats_dir = get_stats_dir(seq_type)
        fpath = pair[seq_type].last_version
        basename = pair[seq_type].basename

        #the names for the output files
        out_fpath = os.path.join(stats_dir, basename + '.qual.boxplot')
        plot_fpath = out_fpath + plot_ext
        if os.path.exists(plot_fpath):
            continue
        quals_by_length = self._get_quals_by_length_from_file(fpath)
        if quals_by_length and quals_by_length[0]:
            #boxplot
            labels = PLOT_LABELS['qual_boxplot']
            with open(plot_fpath, 'w') as plot_fhand, \
                 open(out_fpath + '.dat', 'w') as stats_fhand:
                draw_boxplot(quals_by_length, fhand=plot_fhand,
                             title=labels['title'],
                             xlabel=labels['xlabel'],
                             ylabel=labels['ylabel'],
                             stats_fhand=stats_fhand,
                             max_plotted_boxes=30)
        del quals_by_length
def _do_stats_for_file(self, path, stats_dir):
    '''It calculates the stats for a file of seqs

    path must have the attributes last_version (path to the sequence
    file) and basename; stats_dir is the directory where the output files
    are written.

    The outputs are the nucleotide frequency per position plot, the
    length and quality distributions (plot and .dat), the quality boxplot
    (plot and .dat) and an entry appended to the statistics file.

    A plot that already exists is not redone. The quality distribution is
    only drawn when there are quals. The statistics entry is appended
    only when the lengths and quals had to be read from the file.
    '''
    fpath = path.last_version
    basename = path.basename
    plot_ext = '.' + PLOT_FILE_FORMAT

    #output_files
    freq_pos_out_fpath = os.path.join(stats_dir, basename + '.freq_position')
    length_out_fpath = os.path.join(stats_dir, basename + '.length')
    qual_out_fpath = os.path.join(stats_dir, basename + '.qual')
    qual_boxplot_out_fpath = os.path.join(stats_dir,
                                          basename + '.qual.boxplot')

    # The existence checks are done on the plots because they are the
    # files really written, the names without extension are never created

    # nucleotide freq per position
    plot_fpath = freq_pos_out_fpath + plot_ext
    if not os.path.exists(plot_fpath):
        with open(fpath) as seq_fhand, open(plot_fpath, 'w') as plot_fhand:
            create_nucleotide_freq_histogram(seqs_in_file(seq_fhand),
                                             fhand=plot_fhand)

    #Extract lengths and quals
    length_plot_fpath = length_out_fpath + plot_ext
    qual_plot_fpath = qual_out_fpath + plot_ext
    lengths_todo = not os.path.exists(length_plot_fpath)
    quals_todo = not os.path.exists(qual_plot_fpath)
    numbers_loaded = lengths_todo or quals_todo
    if numbers_loaded:
        lengths_, quals_ = self._get_lengths_quals_from_file(fpath)

    #the distributions for the lengths
    if lengths_todo:
        distrib = lengths_.calculate_distribution()
        with open(length_out_fpath + '.dat', 'w') as distrib_fhand, \
             open(length_plot_fpath, 'w') as plot_fhand:
            lengths_.draw_distribution(distrib,
                                       labels=PLOT_LABELS['seq_length'],
                                       distrib_fhand=distrib_fhand,
                                       plot_fhand=plot_fhand)

    #the distributions for the quals
    # quals_ is only looked at when it was loaded (quals_todo is True)
    if quals_todo and quals_.count != 0:
        distrib = quals_.calculate_distribution()
        with open(qual_plot_fpath, 'w') as plot_fhand, \
             open(qual_out_fpath + '.dat', 'w') as distrib_fhand:
            quals_.draw_distribution(distrib,
                                     labels=PLOT_LABELS['seq_qual'],
                                     plot_fhand=plot_fhand,
                                     distrib_fhand=distrib_fhand)

    # qual boxplot
    plot_fpath = qual_boxplot_out_fpath + plot_ext
    if not os.path.exists(plot_fpath):
        quals = self._get_quals_by_length_from_file(fpath)
        if quals and quals[0]:
            #boxplot
            labels = PLOT_LABELS['qual_boxplot']
            with open(plot_fpath, 'w') as plot_fhand, \
                 open(qual_boxplot_out_fpath + '.dat', 'w') as box_fhand:
                draw_boxplot(quals, fhand=plot_fhand,
                             title=labels['title'],
                             xlabel=labels['xlabel'],
                             ylabel=labels['ylabel'],
                             stats_fhand=box_fhand,
                             max_plotted_boxes=30)

    #the statistics for the statistics file
    # without the loaded numbers there is nothing to write
    if numbers_loaded:
        stats_fpath = os.path.join(stats_dir,
                                   BACKBONE_BASENAMES['statistics_file'])
        with open(stats_fpath, 'a') as stats_fhand:
            self._write_statistics(stats_fhand, fpath, lengths_, quals_)