示例#1
0
    def test_nucl_per_position():
        '''It calculates the freq of each nucleotide per position.

        The seqs have different lengths and two of them hold a 'Z' in the
        fifth position. The expected result has only the A, C, T and G keys
        and, for that fifth position, a 0 for each of them.
        It also checks that the plot written is a matplotlib svg.
        '''
        seqs = [SeqWithQuality(Seq(seq)) for seq in ('ACTG', 'ACTG', 'CATG',
                                                     'CATGZ', 'ACTT',
                                                     'ACTTZT')]

        #the with closes (and removes) the temporary file even if an
        #assertion fails
        with NamedTemporaryFile(suffix='.svg') as fhand:
            stats = create_nucleotide_freq_histogram(seqs, fhand, title='test')
            assert stats == {
              'A': [0.66666666666666663, 0.33333333333333331, 0.0, 0.0, 0, 0.0],
              'C': [0.33333333333333331, 0.66666666666666663, 0.0, 0.0, 0, 0.0],
              'T': [0.0, 0.0, 1.0, 0.33333333333333331, 0, 1.0],
              'G': [0.0, 0.0, 0.0, 0.66666666666666663, 0, 0.0]}
            #the content should be in the disk before reading it by name
            fhand.flush()
            with open(fhand.name, 'r') as svg_fhand:
                svg = svg_fhand.read()
            assert '<!-- Created with matplotlib (http://matplotlib' in svg
示例#2
0
    def _do_seq_distrib_for_pair(self, pair):
        '''It does the distributions for a pair of cleaned and raw seqs.

        pair is indexed by 'raw' and/or 'cleaned'. Every item should have
        the last_version (the path to the seq file) and the basename
        attributes.

        For every kind of reads found in the pair it writes in the stats
        directory of that kind:
            - the nucleotide frequency per position plot,
            - the length and quality distributions (plot and .dat),
            - the statistics, appended to the statistics file,
            - the quality boxplot (plot and .dat).
        When both kinds are in the pair the raw versus cleaned difference
        distributions are written in the cleaned stats directory.

        An output is not redone when its plot file already exists. If the
        length plot of a kind of reads exists, the quality distribution and
        the statistics for that kind are skipped too, so the differences can
        only be done when both kinds have been calculated in this call.
        '''
        def get_stats_dir(seq_type):
            'It returns the stats directory for the given kind of reads'
            dir_key = '%s_reads_stats' % seq_type
            return os.path.join(self._get_project_path(),
                                BACKBONE_DIRECTORIES[dir_key])

        plot_ext = '.' + PLOT_FILE_FORMAT

        #the statistics for both clean and raw sequences
        lengths = {}
        quals = {}
        for seq_type in ('raw', 'cleaned'):
            stats_dir = get_stats_dir(seq_type)
            stats_fpath = os.path.join(stats_dir,
                                       BACKBONE_BASENAMES['statistics_file'])
            #the statistics file is opened, and so created, even when there
            #are no seqs of this kind. The with closes it in every case,
            #the continue statements included
            with open(stats_fpath, 'a') as stats_fhand:
                if seq_type not in pair:
                    continue
                fpath = pair[seq_type].last_version
                basename = pair[seq_type].basename

                # nucleotide freq per position
                out_fpath = os.path.join(stats_dir,
                                         basename + '.freq_position')
                plot_fpath = out_fpath + plot_ext
                #we look for the plot because it is the file that is written,
                #nothing creates the path without the extension
                if not os.path.exists(plot_fpath):
                    #the seq file should remain open while the seqs are used
                    with open(fpath) as seq_fhand, \
                         open(plot_fpath, 'w') as plot_fhand:
                        seqs = seqs_in_file(seq_fhand)
                        create_nucleotide_freq_histogram(seqs,
                                                         fhand=plot_fhand)

                #the names for the output files
                out_fpath = os.path.join(stats_dir, basename + '.length')
                plot_fpath = out_fpath + plot_ext
                distrib_fpath = out_fpath + '.dat'
                if os.path.exists(plot_fpath):
                    continue

                lengths_, quals_ = self._get_lengths_quals_from_file(fpath)
                lengths[seq_type] = lengths_
                quals[seq_type] = quals_

                #the distributions for the lengths
                distrib = lengths_.calculate_distribution()
                with open(distrib_fpath, 'w') as distrib_fhand, \
                     open(plot_fpath, 'w') as plot_fhand:
                    lengths_.draw_distribution(distrib,
                                             labels=PLOT_LABELS['seq_length'],
                                             distrib_fhand=distrib_fhand,
                                             plot_fhand=plot_fhand)

                #the distributions for the quals
                out_fpath = os.path.join(stats_dir, basename + '.qual')
                plot_fpath = out_fpath + plot_ext
                distrib_fpath = out_fpath + '.dat'

                if quals_.count != 0:
                    distrib = quals_.calculate_distribution()
                    with open(plot_fpath, 'w') as plot_fhand, \
                         open(distrib_fpath, 'w') as distrib_fhand:
                        quals_.draw_distribution(distrib,
                                               labels=PLOT_LABELS['seq_qual'],
                                               plot_fhand=plot_fhand,
                                               distrib_fhand=distrib_fhand)

                #the statistics for the statistics file
                self._write_statistics(stats_fhand, fpath, lengths_, quals_)

        #the statistics for the differences
        if 'raw' in pair and 'cleaned' in pair:
            basename = pair['cleaned'].basename

            #the names for the output files
            stats_dir = get_stats_dir('cleaned')
            out_fpath = os.path.join(stats_dir, basename + '.length.diff')
            plot_fpath = out_fpath + plot_ext
            distrib_fpath = out_fpath + '.dat'

            #both kinds are required. One of them is missing from lengths
            #when its length plot was already done and the other was not
            both_done = 'raw' in lengths and 'cleaned' in lengths
            if not os.path.exists(plot_fpath) and both_done:
                #the distributions for the lengths
                length_pair = lengths['raw'], lengths['cleaned']
                with open(plot_fpath, 'w') as plot_fhand, \
                     open(distrib_fpath, 'w') as distrib_fhand:
                    self._do_diff_distrib_for_numbers(length_pair,
                                                  plot_fhand=plot_fhand,
                                                  distrib_fhand=distrib_fhand,
                                                  dist_type='seq_length')
                del length_pair

                #the distributions for the quals
                out_fpath = os.path.join(stats_dir, basename + '.qual.diff')
                plot_fpath = out_fpath + plot_ext
                distrib_fpath = out_fpath + '.dat'

                #lengths and quals are filled at once, so both kinds are here
                qual_pair = quals['raw'], quals['cleaned']
                if qual_pair[0].count != 0 and qual_pair[1].count != 0:
                    with open(plot_fpath, 'w') as plot_fhand, \
                         open(distrib_fpath, 'w') as distrib_fhand:
                        self._do_diff_distrib_for_numbers(qual_pair,
                                                  plot_fhand=plot_fhand,
                                                  distrib_fhand=distrib_fhand,
                                                  dist_type='seq_qual')
                del qual_pair

        #these are not used from here on, so we release them before
        #building the boxplots
        del lengths, quals

        for seq_type in ('raw', 'cleaned'):
            if seq_type not in pair:
                continue
            stats_dir = get_stats_dir(seq_type)
            fpath = pair[seq_type].last_version
            basename = pair[seq_type].basename

            #the names for the output files
            out_fpath = os.path.join(stats_dir, basename + '.qual.boxplot')
            plot_fpath = out_fpath + plot_ext
            distrib_fpath = out_fpath + '.dat'

            if os.path.exists(plot_fpath):
                continue

            quals_ = self._get_quals_by_length_from_file(fpath)

            if quals_ and quals_[0]:
                #boxplot
                with open(plot_fpath, 'w') as plot_fhand, \
                     open(distrib_fpath, 'w') as boxplot_stats_fhand:
                    draw_boxplot(quals_, fhand=plot_fhand,
                                 title=PLOT_LABELS['qual_boxplot']['title'],
                                 xlabel=PLOT_LABELS['qual_boxplot']['xlabel'],
                                 ylabel=PLOT_LABELS['qual_boxplot']['ylabel'],
                                 stats_fhand=boxplot_stats_fhand,
                                 max_plotted_boxes=30)
            del quals_
    def _do_stats_for_file(self, path, stats_dir):
        '''It calculates the stats for a file of seqs.

        path should have the last_version (the path to the seq file) and
        the basename attributes. stats_dir is the directory in which all the
        output files are written.

        It writes the nucleotide frequency per position plot, the length
        and quality distributions (plot and .dat) and the quality boxplot
        (plot and .dat). A plot is not redone when its file already exists.
        The quality distribution is only done when there are qualities.
        The statistics are appended to the statistics file in every call.
        '''
        fpath = path.last_version
        basename = path.basename
        stats_fpath = os.path.join(stats_dir,
                                   BACKBONE_BASENAMES['statistics_file'])
        plot_ext = '.' + PLOT_FILE_FORMAT

        #output_files
        freq_pos_out_fpath = os.path.join(stats_dir,
                                          basename + '.freq_position')
        length_out_fpath = os.path.join(stats_dir, basename + '.length')
        qual_out_fpath = os.path.join(stats_dir, basename + '.qual')
        qual_boxplot_out_fpath = os.path.join(stats_dir,
                                              basename + '.qual.boxplot')

        #In every check we look for the plot because it is the file that is
        #written, nothing creates the paths without the extension

        # nucleotide freq per position
        plot_fpath = freq_pos_out_fpath + plot_ext
        if not os.path.exists(plot_fpath):
            #the seq file should remain open while the seqs are used
            with open(fpath) as seq_fhand, \
                 open(plot_fpath, 'w') as plot_fhand:
                seqs = seqs_in_file(seq_fhand)
                create_nucleotide_freq_histogram(seqs, fhand=plot_fhand)

        #Extract lengths and quals. They are always required because the
        #statistics are written no matter which plots are already done
        lengths_, quals_ = self._get_lengths_quals_from_file(fpath)

        #the distributions for the lengths
        plot_fpath = length_out_fpath + plot_ext
        if not os.path.exists(plot_fpath):
            distrib_fpath = length_out_fpath + '.dat'
            distrib = lengths_.calculate_distribution()
            with open(distrib_fpath, 'w') as distrib_fhand, \
                 open(plot_fpath, 'w') as plot_fhand:
                lengths_.draw_distribution(distrib,
                                           labels=PLOT_LABELS['seq_length'],
                                           distrib_fhand=distrib_fhand,
                                           plot_fhand=plot_fhand)

        #the distributions for the quals, only when there are quals and
        #the plot is not already done
        plot_fpath = qual_out_fpath + plot_ext
        if quals_.count != 0 and not os.path.exists(plot_fpath):
            distrib_fpath = qual_out_fpath + '.dat'
            distrib = quals_.calculate_distribution()
            with open(plot_fpath, 'w') as plot_fhand, \
                 open(distrib_fpath, 'w') as distrib_fhand:
                quals_.draw_distribution(distrib,
                                         labels=PLOT_LABELS['seq_qual'],
                                         plot_fhand=plot_fhand,
                                         distrib_fhand=distrib_fhand)

        # qual boxplot
        plot_fpath = qual_boxplot_out_fpath + plot_ext
        if not os.path.exists(plot_fpath):
            quals = self._get_quals_by_length_from_file(fpath)
            if quals and quals[0]:
                distrib_fpath = qual_boxplot_out_fpath + '.dat'
                #boxplot
                with open(plot_fpath, 'w') as plot_fhand, \
                     open(distrib_fpath, 'w') as boxplot_stats_fhand:
                    draw_boxplot(quals, fhand=plot_fhand,
                                 title=PLOT_LABELS['qual_boxplot']['title'],
                                 xlabel=PLOT_LABELS['qual_boxplot']['xlabel'],
                                 ylabel=PLOT_LABELS['qual_boxplot']['ylabel'],
                                 stats_fhand=boxplot_stats_fhand,
                                 max_plotted_boxes=30)

        #the statistics for the statistics file
        with open(stats_fpath, 'a') as stats_fhand:
            self._write_statistics(stats_fhand, fpath, lengths_, quals_)