예제 #1
0
    def hist_average_quality(self, fontsize=16, bins=None):
        """Plot a stacked histogram of the average quality of each read.

        For every read found in :attr:`hq_sequence` and :attr:`lq_sequence`,
        the ``'quality'`` entry is decoded and each character is converted
        to ``ord(char) - 33``; the mean of those values is the read's
        average quality. HQ counts are drawn first and LQ counts are stacked
        on top of them. A twin y-axis shows, as a black line, the total
        number of reads minus the cumulative count across the bins.

        :param fontsize: font size used for the legend.
        :param bins: forwarded to :func:`numpy.histogram`. When None,
            ``range(0, 94)`` is used.

        .. note:: the x-axis limits are always set to ``[0.5, 93.5]``,
            whatever *bins* is.
        """

        def average_qv(read):
            # Character code minus 33 gives the per-base quality value.
            scores = [ord(char) - 33 for char in read['quality'].decode()]
            return pylab.mean(scores)

        hq_means = [average_qv(read) for read in self.hq_sequence]
        lq_means = [average_qv(read) for read in self.lq_sequence]

        bin_spec = range(0, 94) if bins is None else bins
        hq_counts, edges = np.histogram(hq_means, bins=bin_spec)
        lq_counts, edges = np.histogram(lq_means, bins=bin_spec)

        # Bars are positioned on the right-hand edge of each bin.
        positions = edges[1:]
        pylab.bar(positions, hq_counts, width=1, label="HQ")
        pylab.bar(positions, lq_counts, bottom=hq_counts, width=1, label="LQ")
        pylab.xlim([0.5, 93.5])

        pylab.xlabel("Isoform average QV")
        pylab.ylabel("# Isoform")
        pylab.legend(fontsize=fontsize)

        # Second y-axis: number of reads not yet accounted for at each edge.
        twin_axis = pylab.twinx()
        stacked = hq_counts + lq_counts
        total = np.sum(stacked)
        remaining = [total] + list(total - np.cumsum(stacked))
        twin_axis.plot(edges, remaining, "k")
예제 #2
0
    def moving_average(self, n, circular=False):
        """Compute moving average of the genome coverage

        :param n: window's size. Must be odd
        :param bool circular: is the chromosome circular or not
        :raises AssertionError: if *n* is not strictly smaller than half
            the number of rows of :attr:`df`.

        Store the results in the :attr:`df` attribute (dataframe) with a
        column named *ma*.

        In the non-circular case the averages are stored in a series
        indexed from ``n // 2`` onwards; pandas aligns it on the dataframe
        index, so rows without a matching label end up as NaN.

        In the circular case the coverage is padded with its last *n*
        values on the left and its first *n* values on the right before
        averaging, so every row receives a value. The padded data and the
        trimmed averages are also kept in :attr:`data` and :attr:`ma`.

        """
        N = len(self.df['cov'])
        # Kept as an assert so callers still see AssertionError; note that
        # it is skipped when Python runs with -O.
        assert n < N/2

        if not circular:
            # Windowed sums obtained from the difference of cumulative sums
            # taken n positions apart.
            ret = np.cumsum(np.array(self.df["cov"]), dtype=float)
            ret[n:] = ret[n:] - ret[:-n]
            ma = ret[n - 1:] / n
            # Label each average with the position at the window's centre.
            mid = int(n / 2)
            self.df["ma"] = pd.Series(
                ma, index=np.arange(start=mid, stop=(len(ma) + mid)))
            return

        # Only the circular branch needs the helper, so import it lazily;
        # the linear branch above has no dependency on it.
        from sequana.stats import moving_average

        # FIXME: shift of +-1 as compared to non circular case...
        # NOTE(review): this appears to concern even n only; with odd n
        # (as the docstring requires) both branches look centred on the
        # same position -- to be confirmed.
        coverage = self.df['cov'].values
        # Wrap the data around: tail + full coverage + head.
        self.data = list(coverage[N-n:]) + list(coverage) + list(coverage[0:n])
        ma = moving_average(self.data, n)
        # Trim the padding so that exactly N values remain. Note that
        # ``-n//2`` is ``(-n) // 2`` (floor division of the negated size).
        self.ma = ma[n//2+1:-n//2]
        self.df["ma"] = pd.Series(self.ma, index=self.df['cov'].index)
예제 #3
0
def N50(data):
    """Return the N50 of a collection of contig lengths.

    The lengths may be given in any order. They are sorted in increasing
    order and accumulated; the value returned is the length at the first
    position where the running total exceeds half of the overall total.
    Equivalently, contigs of that length or longer hold at least half of
    the nucleotides of the assembly.

    :param data: contig lengths (list or array).
    :return: the N50 length (an element of *data*).
    """
    lengths = np.sort(data)
    running_total = np.cumsum(lengths)
    half_total = running_total[-1] / 2
    # argmax on a boolean array gives the index of the first True.
    crossing = np.argmax(running_total > half_total)
    return lengths[crossing]
예제 #4
0
def L50(data):
    """Return the smallest number of contigs whose length sum produces N50

    The lengths are sorted in increasing order and accumulated. The
    result is the number of contigs from the first position where the
    running total exceeds half of the overall total up to the longest
    contig.

    :param data: contig lengths (list or array), in any order.
    :return: the number of contigs counted as described above.

    ::

        >>> data = [2, 2, 2, 3, 3, 4, 8, 8]
        >>> int(L50(data))
        2
    """
    ordered = np.sort(data)
    accumulated = np.cumsum(ordered)
    threshold = accumulated[-1] / 2
    # Index of the first contig at which the running total passes half.
    first_past_half = np.argmax(accumulated > threshold)
    return len(ordered) - first_past_half
예제 #5
0
파일: stats.py 프로젝트: sequana/sequana
def moving_average(data, n):
    """Return the moving average of *data* over windows of *n* values.

    :param data: sequence of numbers.
    :param n: window's size (odd or even).
    :return: array holding the mean of each full window.

    ::

        >>> from sequana.stats import moving_average as ma
        >>> ma([1,1,1,1,3,3,3,3], 4).tolist()
        [1.0, 1.5, 2.0, 2.5, 3.0]

    .. note:: only full windows are averaged, so the returned vector is
        shorter than the input one (``len(data) - n + 1`` values).

    """
    running = np.cumsum(data, dtype=float)
    # Subtracting the cumulative sum taken n positions earlier leaves the
    # sum over each window; the copy avoids reading values being updated.
    lagged = running[:-n].copy()
    running[n:] -= lagged
    return running[n - 1:] / n
예제 #6
0
def moving_average(data, n):
    """Average *data* over a sliding window of *n* consecutive values.

    :param data: sequence of numbers.
    :param n: window's size (odd or even).
    :return: array with one mean per full window.

    ::

        >>> from sequana.stats import moving_average as ma
        >>> ma([1,1,1,1,3,3,3,3], 4).tolist()
        [1.0, 1.5, 2.0, 2.5, 3.0]

    .. note:: the final vector does not have the same size as the input
        vector: it contains ``len(data) - n + 1`` values.

    """
    sums = np.cumsum(data, dtype=float)
    # ``recent`` is a view on ``sums``: writing through it turns the
    # cumulative sums from index n onwards into per-window sums.
    earlier, recent = sums[:-n], sums[n:]
    recent[...] = recent - earlier
    window_means = sums[n - 1:] / n
    return window_means
예제 #7
0
    def plot_all_skews(self, figsize=(10, 12), fontsize=16, alpha=0.5):
        """Draw every skew and content curve on a single 9-row figure.

        From top to bottom: GC skew and its cumulative sum, AT skew and its
        cumulative sum, the three cumulative curves stored in ``_Xn``,
        ``_Yn`` and ``_Zn``, then the GC and AT content. All rows share the
        same x-axis.

        :param figsize: figure size forwarded to ``pylab.subplots``.
        :param fontsize: font size of the figure title.
        :param alpha: transparency applied to every curve.
        :raises AttributeError: if no window has been set.
        """
        if self._window is None:
            raise AttributeError("Please set a valid window to compute skew")

        # Nine stacked panels (a tenth one, for the FFT, is disabled).
        fig, axarr = pylab.subplots(9, 1, sharex=True, figsize=figsize)

        title_template = (
            "Window size = %d (%.0f %% of genome )\n"
            "        GC content = %.0f %%, AT content = %.0f %%, ignored = %.0f %%"
        )
        title_values = (
            self._window,
            self._window*100/len(self),
            self.gc_content()*100,
            (1-self.gc_content())*100,
            self._ignored_nuc*100,
        )
        pylab.suptitle(title_template % title_values, fontsize=fontsize)

        def panel(row, series, fmt, ylabel, title=None, **ylabel_options):
            # Draw one curve on the given row, with optional title.
            axis = axarr[row]
            if title is not None:
                axis.set_title(title)
            axis.plot(series, fmt, alpha=alpha)
            axis.set_ylabel(ylabel, **ylabel_options)

        # GC skew and its cumulative sum
        gc_skew = self._GC_skew_slide[0]
        panel(0, list(gc_skew), 'b-', "(G -C) / (G + C)",
              title="GC skew (blue) - Cumulative sum (red)")
        panel(1, list(np.cumsum(gc_skew)), 'r-', "(G -C) / (G + C)")

        # AT skew and its cumulative sum
        at_skew = self._AT_skew_slide[0]
        panel(2, list(at_skew), 'b-', "(A -T) / (A + T)",
              title="AT skew (blue) - Cumulative sum (red)")
        panel(3, list(np.cumsum(at_skew)), 'r-', "(A -T) / (A + T)",
              rotation=0)

        # Cumulative curves Xn, Yn, Zn
        panel(4, self._Xn, 'g-', "(A + G) - (C + T)",
              title="Cumulative RY skew (Purine - Pyrimidine)")
        panel(5, self._Yn, 'g-', "(A + C) - (G + T)",
              title="Cumulative MK skew (Amino - Keto)")
        panel(6, self._Zn, 'g-', "(A + T) - (G + C)",
              title="Cumulative H-bond skew (Weak H-bond - Strong H-bond)")

        # GC and AT content
        panel(7, list(self._GC_content_slide[0]), 'k-', "GC",
              title="GC content")
        panel(8, list(self._AT_content_slide[0]), 'k-', "AT",
              title="AT content")

        fig.tight_layout()
        # Leave room at the top for the two-line figure title.
        fig.subplots_adjust(top=0.88)
예제 #8
0
    def _compute_skews(self):
        """Compute sliding-window GC/AT content and skew plus cumulative curves.

        The window held in ``self._slide_window`` is moved one nucleotide at
        a time by consuming ``self._seq_right`` until it is empty. G, C, A
        and T counts are updated incrementally rather than recounted.

        On return the following attributes are set:

        * ``_GC_content_slide`` / ``_AT_content_slide``: per-position G+C and
          A+T counts divided by the window size;
        * ``_GC_skew_slide`` / ``_AT_skew_slide``: (G-C)/(G+C) and
          (A-T)/(A+T) per position (entries where the denominator is zero
          keep the value set by ``_init_list_results``);
        * ``_cumul``: cumulative per-nucleotide counts, trimmed to the
          sequence length;
        * ``_Xn``, ``_Yn``, ``_Zn``: (A+G)-(C+T), (A+C)-(G+T) and
          (A+T)-(C+G) built from ``_cumul``;
        * ``_ignored_nuc``: ``1 - GC fraction - AT fraction`` over the
          whole sequence.

        NOTE(review): the shapes of the arrays returned by
        ``_init_list_results`` and created by ``_init_cumul_nuc`` are not
        visible here; the code indexes them as ``[0][i]`` and
        ``[row][column]`` respectively -- confirm against those helpers.
        """
        ### initialisation =  Calculating GC skew and AT skew for first window
        self._init_sliding_window()
        GC_content_slide, GC_skew_slide = self._init_list_results()
        AT_content_slide, AT_skew_slide = self._init_list_results()
        self._init_cumul_nuc()

        # Count the nucleotides of the first window; letters other than
        # G, C, A, T are not tracked.
        c = Counter(self._slide_window)
        dict_counts = {'G': c['G'], 'C': c['C'], 'A': c['A'], 'T': c['T']}
        i = 0

        # GC
        sumGC = float(dict_counts['G'] + dict_counts['C'])
        GC_content_slide[0][i] = sumGC
        if sumGC > 0:
            GC_skew_slide[0][i] = (dict_counts['G'] - dict_counts['C']) / sumGC
        # AT
        sumAT = float(dict_counts['A'] + dict_counts['T'])
        AT_content_slide[0][i] = sumAT
        if sumAT > 0:
            AT_skew_slide[0][i] = (dict_counts['A'] - dict_counts['T']) / sumAT

        ### Compute for all genome
        while (self._seq_right):
            # Slide by one: drop the leftmost nucleotide, take the next one.
            out_nuc = self._slide_window.popleft()
            in_nuc = self._seq_right.popleft()
            self._slide_window.append(in_nuc)

            i += 1

            # Progress message every 500000 positions.
            if i % 500000 == 0:
                logger.info("%d / %d" % (i, self.__len__()))
            # if in and out are the same : do nothing, append same result
            if out_nuc != in_nuc:
                # remove out from counters
                # NOTE(review): assumes every key of self._dict_nuc is also
                # a key of dict_counts (G, C, A, T) -- confirm.
                if out_nuc in self._dict_nuc:
                    dict_counts[out_nuc] -= 1
                if in_nuc in self._dict_nuc:
                    dict_counts[in_nuc] += 1
                sumGC = float(dict_counts['G'] + dict_counts['C'])
                sumAT = float(dict_counts['A'] + dict_counts['T'])

            # fill results
            # GC
            GC_content_slide[0][i] = sumGC
            if sumGC > 0:
                GC_skew_slide[0][i] = (dict_counts['G'] -
                                       dict_counts['C']) / sumGC
            # AT
            AT_content_slide[0][i] = sumAT
            if sumAT > 0:
                AT_skew_slide[0][i] = (dict_counts['A'] -
                                       dict_counts['T']) / sumAT
            # cumul: mark the incoming nucleotide at its position in the
            # sequence (window start i plus window length minus one).
            if in_nuc in self._dict_nuc:
                self._cumul[self._dict_nuc[in_nuc]][i + self._window - 1] += 1

        # Turn raw counts into fractions of the window.
        self._GC_content_slide = GC_content_slide / float(self._window)
        self._AT_content_slide = AT_content_slide / float(self._window)
        # Drop any column beyond the sequence length, then accumulate the
        # per-position marks into running counts.
        self._cumul = np.delete(self._cumul,
                                range(self.__len__(), self._cumul.shape[1]), 1)
        self._cumul = np.cumsum(self._cumul, axis=1)

        ### save result for Z curve
        # Xn = (A + G) - (C + T)
        self._Xn = list((self._cumul[self._dict_nuc['A']] + self._cumul[self._dict_nuc['G']]) -\
         (self._cumul[self._dict_nuc['C']] + self._cumul[self._dict_nuc['T']]))

        # Yn = (A + C) - (G + T)
        self._Yn = list((self._cumul[self._dict_nuc['A']] + self._cumul[self._dict_nuc['C']]) -\
         (self._cumul[self._dict_nuc['G']] + self._cumul[self._dict_nuc['T']]))

        # Zn = (A + T) - (C + G)
        self._Zn = list((self._cumul[self._dict_nuc['A']] + self._cumul[self._dict_nuc['T']]) -\
         (self._cumul[self._dict_nuc['C']] + self._cumul[self._dict_nuc['G']]))

        self._AT_skew_slide = AT_skew_slide
        self._GC_skew_slide = GC_skew_slide

        ### check proportion of ignored nucleotides
        # The last column of the cumulative counts is the total per letter.
        GC_content_total = (self._cumul[self._dict_nuc['G']][-1] +
                            self._cumul[self._dict_nuc['C']][-1]) / float(
                                self.__len__())
        AT_content_total = (self._cumul[self._dict_nuc['A']][-1] +
                            self._cumul[self._dict_nuc['T']][-1]) / float(
                                self.__len__())
        self._ignored_nuc = 1.0 - GC_content_total - AT_content_total
예제 #9
0
파일: sequence.py 프로젝트: sequana/sequana
    def plot_all_skews(self,figsize=(10, 12), fontsize=16, alpha=0.5):
        """Plot the skew, cumulative and content curves on nine stacked axes.

        Rows, from top to bottom: GC skew, cumulative GC skew, AT skew,
        cumulative AT skew, ``_Xn``, ``_Yn``, ``_Zn``, GC content and AT
        content. The x-axis is shared between all rows.

        :param figsize: figure size forwarded to ``pylab.subplots``.
        :param fontsize: font size of the figure title.
        :param alpha: transparency applied to every curve.
        :raises AttributeError: if no window has been set.
        """
        if self._window is None:
            raise AttributeError("Please set a valid window to compute skew")

        # Nine rows; the former tenth row (FFT) is no longer drawn.
        fig, axarr = pylab.subplots(9,1, sharex=True, figsize=figsize)
        (ax_gc_skew, ax_gc_cumul, ax_at_skew, ax_at_cumul,
         ax_xn, ax_yn, ax_zn, ax_gc, ax_at) = axarr

        # Figure title summarising the window and the base composition.
        title_format = "Window size = %d (%.0f %% of genome )\n\
        GC content = %.0f %%, AT content = %.0f %%, ignored = %.0f %%"
        window_percent = self._window*100/len(self)
        gc_percent = self.gc_content()*100
        at_percent = (1-self.gc_content())*100
        ignored_percent = self._ignored_nuc*100
        main_title = title_format % (self._window, window_percent,
                                     gc_percent, at_percent, ignored_percent)
        pylab.suptitle(main_title, fontsize=fontsize)

        line_options = dict(alpha=alpha)
        gc_skew_label = "(G -C) / (G + C)"
        at_skew_label = "(A -T) / (A + T)"

        # GC skew
        ax_gc_skew.set_title("GC skew (blue) - Cumulative sum (red)")
        ax_gc_skew.plot(list(self._GC_skew_slide[0]), 'b-', **line_options)
        ax_gc_skew.set_ylabel(gc_skew_label)

        ax_gc_cumul.plot(list(np.cumsum(self._GC_skew_slide[0])), 'r-',
                         **line_options)
        ax_gc_cumul.set_ylabel(gc_skew_label)

        # AT skew
        ax_at_skew.set_title("AT skew (blue) - Cumulative sum (red)")
        ax_at_skew.plot(list(self._AT_skew_slide[0]), 'b-', **line_options)
        ax_at_skew.set_ylabel(at_skew_label)

        ax_at_cumul.plot(list(np.cumsum(self._AT_skew_slide[0])), 'r-',
                         **line_options)
        ax_at_cumul.set_ylabel(at_skew_label, rotation=0)

        # Xn
        ax_xn.set_title("Cumulative RY skew (Purine - Pyrimidine)")
        ax_xn.plot(self._Xn, 'g-', **line_options)
        ax_xn.set_ylabel("(A + G) - (C + T)")

        # Yn
        ax_yn.set_title("Cumulative MK skew (Amino - Keto)")
        ax_yn.plot(self._Yn, 'g-', **line_options)
        ax_yn.set_ylabel("(A + C) - (G + T)")

        # Zn
        ax_zn.set_title("Cumulative H-bond skew (Weak H-bond - Strong H-bond)")
        ax_zn.plot(self._Zn, 'g-', **line_options)
        ax_zn.set_ylabel("(A + T) - (G + C)")

        # GC content
        ax_gc.set_title("GC content")
        ax_gc.plot(list(self._GC_content_slide[0]), 'k-', **line_options)
        ax_gc.set_ylabel("GC")

        # AT content
        ax_at.set_title("AT content")
        ax_at.plot(list(self._AT_content_slide[0]), 'k-', **line_options)
        ax_at.set_ylabel("AT")

        fig.tight_layout()
        # Keep some space above the axes for the figure title.
        fig.subplots_adjust(top=0.88)
예제 #10
0
파일: sequence.py 프로젝트: sequana/sequana
    def _compute_skews(self):
        """Slide the window along the sequence and derive skews and content.

        Starting from the window prepared by ``_init_sliding_window``, one
        nucleotide is moved from ``self._seq_right`` into
        ``self._slide_window`` per iteration (and one is dropped on the
        left) until ``self._seq_right`` is empty. Counts of G, C, A and T
        are adjusted only when the outgoing and incoming letters differ.

        Attributes set by this method:

        * ``_GC_content_slide``, ``_AT_content_slide``: G+C and A+T counts
          per window position, divided by ``self._window``;
        * ``_GC_skew_slide``, ``_AT_skew_slide``: (G-C)/(G+C) and
          (A-T)/(A+T) per window position; a position whose denominator is
          zero keeps the initial value from ``_init_list_results``;
        * ``_cumul``: columns past the sequence length removed, then
          cumulated along axis 1;
        * ``_Xn``, ``_Yn``, ``_Zn``: lists computed from ``_cumul`` as
          (A+G)-(C+T), (A+C)-(G+T) and (A+T)-(C+G);
        * ``_ignored_nuc``: 1 minus the G+C and A+T fractions of the
          sequence.

        NOTE(review): ``_init_list_results`` and ``_init_cumul_nuc`` are
        defined elsewhere; the array layouts assumed here (``[0][i]`` for
        results, ``[row][position]`` for ``_cumul``) should be checked
        against them.
        """
        ### initialisation =  Calculating GC skew and AT skew for first window
        self._init_sliding_window()
        GC_content_slide, GC_skew_slide = self._init_list_results()
        AT_content_slide, AT_skew_slide = self._init_list_results()
        self._init_cumul_nuc()

        # Initial counts for the first window; only G, C, A, T are kept.
        c = Counter(self._slide_window)
        dict_counts = {'G' : c['G'], 'C' : c['C'], 'A' : c['A'], 'T' : c['T']}
        i = 0

        # GC
        sumGC = float(dict_counts['G'] + dict_counts['C'])
        GC_content_slide[0][i] = sumGC
        if sumGC > 0:
            GC_skew_slide[0][i] = (dict_counts['G'] - dict_counts['C']) / sumGC
        # AT
        sumAT = float(dict_counts['A'] + dict_counts['T'])
        AT_content_slide[0][i] = sumAT
        if sumAT > 0:
            AT_skew_slide[0][i] = (dict_counts['A'] - dict_counts['T']) / sumAT

        ### Compute for all genome
        while(self._seq_right):
            # Advance the window by one nucleotide.
            out_nuc = self._slide_window.popleft()
            in_nuc = self._seq_right.popleft()
            self._slide_window.append(in_nuc)

            i += 1

            # Log the progress every 500000 positions.
            if i % 500000 == 0:
                logger.info("%d / %d" % (i, self.__len__()))
            # if in and out are the same : do nothing, append same result
            if out_nuc != in_nuc:
                # remove out from counters
                # NOTE(review): relies on the keys of self._dict_nuc being
                # present in dict_counts (G, C, A, T) -- confirm.
                if out_nuc in self._dict_nuc:
                    dict_counts[out_nuc] -= 1
                if in_nuc in self._dict_nuc:
                    dict_counts[in_nuc] += 1
                sumGC = float(dict_counts['G'] + dict_counts['C'])
                sumAT = float(dict_counts['A'] + dict_counts['T'])

            # fill results
            # GC
            GC_content_slide[0][i] = sumGC
            if sumGC > 0:
                GC_skew_slide[0][i] = (dict_counts['G'] - dict_counts['C'])/sumGC
            # AT
            AT_content_slide[0][i] = sumAT
            if sumAT > 0:
                AT_skew_slide[0][i] = (dict_counts['A'] - dict_counts['T'])/sumAT
            # cumul: flag the incoming nucleotide at its sequence position,
            # i.e. the last position covered by the current window.
            if in_nuc in self._dict_nuc:
                self._cumul[self._dict_nuc[in_nuc]][i+self._window-1] +=1

        # Normalise the counts by the window size.
        self._GC_content_slide = GC_content_slide/float(self._window)
        self._AT_content_slide = AT_content_slide/float(self._window)
        # Remove columns past the sequence length, then build running counts.
        self._cumul = np.delete(self._cumul, range(self.__len__(),self._cumul.shape[1]),1)
        self._cumul = np.cumsum(self._cumul,axis=1)

        ### save result for Z curve
        # Xn = (A + G) - (C + T)
        self._Xn = list((self._cumul[self._dict_nuc['A']] + self._cumul[self._dict_nuc['G']]) -\
         (self._cumul[self._dict_nuc['C']] + self._cumul[self._dict_nuc['T']]))

        # Yn = (A + C) - (G + T)
        self._Yn = list((self._cumul[self._dict_nuc['A']] + self._cumul[self._dict_nuc['C']]) -\
         (self._cumul[self._dict_nuc['G']] + self._cumul[self._dict_nuc['T']]))

        # Zn = (A + T) - (C + G)
        self._Zn = list((self._cumul[self._dict_nuc['A']] + self._cumul[self._dict_nuc['T']]) -\
         (self._cumul[self._dict_nuc['C']] + self._cumul[self._dict_nuc['G']]))

        self._AT_skew_slide = AT_skew_slide
        self._GC_skew_slide = GC_skew_slide

        ### check proportion of ignored nucleotides
        # Totals per letter are read from the last cumulative column.
        GC_content_total = (self._cumul[self._dict_nuc['G']][-1] +
            self._cumul[self._dict_nuc['C']][-1]) / float(self.__len__())
        AT_content_total = (self._cumul[self._dict_nuc['A']][-1] +
            self._cumul[self._dict_nuc['T']][-1]) / float(self.__len__())
        self._ignored_nuc = 1.0 - GC_content_total - AT_content_total