示例#1
0
    def _compute_skews(self):
        """Compute sliding-window GC/AT content and skew, plus Z-curve data.

        The first window is scored from a nucleotide count; the window is
        then shifted one nucleotide at a time until ``self._seq_right`` is
        exhausted, updating the counts incrementally instead of recounting.

        Results are stored on the instance:

        * ``_GC_content_slide`` / ``_AT_content_slide``: per-window G+C and
          A+T counts divided by ``self._window``.
        * ``_GC_skew_slide`` / ``_AT_skew_slide``: (G - C) / (G + C) and
          (A - T) / (A + T); an entry is left as initialised by
          ``_init_list_results`` when its denominator is zero.
        * ``_cumul``: per-nucleotide counts, restricted to the first
          ``len(self)`` columns and accumulated along the sequence.
        * ``_Xn``, ``_Yn``, ``_Zn``: Z-curve components derived from
          ``_cumul``.
        * ``_ignored_nuc``: 1 minus the final cumulative G+C and A+T
          fractions.

        Nucleotides that are not keys of ``self._dict_nuc`` do not change
        any counter.
        """
        ### initialisation =  Calculating GC skew and AT skew for first window
        self._init_sliding_window()
        GC_content_slide, GC_skew_slide = self._init_list_results()
        AT_content_slide, AT_skew_slide = self._init_list_results()
        self._init_cumul_nuc()

        # Bind loop invariants once; the loop below runs once per nucleotide.
        slide_window = self._slide_window
        seq_right = self._seq_right
        nuc_index = self._dict_nuc
        cumul = self._cumul
        # the nucleotide entering window ``i`` sits at column ``i + cumul_shift``
        cumul_shift = self._window - 1

        c = Counter(slide_window)
        dict_counts = {'G': c['G'], 'C': c['C'], 'A': c['A'], 'T': c['T']}
        i = 0

        # GC
        sumGC = float(dict_counts['G'] + dict_counts['C'])
        GC_content_slide[0][i] = sumGC
        if sumGC > 0:
            GC_skew_slide[0][i] = (dict_counts['G'] - dict_counts['C']) / sumGC
        # AT
        sumAT = float(dict_counts['A'] + dict_counts['T'])
        AT_content_slide[0][i] = sumAT
        if sumAT > 0:
            AT_skew_slide[0][i] = (dict_counts['A'] - dict_counts['T']) / sumAT

        ### Compute for all genome
        while seq_right:
            out_nuc = slide_window.popleft()
            in_nuc = seq_right.popleft()
            slide_window.append(in_nuc)

            i += 1

            if i % 500000 == 0:
                # lazy arguments: the message is only built if it is emitted
                logger.info("%d / %d", i, len(self))
            # counts (and so the sums) only change when the nucleotide
            # leaving the window differs from the one entering it
            if out_nuc != in_nuc:
                if out_nuc in nuc_index:
                    dict_counts[out_nuc] -= 1
                if in_nuc in nuc_index:
                    dict_counts[in_nuc] += 1
                sumGC = float(dict_counts['G'] + dict_counts['C'])
                sumAT = float(dict_counts['A'] + dict_counts['T'])

            # fill results
            # GC
            GC_content_slide[0][i] = sumGC
            if sumGC > 0:
                GC_skew_slide[0][i] = (dict_counts['G'] -
                                       dict_counts['C']) / sumGC
            # AT
            AT_content_slide[0][i] = sumAT
            if sumAT > 0:
                AT_skew_slide[0][i] = (dict_counts['A'] -
                                       dict_counts['T']) / sumAT
            # cumul
            if in_nuc in nuc_index:
                cumul[nuc_index[in_nuc]][i + cumul_shift] += 1

        seq_len = len(self)
        window_size = float(self._window)
        self._GC_content_slide = GC_content_slide / window_size
        self._AT_content_slide = AT_content_slide / window_size
        # drop the columns past the sequence length, then accumulate
        self._cumul = np.cumsum(cumul[:, :seq_len], axis=1)

        ### save result for Z curve
        cum_a = self._cumul[nuc_index['A']]
        cum_c = self._cumul[nuc_index['C']]
        cum_g = self._cumul[nuc_index['G']]
        cum_t = self._cumul[nuc_index['T']]

        self._Xn = list((cum_a + cum_g) - (cum_c + cum_t))
        self._Yn = list((cum_a + cum_c) - (cum_g + cum_t))
        self._Zn = list((cum_a + cum_t) - (cum_c + cum_g))

        self._AT_skew_slide = AT_skew_slide
        self._GC_skew_slide = GC_skew_slide

        ### check proportion of ignored nucleotides
        GC_content_total = (cum_g[-1] + cum_c[-1]) / float(seq_len)
        AT_content_total = (cum_a[-1] + cum_t[-1]) / float(seq_len)
        self._ignored_nuc = 1.0 - GC_content_total - AT_content_total
示例#2
0
    def _compute_skews(self):
        """Compute sliding-window GC/AT content and skew, plus Z-curve data.

        The first window is scored from a nucleotide count; the window is
        then shifted one nucleotide at a time until ``self._seq_right`` is
        exhausted, updating the counts incrementally instead of recounting.

        Results are stored on the instance:

        * ``_GC_content_slide`` / ``_AT_content_slide``: per-window G+C and
          A+T counts divided by ``self._window``.
        * ``_GC_skew_slide`` / ``_AT_skew_slide``: (G - C) / (G + C) and
          (A - T) / (A + T); an entry is left as initialised by
          ``_init_list_results`` when its denominator is zero.
        * ``_cumul``: per-nucleotide counts, restricted to the first
          ``len(self)`` columns and accumulated along the sequence.
        * ``_Xn``, ``_Yn``, ``_Zn``: Z-curve components derived from
          ``_cumul``.
        * ``_ignored_nuc``: 1 minus the final cumulative G+C and A+T
          fractions.

        Nucleotides that are not keys of ``self._dict_nuc`` do not change
        any counter.
        """
        ### initialisation =  Calculating GC skew and AT skew for first window
        self._init_sliding_window()
        GC_content_slide, GC_skew_slide = self._init_list_results()
        AT_content_slide, AT_skew_slide = self._init_list_results()
        self._init_cumul_nuc()

        # Bind loop invariants once; the loop below runs once per nucleotide.
        slide_window = self._slide_window
        seq_right = self._seq_right
        nuc_index = self._dict_nuc
        cumul = self._cumul
        # the nucleotide entering window ``i`` sits at column ``i + cumul_shift``
        cumul_shift = self._window - 1

        c = Counter(slide_window)
        dict_counts = {'G': c['G'], 'C': c['C'], 'A': c['A'], 'T': c['T']}
        i = 0

        # GC
        sumGC = float(dict_counts['G'] + dict_counts['C'])
        GC_content_slide[0][i] = sumGC
        if sumGC > 0:
            GC_skew_slide[0][i] = (dict_counts['G'] - dict_counts['C']) / sumGC
        # AT
        sumAT = float(dict_counts['A'] + dict_counts['T'])
        AT_content_slide[0][i] = sumAT
        if sumAT > 0:
            AT_skew_slide[0][i] = (dict_counts['A'] - dict_counts['T']) / sumAT

        ### Compute for all genome
        while seq_right:
            out_nuc = slide_window.popleft()
            in_nuc = seq_right.popleft()
            slide_window.append(in_nuc)

            i += 1

            if i % 500000 == 0:
                # lazy arguments: the message is only built if it is emitted
                logger.info("%d / %d", i, len(self))
            # counts (and so the sums) only change when the nucleotide
            # leaving the window differs from the one entering it
            if out_nuc != in_nuc:
                if out_nuc in nuc_index:
                    dict_counts[out_nuc] -= 1
                if in_nuc in nuc_index:
                    dict_counts[in_nuc] += 1
                sumGC = float(dict_counts['G'] + dict_counts['C'])
                sumAT = float(dict_counts['A'] + dict_counts['T'])

            # fill results
            # GC
            GC_content_slide[0][i] = sumGC
            if sumGC > 0:
                GC_skew_slide[0][i] = (dict_counts['G'] -
                                       dict_counts['C']) / sumGC
            # AT
            AT_content_slide[0][i] = sumAT
            if sumAT > 0:
                AT_skew_slide[0][i] = (dict_counts['A'] -
                                       dict_counts['T']) / sumAT
            # cumul
            if in_nuc in nuc_index:
                cumul[nuc_index[in_nuc]][i + cumul_shift] += 1

        seq_len = len(self)
        window_size = float(self._window)
        self._GC_content_slide = GC_content_slide / window_size
        self._AT_content_slide = AT_content_slide / window_size
        # drop the columns past the sequence length, then accumulate
        self._cumul = np.cumsum(cumul[:, :seq_len], axis=1)

        ### save result for Z curve
        cum_a = self._cumul[nuc_index['A']]
        cum_c = self._cumul[nuc_index['C']]
        cum_g = self._cumul[nuc_index['G']]
        cum_t = self._cumul[nuc_index['T']]

        self._Xn = list((cum_a + cum_g) - (cum_c + cum_t))
        self._Yn = list((cum_a + cum_c) - (cum_g + cum_t))
        self._Zn = list((cum_a + cum_t) - (cum_c + cum_g))

        self._AT_skew_slide = AT_skew_slide
        self._GC_skew_slide = GC_skew_slide

        ### check proportion of ignored nucleotides
        GC_content_total = (cum_g[-1] + cum_c[-1]) / float(seq_len)
        AT_content_total = (cum_a[-1] + cum_t[-1]) / float(seq_len)
        self._ignored_nuc = 1.0 - GC_content_total - AT_content_total