def _compute_skews(self):
    """Compute sliding-window GC/AT content and skew, plus Z-curve series.

    The window is moved one nucleotide at a time: one symbol is popped from
    the left of ``self._slide_window``, one is taken from the left of
    ``self._seq_right`` and appended to the window, and running counts of
    G, C, A and T are updated accordingly.

    Results are stored on the instance:

    * ``_GC_content_slide`` / ``_AT_content_slide``: per-window G+C and A+T
      counts divided by ``self._window``.
    * ``_GC_skew_slide`` / ``_AT_skew_slide``: ``(G - C) / (G + C)`` and
      ``(A - T) / (A + T)`` per window; an entry whose denominator is zero
      is not written and keeps the value given by ``_init_list_results``.
    * ``_cumul``: trimmed to ``len(self)`` columns, then replaced by its
      cumulative sum along axis 1.
    * ``_Xn``, ``_Yn``, ``_Zn``: lists built from the cumulative rows as
      (A+G)-(C+T), (A+C)-(G+T) and (A+T)-(C+G).
    * ``_ignored_nuc``: 1 minus the final cumulative G+C and A+T counts,
      each divided by ``len(self)``.

    NOTE(review): the window deque, the result arrays and the cumulative
    table are created by ``_init_sliding_window``, ``_init_list_results``
    and ``_init_cumul_nuc``, which are not visible here; their shapes are
    assumed to match the indexing used below -- confirm against them.
    """
    ### set-up: window, result arrays and cumulative table
    self._init_sliding_window()
    gc_content, gc_skew = self._init_list_results()
    at_content, at_skew = self._init_list_results()
    self._init_cumul_nuc()

    # The objects below are only mutated in place inside the loop, so
    # local aliases refer to the same data as the attributes.
    window = self._slide_window
    remaining = self._seq_right
    nuc_row = self._dict_nuc
    cumul = self._cumul
    window_len = self._window

    ### first window
    first_counts = Counter(window)
    counts = {base: first_counts[base] for base in ('G', 'C', 'A', 'T')}
    pos = 0

    gc_total = float(counts['G'] + counts['C'])
    at_total = float(counts['A'] + counts['T'])

    gc_content[0][pos] = gc_total
    if gc_total > 0:
        gc_skew[0][pos] = (counts['G'] - counts['C']) / gc_total
    at_content[0][pos] = at_total
    if at_total > 0:
        at_skew[0][pos] = (counts['A'] - counts['T']) / at_total

    ### slide across the rest of the sequence
    while remaining:
        leaving = window.popleft()
        entering = remaining.popleft()
        window.append(entering)
        pos += 1

        if pos % 500000 == 0:
            logger.info("%d / %d" % (pos, len(self)))

        # Identical symbols entering and leaving keep every count (and
        # therefore both totals) unchanged, so the update is skipped.
        if leaving != entering:
            if leaving in nuc_row:
                counts[leaving] -= 1
            if entering in nuc_row:
                counts[entering] += 1
            gc_total = float(counts['G'] + counts['C'])
            at_total = float(counts['A'] + counts['T'])

        # Record the current window, whether or not the counts changed.
        gc_content[0][pos] = gc_total
        if gc_total > 0:
            gc_skew[0][pos] = (counts['G'] - counts['C']) / gc_total
        at_content[0][pos] = at_total
        if at_total > 0:
            at_skew[0][pos] = (counts['A'] - counts['T']) / at_total

        # Flag the entering nucleotide at its column in the cumulative table.
        if entering in nuc_row:
            cumul[nuc_row[entering]][pos + window_len - 1] += 1

    ### turn the raw window counts into fractions of the window size
    self._GC_content_slide = gc_content / float(window_len)
    self._AT_content_slide = at_content / float(window_len)

    ### trim the table to the sequence length, then accumulate per row
    seq_len = len(self)
    self._cumul = np.delete(
        self._cumul, range(seq_len, self._cumul.shape[1]), axis=1)
    self._cumul = np.cumsum(self._cumul, axis=1)

    ### save result for Z curve
    cum_a = self._cumul[nuc_row['A']]
    cum_c = self._cumul[nuc_row['C']]
    cum_g = self._cumul[nuc_row['G']]
    cum_t = self._cumul[nuc_row['T']]

    self._Xn = list((cum_a + cum_g) - (cum_c + cum_t))
    self._Yn = list((cum_a + cum_c) - (cum_g + cum_t))
    self._Zn = list((cum_a + cum_t) - (cum_c + cum_g))

    self._AT_skew_slide = at_skew
    self._GC_skew_slide = gc_skew

    ### check proportion of ignored nucleotides
    gc_fraction = (cum_g[-1] + cum_c[-1]) / float(seq_len)
    at_fraction = (cum_a[-1] + cum_t[-1]) / float(seq_len)
    self._ignored_nuc = 1.0 - gc_fraction - at_fraction
def _compute_skews(self):
    """Compute sliding-window GC/AT content and skew, plus Z-curve series.

    The window is moved one nucleotide at a time: one symbol is popped from
    the left of ``self._slide_window``, one is taken from the left of
    ``self._seq_right`` and appended to the window, and running counts of
    G, C, A and T are updated accordingly.

    Results are stored on the instance:

    * ``_GC_content_slide`` / ``_AT_content_slide``: per-window G+C and A+T
      counts divided by ``self._window``.
    * ``_GC_skew_slide`` / ``_AT_skew_slide``: ``(G - C) / (G + C)`` and
      ``(A - T) / (A + T)`` per window; an entry whose denominator is zero
      is not written and keeps the value given by ``_init_list_results``.
    * ``_cumul``: trimmed to ``len(self)`` columns, then replaced by its
      cumulative sum along axis 1.
    * ``_Xn``, ``_Yn``, ``_Zn``: lists built from the cumulative rows as
      (A+G)-(C+T), (A+C)-(G+T) and (A+T)-(C+G).
    * ``_ignored_nuc``: 1 minus the final cumulative G+C and A+T counts,
      each divided by ``len(self)``.

    NOTE(review): the window deque, the result arrays and the cumulative
    table are created by ``_init_sliding_window``, ``_init_list_results``
    and ``_init_cumul_nuc``, which are not visible here; their shapes are
    assumed to match the indexing used below -- confirm against them.
    """
    ### set-up: window, result arrays and cumulative table
    self._init_sliding_window()
    gc_content, gc_skew = self._init_list_results()
    at_content, at_skew = self._init_list_results()
    self._init_cumul_nuc()

    # The objects below are only mutated in place inside the loop, so
    # local aliases refer to the same data as the attributes.
    window = self._slide_window
    remaining = self._seq_right
    nuc_row = self._dict_nuc
    cumul = self._cumul
    window_len = self._window

    ### first window
    first_counts = Counter(window)
    counts = {base: first_counts[base] for base in ('G', 'C', 'A', 'T')}
    pos = 0

    gc_total = float(counts['G'] + counts['C'])
    at_total = float(counts['A'] + counts['T'])

    gc_content[0][pos] = gc_total
    if gc_total > 0:
        gc_skew[0][pos] = (counts['G'] - counts['C']) / gc_total
    at_content[0][pos] = at_total
    if at_total > 0:
        at_skew[0][pos] = (counts['A'] - counts['T']) / at_total

    ### slide across the rest of the sequence
    while remaining:
        leaving = window.popleft()
        entering = remaining.popleft()
        window.append(entering)
        pos += 1

        if pos % 500000 == 0:
            logger.info("%d / %d" % (pos, len(self)))

        # Identical symbols entering and leaving keep every count (and
        # therefore both totals) unchanged, so the update is skipped.
        if leaving != entering:
            if leaving in nuc_row:
                counts[leaving] -= 1
            if entering in nuc_row:
                counts[entering] += 1
            gc_total = float(counts['G'] + counts['C'])
            at_total = float(counts['A'] + counts['T'])

        # Record the current window, whether or not the counts changed.
        gc_content[0][pos] = gc_total
        if gc_total > 0:
            gc_skew[0][pos] = (counts['G'] - counts['C']) / gc_total
        at_content[0][pos] = at_total
        if at_total > 0:
            at_skew[0][pos] = (counts['A'] - counts['T']) / at_total

        # Flag the entering nucleotide at its column in the cumulative table.
        if entering in nuc_row:
            cumul[nuc_row[entering]][pos + window_len - 1] += 1

    ### turn the raw window counts into fractions of the window size
    self._GC_content_slide = gc_content / float(window_len)
    self._AT_content_slide = at_content / float(window_len)

    ### trim the table to the sequence length, then accumulate per row
    seq_len = len(self)
    self._cumul = np.delete(
        self._cumul, range(seq_len, self._cumul.shape[1]), axis=1)
    self._cumul = np.cumsum(self._cumul, axis=1)

    ### save result for Z curve
    cum_a = self._cumul[nuc_row['A']]
    cum_c = self._cumul[nuc_row['C']]
    cum_g = self._cumul[nuc_row['G']]
    cum_t = self._cumul[nuc_row['T']]

    self._Xn = list((cum_a + cum_g) - (cum_c + cum_t))
    self._Yn = list((cum_a + cum_c) - (cum_g + cum_t))
    self._Zn = list((cum_a + cum_t) - (cum_c + cum_g))

    self._AT_skew_slide = at_skew
    self._GC_skew_slide = gc_skew

    ### check proportion of ignored nucleotides
    gc_fraction = (cum_g[-1] + cum_c[-1]) / float(seq_len)
    at_fraction = (cum_a[-1] + cum_t[-1]) / float(seq_len)
    self._ignored_nuc = 1.0 - gc_fraction - at_fraction