def normalize(trackList, method='total', field='score'):
    """Normalizes the scores in every stream from *trackList* using the given *method*.
    It assumes that each of the streams represents the same features, i.e. the n-th element
    of one stream corresponds to the n-th element of another.

    [!] This function will temporarily store everything in memory.

    :param trackList: FeatureStream, or list of FeatureStream objects.
    :param method: normalization method, passed unchanged to ``common.normalize``:

        * ``'total'`` divides every score vector by its sum (total number of reads) x 10^7 .
        * ``'deseq'`` applies DESeq's normalization ("size factors") - considering every track
          as belonging to a different group.
        * ``'quantile'`` applies quantile normalization.
    :param field: (str) name of the field containing the scores (must be the same for all streams).
    :returns: a single FeatureStream if exactly one stream was given, otherwise a list of
        FeatureStream objects in the same order as *trackList* (an empty list if *trackList*
        is empty).
    :raises AssertionError: if the streams do not all have the same number of elements.
    :raises ValueError: if *field* is not among the fields of one of the streams.
    """
    if not isinstance(trackList, (list, tuple)):
        trackList = [trackList]
    # Nothing to normalize: avoid indexing into an empty list below.
    if not trackList:
        return []
    # Streams can only be read once, so materialize all of them up front.
    allcontents = [list(t) for t in trackList]
    ncols = len(trackList)
    nlines = len(allcontents[0])
    assert all(len(content) == nlines for content in allcontents), \
        "All streams must have the same number of elements."
    # Position of the score field in each stream (fields may be ordered differently).
    indices = [t.fields.index(field) for t in trackList]
    # Build the matrix: one row per stream, one column per feature
    allscores = zeros((ncols, nlines))
    for n, (content, idx) in enumerate(zip(allcontents, indices)):
        allscores[n] = asarray([x[idx] for x in content])
    # Normalize
    allscores = common.normalize(asarray(allscores), method)
    # Reinsert the new scores in the respective tracks
    for n, (content, idx) in enumerate(zip(allcontents, indices)):
        for k, x in enumerate(content):
            content[k] = x[:idx] + (allscores[n][k],) + x[idx + 1:]
    res = [FeatureStream(content, fields=trackList[n].fields)
           for n, content in enumerate(allcontents)]
    if len(trackList) == 1:
        return res[0]
    return res
def __call__(self, **kw):
    """Normalize a whitespace-delimited count table and register the result.

    The file given by ``kw['table']`` is read as follows: the first line is a
    header that is copied verbatim to the output; on every following line the
    first token is the row identifier and the remaining tokens are integer
    counts. The counts are transposed, passed to ``common.normalize`` together
    with ``kw.get('method')``, and written back (one line per row identifier,
    tab-separated, values formatted with ``"%.2g"``) to a temporary
    ``output.tab`` file, which is registered through ``self.new_file`` under
    the name ``'normalized'``.

    :param kw: keyword arguments; ``'table'`` (path of the input file) and
        ``'method'`` (normalization method) are read.
    :returns: the value of ``self.display_time()``.
    :raises AssertionError: if the input file does not exist.
    """
    filename = kw.get('table')
    assert os.path.exists(str(filename)), "File not found: '%s'" % filename
    ids = []
    matrix = []
    # Context manager guarantees the input handle is released even on error.
    with open(filename, 'r') as table:
        header = table.readline()
        for line in table:
            tokens = line.split()
            if not tokens:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            ids.append(tokens[0])
            # A list (not a lazy ``map``) so that ``asarray`` builds a numeric
            # matrix regardless of the Python version.
            matrix.append([int(value) for value in tokens[1:]])
    norm = common.normalize(asarray(matrix).transpose(), kw.get('method'))
    output = self.temporary_path(fname='output.tab')
    # The output must be closed (flushed) before it is registered below.
    with open(output, "w") as out:
        out.write(header)
        # Transpose back so that each iteration yields one input row.
        for row_id, values in zip(ids, norm.transpose()):
            out.write(str(row_id) + "\t"
                      + "\t".join("%.2g" % x for x in values) + "\n")
    self.new_file(output, 'normalized')
    return self.display_time()
def normalize(trackList, method="total", field="score"):
    """Normalizes the scores in every stream from *trackList* using the given *method*.
    It assumes that each of the streams represents the same features, i.e. the n-th element
    of one stream corresponds to the n-th element of another.

    [!] This function will temporarily store everything in memory.

    :param trackList: FeatureStream, or list of FeatureStream objects.
    :param method: normalization method, passed unchanged to ``common.normalize``:

        * ``'total'`` divides every score vector by its sum (total number of reads) x 10^7 .
        * ``'deseq'`` applies DESeq's normalization ("size factors") - considering every track
          as belonging to a different group.
        * ``'quantile'`` applies quantile normalization.
    :param field: (str) name of the field containing the scores (must be the same for all streams).
    :returns: a single FeatureStream if exactly one stream was given, otherwise a list of
        FeatureStream objects in the same order as *trackList* (an empty list if *trackList*
        is empty).
    :raises AssertionError: if the streams do not all have the same number of elements.
    :raises ValueError: if *field* is not among the fields of one of the streams.
    """
    if not isinstance(trackList, (list, tuple)):
        trackList = [trackList]
    # Nothing to normalize: avoid indexing into an empty list below.
    if not trackList:
        return []
    # Streams can only be read once, so materialize all of them up front.
    allcontents = [list(t) for t in trackList]
    ncols = len(trackList)
    nlines = len(allcontents[0])
    assert all(len(content) == nlines for content in allcontents), \
        "All streams must have the same number of elements."
    # Position of the score field in each stream (fields may be ordered differently).
    indices = [t.fields.index(field) for t in trackList]
    # Build the matrix: one row per stream, one column per feature
    allscores = zeros((ncols, nlines))
    for n, (content, idx) in enumerate(zip(allcontents, indices)):
        allscores[n] = asarray([x[idx] for x in content])
    # Normalize
    allscores = common.normalize(asarray(allscores), method)
    # Reinsert the new scores in the respective tracks
    for n, (content, idx) in enumerate(zip(allcontents, indices)):
        for k, x in enumerate(content):
            content[k] = x[:idx] + (allscores[n][k],) + x[idx + 1:]
    res = [FeatureStream(content, fields=trackList[n].fields)
           for n, content in enumerate(allcontents)]
    if len(trackList) == 1:
        return res[0]
    return res
def __call__(self, **kw):
    """Normalize a whitespace-delimited count table and register the result.

    The file given by ``kw['table']`` is read as follows: the first line is a
    header that is copied verbatim to the output; on every following line the
    first token is the row identifier and the remaining tokens are integer
    counts. The counts are transposed, passed to ``common.normalize`` together
    with ``kw.get('method')``, and written back (one line per row identifier,
    tab-separated, values formatted with ``"%.2g"``) to a temporary
    ``output.tab`` file, which is registered through ``self.new_file`` under
    the name ``'normalized'``.

    :param kw: keyword arguments; ``'table'`` (path of the input file) and
        ``'method'`` (normalization method) are read.
    :returns: the value of ``self.display_time()``.
    :raises AssertionError: if the input file does not exist.
    """
    filename = kw.get('table')
    assert os.path.exists(str(filename)), "File not found: '%s'" % filename
    ids = []
    matrix = []
    # Context manager guarantees the input handle is released even on error.
    with open(filename, 'r') as table:
        header = table.readline()
        for line in table:
            tokens = line.split()
            if not tokens:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            ids.append(tokens[0])
            # A list (not a lazy ``map``) so that ``asarray`` builds a numeric
            # matrix regardless of the Python version.
            matrix.append([int(value) for value in tokens[1:]])
    norm = common.normalize(asarray(matrix).transpose(), kw.get('method'))
    output = self.temporary_path(fname='output.tab')
    # The output must be closed (flushed) before it is registered below.
    with open(output, "w") as out:
        out.write(header)
        # Transpose back so that each iteration yields one input row.
        for row_id, values in zip(ids, norm.transpose()):
            out.write(str(row_id) + "\t"
                      + "\t".join("%.2g" % x for x in values) + "\n")
    self.new_file(output, 'normalized')
    return self.display_time()