Exemplo n.º 1
0
def normalize(trackList, method='total', field='score'):
    """Normalizes the scores in every stream from *trackList* using the given *method*.
    It assumes that each of the streams represents the same features, i.e. the n-th element
    of one stream corresponds to the n-th element of another.

    [!] This function will temporarily store everything in memory.

    :param trackList: FeatureStream, or list of FeatureStream objects. The items of each
        stream are expected to be tuples (the new score is spliced in by tuple concatenation).
    :param method: normalization method, passed on to ``common.normalize``:
        * ``'total'`` divides every score vector by its sum (total number of reads) x 10^7 .
        * ``'deseq'`` applies DESeq's normalization ("size factors") - considering every track
            as belonging to a different group.
        * ``'quantile'`` applies quantile normalization.
    :param field: (str) name of the field containing the scores (must be the same for all streams).
    :returns: a single FeatureStream if exactly one stream was given (alone or inside a
        one-element list), a list of FeatureStream objects otherwise, or an empty list
        if *trackList* is empty.
    :raises AssertionError: if the streams do not all have the same number of elements.
    :raises ValueError: if *field* is not among the fields of one of the streams.
    """
    if not isinstance(trackList, (list, tuple)):
        trackList = [trackList]
    # Nothing to normalize; also avoids indexing into an empty list below.
    if not trackList:
        return []
    allcontents = [list(t) for t in trackList]
    nlines = len(allcontents[0])
    assert all(
        len(content) == nlines for content in
        allcontents), "All streams must have the same number of elements."
    # Position of the score field in each stream, looked up once and reused
    # both for reading the scores and for writing them back.
    score_idx = [t.fields.index(field) for t in trackList]
    # Build the matrix: one row per stream, one column per feature
    allscores = zeros((len(trackList), nlines))
    for n, (content, idx) in enumerate(zip(allcontents, score_idx)):
        allscores[n] = asarray([x[idx] for x in content])
    # Normalize
    allscores = common.normalize(allscores, method)
    # Reinsert the new scores in the respective tracks
    for n, (content, idx) in enumerate(zip(allcontents, score_idx)):
        for k, x in enumerate(content):
            content[k] = x[:idx] + (allscores[n][k], ) + x[idx + 1:]
    res = [
        FeatureStream(content, fields=t.fields)
        for content, t in zip(allcontents, trackList)
    ]
    if len(res) == 1:
        return res[0]
    return res
Exemplo n.º 2
0
    def __call__(self, **kw):
        """Normalize a table of integer counts and write the result to a new file.

        The first line of the input table is treated as a header and copied
        verbatim to the output. Every following non-blank line is split on
        whitespace: the first field is the row identifier, the remaining fields
        are parsed as integers. The transposed count matrix is passed to
        ``common.normalize`` and the result is written back, one line per input
        row, tab-separated, each value formatted with ``"%.2g"``.

        :param kw: keyword arguments; ``table`` (path of the input file) and
            ``method`` (forwarded to ``common.normalize``) are read.
        :returns: the value of ``self.display_time()``.
        :raises AssertionError: if the ``table`` path does not exist.
        :raises ValueError: if a count field cannot be parsed as an integer.
        """
        filename = kw.get('table')
        assert os.path.exists(str(filename)), "File not found: '%s'" % filename
        row_ids = []
        matrix = []
        with open(filename, 'r') as table:
            header = table.readline()
            for line in table:
                fields = line.split()
                if not fields:
                    # Blank line (e.g. trailing newline at end of file).
                    continue
                row_ids.append(fields[0])
                # Build a real list so the matrix is numeric on Python 3 too,
                # where map() returns an iterator.
                matrix.append([int(value) for value in fields[1:]])
        norm = common.normalize(asarray(matrix).transpose(), kw.get('method'))
        output = self.temporary_path(fname='output.tab')
        # Close (and therefore flush) the output before it is registered below.
        with open(output, "w") as out:
            out.write(header)
            # Transposing back gives one row of normalized values per input row.
            for row_id, row in zip(row_ids, norm.transpose()):
                values = "\t".join("%.2g" % x for x in row)
                out.write(str(row_id) + "\t" + values + "\n")
        self.new_file(output, 'normalized')
        return self.display_time()
Exemplo n.º 3
0
def normalize(trackList, method="total", field="score"):
    """Normalizes the scores in every stream from *trackList* using the given *method*.
    It assumes that each of the streams represents the same features, i.e. the n-th element
    of one stream corresponds to the n-th element of another.

    [!] This function will temporarily store everything in memory.

    :param trackList: FeatureStream, or list of FeatureStream objects. The items of each
        stream are expected to be tuples (the new score is spliced in by tuple concatenation).
    :param method: normalization method, passed on to ``common.normalize``:
        * ``'total'`` divides every score vector by its sum (total number of reads) x 10^7 .
        * ``'deseq'`` applies DESeq's normalization ("size factors") - considering every track
            as belonging to a different group.
        * ``'quantile'`` applies quantile normalization.
    :param field: (str) name of the field containing the scores (must be the same for all streams).
    :returns: a single FeatureStream if exactly one stream was given (alone or inside a
        one-element list), a list of FeatureStream objects otherwise, or an empty list
        if *trackList* is empty.
    :raises AssertionError: if the streams do not all have the same number of elements.
    :raises ValueError: if *field* is not among the fields of one of the streams.
    """
    if not isinstance(trackList, (list, tuple)):
        trackList = [trackList]
    if not trackList:
        # No streams at all: return early instead of failing on allcontents[0].
        return []
    allcontents = [list(t) for t in trackList]
    ncols = len(trackList)
    nlines = len(allcontents[0])
    assert all(len(t) == nlines for t in allcontents), "All streams must have the same number of elements."
    # Index of the score field per stream; computed once, used for both the
    # extraction and the reinsertion passes.
    field_positions = [t.fields.index(field) for t in trackList]
    # Build the matrix (one row per stream)
    allscores = zeros((ncols, nlines))
    for n, content in enumerate(allcontents):
        idx = field_positions[n]
        allscores[n] = asarray([x[idx] for x in content])
    # Normalize
    allscores = common.normalize(allscores, method)
    # Reinsert the new scores in the respective tracks
    for n, content in enumerate(allcontents):
        idx = field_positions[n]
        for k, x in enumerate(content):
            content[k] = x[:idx] + (allscores[n][k],) + x[idx + 1 :]
    res = [FeatureStream(t, fields=trackList[n].fields) for n, t in enumerate(allcontents)]
    if ncols == 1:
        return res[0]
    return res
Exemplo n.º 4
0
    def __call__(self, **kw):
        """Read a count table, normalize it and save the normalized table.

        The input file's first line is a header that is written unchanged to
        the output. Each later non-blank line is whitespace-split into a row
        identifier (first field) and integer counts (remaining fields). The
        counts are transposed, normalized by ``common.normalize`` and written
        out tab-separated, one line per input row, with every value formatted
        as ``"%.2g"``.

        :param kw: keyword arguments; ``table`` (input file path) and
            ``method`` (handed to ``common.normalize``) are read.
        :returns: the value of ``self.display_time()``.
        :raises AssertionError: if the ``table`` path does not exist.
        :raises ValueError: if a count field cannot be parsed as an integer.
        """
        filename = kw.get('table')
        assert os.path.exists(str(filename)), "File not found: '%s'" % filename
        names = []
        counts = []
        with open(filename, 'r') as infile:
            header = infile.readline()
            for line in infile:
                tokens = line.split()
                if not tokens:
                    # Skip empty lines rather than failing on tokens[0].
                    continue
                names.append(tokens[0])
                # A list (not a lazy map object) keeps the matrix numeric on
                # Python 3 as well as Python 2.
                counts.append([int(tok) for tok in tokens[1:]])
        norm = common.normalize(asarray(counts).transpose(), kw.get('method'))
        output = self.temporary_path(fname='output.tab')
        # Transpose once, outside the loop: one row of values per input row.
        normalized_rows = norm.transpose()
        # The with-block guarantees the data is flushed to disk before the
        # file is registered with new_file().
        with open(output, "w") as out:
            out.write(header)
            for name, row in zip(names, normalized_rows):
                out.write(
                    str(name) + "\t" +
                    "\t".join("%.2g" % x for x in row) + "\n")
        self.new_file(output, 'normalized')
        return self.display_time()