示例#1
0
 def chipdata(self, data):
     """Store the structured (chip) data input and refresh the info labels.

     Input data: [(dirname0, [et0, et1, ...]), ...]

     Every table is converted with toNumpyMA("a") and kept in
     self._chipdataMA as [(dirname0, [ma0, ma1, ...]), ...]; passing None
     clears the input.  self.numRowsMissingChipData is set to the number of
     rows, summed over all tables, that miss a value in at least one column
     which is not completely missing.  Afterwards the common GUI state is
     refreshed and, if self.commitOnChange is set, senddata(2) is called.
     """
     self.numRowsMissingChipData = 0
     self._chipdataMA = []
     if data is not None:
         self._chipdata = data
         numValsAll = 0
         numValsNonMasked = 0
         numFiles = 0
         numExamplesList = []
         attribDict = {}
         numColMissing = 0
         for (name, etList) in data:
             numFiles += len(etList)
             etmList = []
             self._chipdataMA.append((name, etmList))
             for et in etList:
                 # collect distinct attributes (by name) over all tables
                 for attr in et.domain.attributes:
                     attribDict[attr.name] = attr
                 numExamplesList.append(len(et))
                 etm = et.toNumpyMA("a")[0]
                 # indices of columns that are not completely missing
                 colNonMissingInd = Numeric.compress(
                     Numeric.not_equal(MA.count(etm, 0), 0),
                     Numeric.arange(etm.shape[1]))
                 numColNonMissing = colNonMissingInd.shape[0]
                 numColMissing += etm.shape[1] - numColNonMissing
                 # a row is partially missing when it lacks a value in one of
                 # the non-empty columns; comparing against the total number
                 # of columns would flag every row as soon as a single column
                 # is completely missing
                 self.numRowsMissingChipData += int(
                     Numeric.add.reduce(
                         Numeric.less(
                             MA.count(etm.take(colNonMissingInd, 1), 1),
                             numColNonMissing)))
                 numValsAll += int(Numeric.multiply.reduce(etm.shape))
                 numValsNonMasked += int(MA.count(etm))
                 etmList.append(etm)
         # info text; the profile count is taken from the first table and
         # falls back to 0 when no table was supplied at all
         numProfiles = 0
         if numExamplesList:
             numProfiles = numExamplesList[0]
         self.infoc.setText(
             "Structured Data: %i data files with %i profiles on %i points"
             % (numFiles, numProfiles, len(attribDict)))
         numTotalMissing = numValsAll - numValsNonMasked
         if numTotalMissing > 0:
             self.infod.setText(
                 "missing %i values, %i column%s completely, %i row%s partially"
                 % (numTotalMissing,
                    numColMissing,
                    ["", "s"][numColMissing != 1],
                    self.numRowsMissingChipData,
                    ["", "s"][self.numRowsMissingChipData != 1]))
         else:
             self.infod.setText("")
     else:
         self._chipdata = None
         self.infoc.setText("No structured data on input")
         self.infod.setText("")
     self.setGuiCommonExpChip()
     if self.commitOnChange:
         self.senddata(2)
示例#2
0
def kNNimputeMA(arr2d, K=20, callback=None):
    """Returns a new 2D MA.array with missing values imputed from K nearest neighbours.

    For every row that is only partially known, all other rows are ranked by
    the root-mean-square difference over the columns known in both rows.
    Imputed value = weighted average of the corresponding values of the (at
    most) K nearest rows in which that value is known, where weights equal to
    tricubic distribution of distances to all rows.
    Rows without any known value are replaced by the average over all rows.
    Columns without any known value are skipped when imputing partial rows.

    arr2d    -- 2D array-like, converted with MA.asarray; it is not written to
    K        -- maximum number of neighbours used for one imputed value
    callback -- optional callable without arguments, called once per imputed row

    Version: 30.8.2005
    """
    arr2d = MA.asarray(arr2d)
    assert len(arr2d.shape) == 2, "2D array expected"
    # explicit copy: the result must not share data or mask with the input,
    # otherwise imputed values would leak into the caller's array and into the
    # distance computations of the rows processed later
    aImp2 = MA.array(arr2d, copy=True)
    # leave out columns with 0 known values (columnInd: non-zero columns)
    columnCond = Numeric.greater(MA.count(arr2d, axis=0), 0)
    columnIndAll = Numeric.arange(arr2d.shape[1])
    columnInd = Numeric.compress(columnCond, columnIndAll)
    numKnownColumns = columnInd.shape[0]
    rowIndAll = Numeric.arange(arr2d.shape[0])
    # impute the rows where 0 < #known_values < #non_zero_columns,
    # i.e. exclude the rows with 0 and all (non-zero-column) values
    countByRows = MA.count(arr2d, axis=1)
    partialRowCond = Numeric.logical_and(
        Numeric.greater(countByRows, 0),
        Numeric.less(countByRows, numKnownColumns))
    for rowIdx in Numeric.compress(partialRowCond, rowIndAll):
        rowResized = MA.resize(arr2d[rowIdx], arr2d.shape)
        diff = arr2d - rowResized
        distances = MA.sqrt(
            MA.add.reduce(diff ** 2, 1) / MA.count(diff, axis=1))
        # nearest neighbours row indices; the current row is removed by its
        # index because, with other rows at distance 0, it is not guaranteed
        # to be the first element of the sorted order
        order = MA.argsort(distances)
        indSorted = Numeric.compress(Numeric.not_equal(order, rowIdx), order)
        distSorted = distances.take(indSorted)
        # number of distances different from MA.masked
        numNonMasked = int(MA.count(distSorted))
        # uniform weights unless a positive largest distance is available to
        # scale the tricubic function (a zero divisor would mask all weights)
        weightsSorted = Numeric.ones(distSorted.shape[0])
        if numNonMasked > 1:
            maxDist = distSorted[numNonMasked - 1]
            if maxDist > 0:
                # tricubic distribution of all weights
                weightsSorted = MA.power(
                    1 - MA.power(distSorted / maxDist, 3), 3)
        # compute average for each column separately in order to account for
        # K non-masked values
        colInd4CurrRow = Numeric.compress(
            Numeric.logical_and(MA.getmaskarray(arr2d[rowIdx]), columnCond),
            columnIndAll)
        for colIdx in colInd4CurrRow:
            # column values sorted by distances
            columnVals = arr2d[:, colIdx].take(indSorted)
            # take only those weights where columnVals does not equal MA.masked
            knownCond = Numeric.logical_not(MA.getmaskarray(columnVals))
            weightsSortedCompressed = MA.compress(knownCond, weightsSorted)
            # impute from K (or possibly less) values
            aImp2[rowIdx, colIdx] = MA.average(
                columnVals.compressed()[:K],
                weights=weightsSortedCompressed[:K])
        if callback:
            callback()
    # impute the unknown rows with average profile
    avrgRow = MA.average(arr2d, 0)
    for rowIdx in Numeric.compress(Numeric.equal(countByRows, 0), rowIndAll):
        aImp2[rowIdx] = avrgRow
        if callback:
            callback()
    return aImp2
示例#3
0
 def anova2(self, ma3d, groupLens, addInteraction, repMeasuresOnA, callback):
     """Run a two-way ANOVA separately for every example (axis 0 of ma3d).

     ma3d           -- 3D masked array; axis 1 holds the "time indices"
                       mentioned in the warnings, axis 2 is split into
                       consecutive groups
     groupLens      -- lengths of the consecutive groups along axis 2
     addInteraction -- passed on to the ANOVA; also selects 3 instead of 2
                       rows of p-values
     repMeasuresOnA -- if true use Anova.AnovaRM12LR, else Anova.Anova2wayLR
     callback       -- callable without arguments, called once per example

     Returns a Numeric array of p-values in shape (2, numExamples) or
     (3, numExamples); entries start at 1 and are overwritten by the .ps of
     each ANOVA result.
     Note: axis-1 indices with an empty cell (a group without any known
     value) are removed before the ANOVA, first those empty for all examples
     at once, then those empty for the individual example.
     """
     groupLens = Numeric.asarray(groupLens)
     numGroups = groupLens.shape[0]
     # arrays to store p-vals: two rows, plus a third one for the interaction
     numPRows = 2
     if addInteraction:
         numPRows = 3
     ps = Numeric.ones((numPRows, ma3d.shape[0]), Numeric.Float)
     # decide between non-repeated / repeated measures ANOVA
     if repMeasuresOnA:
         anovaClass = Anova.AnovaRM12LR
     else:
         anovaClass = Anova.Anova2wayLR
     # group g occupies the slice bounds[g]:bounds[g+1] of the last axis
     bounds = Numeric.concatenate(([0], Numeric.add.accumulate(groupLens)))
     groupSlices = list(zip(bounds[:-1], bounds[1:]))
     # axis-1 indices having at least one group that is empty in all examples
     emptyForAll = []
     for aIdx in range(ma3d.shape[1]):
         for (lo, hi) in groupSlices:
             knownPerExample = MA.count(ma3d[:, aIdx, lo:hi], 1)
             if Numeric.add.reduce(knownPerExample) == 0:
                 emptyForAll.append(aIdx)
                 break
     if len(emptyForAll) > 0:
         print("Warning: removing time indices %s for all genes" % (str(emptyForAll)))
         keepInd = [i for i in range(ma3d.shape[1]) if i not in emptyForAll]
         ma3d = ma3d.take(keepInd, 1)
     # for each example...
     for eIdx in range(ma3d.shape[0]):
         exampleData = ma3d[eIdx]
         # number of known values in every (axis-1 index, group) cell
         cellCount = MA.zeros((exampleData.shape[0], numGroups), Numeric.Int)
         for g, (lo, hi) in enumerate(groupSlices):
             cellCount[:, g] = MA.count(exampleData[:, lo:hi], 1)
         # 1 where no cell is empty (keep), 0 where some cell is empty (drop)
         numEmptyCells = Numeric.add.reduce(Numeric.equal(cellCount, 0), 1)
         keepCond = Numeric.logical_not(numEmptyCells)
         if Numeric.add.reduce(keepCond) != keepCond.shape[0]:
             droppedInd = Numeric.compress(
                 keepCond == 0, Numeric.arange(keepCond.shape[0]))
             print("Warning: removing time indices %s for gene %i" % (str(droppedInd), eIdx))
             exampleData = MA.compress(keepCond, exampleData, 0)
         an = anovaClass(exampleData, groupLens, addInteraction,
                         allowReductA=True, allowReductB=True)
         ps[:, eIdx] = an.ps
         callback()
     return ps
示例#4
0
def distSpearmanW(x, y, w):
    """weighted distance corresponding to 1 - spearman's correlation coefficient for arrays x,y and weights w
    returns distance: 1 - spearman_r

    Dispatches to _distSpearmanW_MA as soon as one of x, y, w is a masked
    array that really contains masked values (fewer counted values than
    elements); otherwise _distSpearmanW_NU is used.
    """
    for var in (x, y, w):
        # MA.isMaskedArray instead of an exact type comparison with MA.array:
        # it also accepts subclasses and does not depend on MA.array being a
        # class rather than a factory function
        if MA.isMaskedArray(var) and \
                MA.count(var) != Numeric.multiply.reduce(var.shape):
            return _distSpearmanW_MA(x, y, w)
    return _distSpearmanW_NU(x, y, w)
示例#5
0
 def data(self, data):
     """Store the examples input and refresh the info labels.

     data -- object providing toNumpyMA("a"), or None to clear the input

     Sets self._data, self._dataMA (first element of data.toNumpyMA("a")) and
     self.numRowsMissing (number of rows missing a value in at least one
     column that is not completely missing; 0 when nothing is missing or
     there is no input).  Afterwards the common GUI state is refreshed and,
     if self.commitOnChange is set, senddata(1) is called.
     """
     # reset first, so that a dataset without missing values does not keep
     # the count computed for the previous input
     self.numRowsMissing = 0
     if data is not None:
         self._data = data
         self._dataMA = data.toNumpyMA("a")[0]
         # info text
         self.infoa.setText("Examples: %i profiles on %i points" %
                            (self._dataMA.shape[0], self._dataMA.shape[1]))
         numTotalMissing = int(
             Numeric.multiply.reduce(self._dataMA.shape) -
             MA.count(self._dataMA))
         if numTotalMissing > 0:
             numValsByCol = MA.count(self._dataMA, 0)
             numEmptyCol = int(Numeric.add.reduce(
                 Numeric.where(numValsByCol == 0, 1, 0)))
             # completely missing columns are left out before counting the
             # rows that miss some value
             colNonEmpty = Numeric.compress(
                 numValsByCol != 0, Numeric.arange(self._dataMA.shape[1]))
             dataRemEmptyCol = self._dataMA.take(colNonEmpty, 1)
             self.numRowsMissing = int(Numeric.add.reduce(
                 Numeric.where(
                     MA.count(dataRemEmptyCol, 1) <
                     dataRemEmptyCol.shape[1], 1, 0)))
             # plural suffix for every count except exactly 1
             self.infob.setText(
                 "missing %i values, %i column%s completely, %i row%s partially"
                 % (numTotalMissing,
                    numEmptyCol,
                    ["", "s"][numEmptyCol != 1],
                    self.numRowsMissing,
                    ["", "s"][self.numRowsMissing != 1]))
         else:
             self.infob.setText("")
     else:
         self._data = None
         self._dataMA = None
         self.infoa.setText("No examples on input")
         self.infob.setText("")
     self.setGuiCommonExpChip()
     if self.commitOnChange:
         self.senddata(1)
示例#6
0
def rankDataMA(m, inverse=False):
    """Returns ranks of 1D masked array; masked values ignored, range 1...#non-masked_values.

    m       -- 1D array-like, converted with MA.asarray
    inverse -- if true the ranks are reversed (rank #non-masked_values goes
               to the element sorted first)

    Ranks follow the order returned by MA.argsort, so every non-masked
    element gets a distinct rank.  The result is masked wherever m is masked.
    """
    m = MA.asarray(m)
    assert MA.rank(m) == 1
    # the fill value is temporarily raised above the maximum, presumably so
    # that masked elements are sorted behind all known values; try/finally
    # guarantees that the caller's array gets its own fill value back even
    # if sorting fails
    fill_val = m.fill_value()
    m.set_fill_value(MA.maximum(m) + 1)
    try:
        order = MA.argsort(m)
    finally:
        m.set_fill_value(fill_val)
    # r[order[i]] = i: zero-based position of every element in sorted order
    r = MA.zeros(m.shape[0], Numeric.Float)
    MA.put(r, order, Numeric.arange(m.shape[0]))
    r = MA.array(r, mask=MA.getmaskarray(m))
    if inverse:
        return -1 * r + MA.count(m)
    return r + 1
示例#7
0
def kNNimputeMA(arr2d, K=20, callback=None):
    """Returns a new 2D MA.array with missing values imputed from K nearest neighbours.

    For every row that is only partially known, all other rows are ranked by
    the root-mean-square difference over the columns known in both rows.
    Imputed value = weighted average of the corresponding values of the (at
    most) K nearest rows in which that value is known, where weights equal to
    tricubic distribution of distances to all rows.
    Rows without any known value are replaced by the average over all rows.
    Columns without any known value are skipped when imputing partial rows.

    arr2d    -- 2D array-like, converted with MA.asarray; it is not written to
    K        -- maximum number of neighbours used for one imputed value
    callback -- optional callable without arguments, called once per imputed row

    Version: 30.8.2005
    """
    arr2d = MA.asarray(arr2d)
    assert len(arr2d.shape) == 2, "2D array expected"
    # explicit copy: the result must not share data or mask with the input,
    # otherwise imputed values would leak into the caller's array and into the
    # distance computations of the rows processed later
    aImp2 = MA.array(arr2d, copy=True)
    # leave out columns with 0 known values (columnInd: non-zero columns)
    columnCond = Numeric.greater(MA.count(arr2d, axis=0), 0)
    columnIndAll = Numeric.arange(arr2d.shape[1])
    columnInd = Numeric.compress(columnCond, columnIndAll)
    numKnownColumns = columnInd.shape[0]
    rowIndAll = Numeric.arange(arr2d.shape[0])
    # impute the rows where 0 < #known_values < #non_zero_columns,
    # i.e. exclude the rows with 0 and all (non-zero-column) values
    countByRows = MA.count(arr2d, axis=1)
    partialRowCond = Numeric.logical_and(
        Numeric.greater(countByRows, 0),
        Numeric.less(countByRows, numKnownColumns))
    for rowIdx in Numeric.compress(partialRowCond, rowIndAll):
        rowResized = MA.resize(arr2d[rowIdx], arr2d.shape)
        diff = arr2d - rowResized
        distances = MA.sqrt(
            MA.add.reduce(diff ** 2, 1) / MA.count(diff, axis=1))
        # nearest neighbours row indices; the current row is removed by its
        # index because, with other rows at distance 0, it is not guaranteed
        # to be the first element of the sorted order
        order = MA.argsort(distances)
        indSorted = Numeric.compress(Numeric.not_equal(order, rowIdx), order)
        distSorted = distances.take(indSorted)
        # number of distances different from MA.masked
        numNonMasked = int(MA.count(distSorted))
        # uniform weights unless a positive largest distance is available to
        # scale the tricubic function (a zero divisor would mask all weights)
        weightsSorted = Numeric.ones(distSorted.shape[0])
        if numNonMasked > 1:
            maxDist = distSorted[numNonMasked - 1]
            if maxDist > 0:
                # tricubic distribution of all weights
                weightsSorted = MA.power(
                    1 - MA.power(distSorted / maxDist, 3), 3)
        # compute average for each column separately in order to account for
        # K non-masked values
        colInd4CurrRow = Numeric.compress(
            Numeric.logical_and(MA.getmaskarray(arr2d[rowIdx]), columnCond),
            columnIndAll)
        for colIdx in colInd4CurrRow:
            # column values sorted by distances
            columnVals = arr2d[:, colIdx].take(indSorted)
            # take only those weights where columnVals does not equal MA.masked
            knownCond = Numeric.logical_not(MA.getmaskarray(columnVals))
            weightsSortedCompressed = MA.compress(knownCond, weightsSorted)
            # impute from K (or possibly less) values
            aImp2[rowIdx, colIdx] = MA.average(
                columnVals.compressed()[:K],
                weights=weightsSortedCompressed[:K])
        if callback:
            callback()
    # impute the unknown rows with average profile
    avrgRow = MA.average(arr2d, 0)
    for rowIdx in Numeric.compress(Numeric.equal(countByRows, 0), rowIndAll):
        aImp2[rowIdx] = avrgRow
        if callback:
            callback()
    return aImp2