def chipdata(self, data):
    """Handle the structured data input.

    Input data: [(dirname0, [et0, et1, ...]), ...] or None.

    Every table is converted with toNumpyMA("a") and stored in
    self._chipdataMA as [(dirname0, [ma0, ma1, ...]), ...].
    self.infoc is set to the number of files, the number of profiles of
    the first table and the number of distinct attribute names;
    self.infod summarizes the missing values.  The number of rows that
    miss some (but not all) values in the non-empty columns is stored in
    self.numRowsMissingChipData.  Afterwards setGuiCommonExpChip() is
    called and, if self.commitOnChange is set, senddata(2).
    """
    self.numRowsMissingChipData = 0
    self._chipdataMA = []
    if data is not None:
        self._chipdata = data
        numValsAll = 0
        numValsNonMasked = 0
        numFiles = 0
        numExamplesList = []
        attribDict = {}
        numColMissing = 0
        for (name, etList) in data:
            numFiles += len(etList)
            self._chipdataMA.append((name, []))
            for et in etList:
                # collect the attributes of all tables, keyed by name
                for attr in et.domain.attributes:
                    attribDict[attr.name] = attr
                numExamplesList.append(len(et))
                etm = et.toNumpyMA("a")[0]
                # indices of columns that are not completely missing
                colNonMissingInd = Numeric.compress(
                    Numeric.not_equal(MA.count(etm, 0), 0),
                    Numeric.arange(etm.shape[1]))
                numColNonMissing = colNonMissingInd.shape[0]
                numColMissing += etm.shape[1] - numColNonMissing
                # A row is partially missing when it lacks a value in one of
                # the non-empty columns; compare with the number of those
                # columns (not with the full width), otherwise a single empty
                # column would mark every row as partially missing.
                numKnownByRow = MA.count(etm.take(colNonMissingInd, 1), 1)
                self.numRowsMissingChipData += int(Numeric.add.reduce(
                    Numeric.where(
                        Numeric.less(numKnownByRow, numColNonMissing), 1, 0)))
                numValsAll += int(Numeric.multiply.reduce(etm.shape))
                numValsNonMasked += int(MA.count(etm))
                self._chipdataMA[-1][1].append(etm)
        # info text; an empty input list has no first table to take the
        # number of profiles from
        if numExamplesList:
            numProfiles = numExamplesList[0]
        else:
            numProfiles = 0
        self.infoc.setText(
            "Structured Data: %i data files with %i profiles on %i points"
            % (numFiles, numProfiles, len(attribDict)))
        numTotalMissing = numValsAll - numValsNonMasked
        if numTotalMissing > 0:
            colPlural = ["", "s"][numColMissing != 1]
            rowPlural = ["", "s"][self.numRowsMissingChipData != 1]
            self.infod.setText(
                "missing %i values, %i column%s completely, %i row%s partially"
                % (numTotalMissing, numColMissing, colPlural,
                   self.numRowsMissingChipData, rowPlural))
        else:
            self.infod.setText("")
    else:
        self._chipdata = None
        self.infoc.setText("No structured data on input")
        self.infod.setText("")
    self.setGuiCommonExpChip()
    if self.commitOnChange:
        self.senddata(2)
def kNNimputeMA(arr2d, K=20, callback=None):
    """Returns a new 2D MA.array with missing values imputed from K nearest neighbours.

    Find K rows (axis 0) with the most similar values where similarity measure
    corresponds to weighted Euclidean distance.
    Imputed value = weighted average of the corresponding values of K nearest
    neighbours, where weights equal to tricubic distribution of distances to
    all rows; if all neighbours are at distance 0, they are weighted equally.
    Impute missing rows by average over all rows.
    Columns with no known value are skipped when imputing partially known rows.

    arr2d:    2D array (anything accepted by MA.asarray); it is not modified
    K:        maximal number of neighbours used for a single imputed value
    callback: optional; called without arguments after each imputed row
    """
    arr2d = MA.asarray(arr2d)
    assert len(arr2d.shape) == 2, "2D array expected"
    # make an explicit copy for imputation so that the input stays untouched
    # even where MA.array does not copy by default
    aImp2 = MA.array(arr2d, copy=True)
    # leave out columns with 0 known values (columnInd: non-zero columns)
    columnCond = Numeric.greater(MA.count(arr2d, axis=0), 0)
    columnIndAll = Numeric.arange(arr2d.shape[1])
    columnInd = Numeric.compress(columnCond, columnIndAll)
    rowIndAll = Numeric.arange(arr2d.shape[0])
    # impute the rows where 0 < #known_values < #non_zero_columns,
    # i.e. exclude the rows with 0 and all (non-zero-column) values
    countByRows = MA.count(arr2d, axis=1)
    partialRowInd = Numeric.compress(
        Numeric.logical_and(Numeric.greater(countByRows, 0),
                            Numeric.less(countByRows, columnInd.shape[0])),
        rowIndAll)
    for rowIdx in partialRowInd:
        rowResized = MA.resize(arr2d[rowIdx], arr2d.shape)
        diff = arr2d - rowResized
        distances = MA.sqrt(MA.add.reduce((diff)**2, 1) / MA.count(diff, axis=1))
        # nearest neighbours row indices; the current row is removed by its
        # index, because with other rows at distance 0 it is not guaranteed
        # to come first in the sorted order
        order = MA.argsort(distances)
        indSorted = Numeric.compress(Numeric.not_equal(order, rowIdx), order)
        distSorted = distances.take(indSorted)
        # number of distances different from MA.masked
        numNonMasked = MA.count(distSorted)
        # tricubic distribution of all weights, scaled by the largest known
        # distance; a largest distance of 0 would give 0/0, i.e. masked weights
        if numNonMasked > 1 and distSorted[numNonMasked-1] > 0:
            weightsSorted = MA.power(
                1 - MA.power(distSorted / distSorted[numNonMasked-1], 3), 3)
        else:
            weightsSorted = Numeric.ones(distSorted.shape[0])
        # compute average for each column separately in order to account for
        # K non-masked values
        colInd4CurrRow = Numeric.compress(
            Numeric.logical_and(MA.getmaskarray(arr2d[rowIdx]), columnCond),
            columnIndAll)
        for colIdx in colInd4CurrRow:
            # column values sorted by distances
            columnVals = arr2d[:, colIdx].take(indSorted)
            # take only those weights where columnVals does not equal MA.masked
            weightsSortedCompressed = MA.compress(
                1 - MA.getmaskarray(columnVals), weightsSorted)
            # impute from K (or possibly less) values
            aImp2[rowIdx, colIdx] = MA.average(
                columnVals.compressed()[:K],
                weights=weightsSortedCompressed[:K])
        if callback:
            callback()
    # impute the unknown rows with average profile
    avrgRow = MA.average(arr2d, 0)
    for rowIdx in Numeric.compress(Numeric.equal(countByRows, 0), rowIndAll):
        aImp2[rowIdx] = avrgRow
        if callback:
            callback()
    return aImp2
def anova2(self, ma3d, groupLens, addInteraction, repMeasuresOnA, callback):
    """Conducts two-way ANOVA on individual examples;
    returns a Numeric array of p-values in shape (2, numExamples) or
    (3, numExamples), depending whether we test for interaction;
    Note: levels of factors A and B that cause empty cells are removed prior
    to conducting ANOVA.

    ma3d:           3D masked array indexed as [example, level of A, replica]
    groupLens:      number of consecutive axis-2 entries in each level of B
    addInteraction: test for interaction as well (3 rows of p-values)
    repMeasuresOnA: use Anova.AnovaRM12LR instead of Anova.Anova2wayLR
    callback:       called without arguments after each example; a false
                    value (e.g. None) disables it
    """
    groupLens = Numeric.asarray(groupLens)
    # arrays to store p-vals
    if addInteraction:
        ps = Numeric.ones((3, ma3d.shape[0]), Numeric.Float)
    else:
        ps = Numeric.ones((2, ma3d.shape[0]), Numeric.Float)
    # decide between non-repeated / repeated measures ANOVA for factor time
    if repMeasuresOnA:
        fAnova = Anova.AnovaRM12LR
    else:
        fAnova = Anova.Anova2wayLR
    # check for empty cells for all genes at once and remove them
    tInd2rem = []
    ax2Ind = Numeric.concatenate(([0], Numeric.add.accumulate(groupLens)))
    for aIdx in range(ma3d.shape[1]):
        for rIdx in range(groupLens.shape[0]):
            cell = ma3d[:, aIdx, ax2Ind[rIdx]:ax2Ind[rIdx+1]]
            if Numeric.add.reduce(MA.count(cell, 1)) == 0:
                tInd2rem.append(aIdx)
                break
    # original time index of every position that remains on axis 1
    tInd2keep = [aIdx for aIdx in range(ma3d.shape[1]) if aIdx not in tInd2rem]
    if len(tInd2rem) > 0:
        print("Warning: removing time indices %s for all genes" % (str(tInd2rem)))
        ma3d = ma3d.take(tInd2keep, 1)
    # for each gene...
    for eIdx in range(ma3d.shape[0]):
        # faster check for empty cells for that gene -> remove time indices
        # with empty cells
        ma2d = ma3d[eIdx]
        cellCount = MA.zeros((ma2d.shape[0], groupLens.shape[0]), Numeric.Int)
        for g, (i0, i1) in enumerate(zip(ax2Ind[:-1], ax2Ind[1:])):
            cellCount[:, g] = MA.count(ma2d[:, i0:i1], 1)
        # 1 where to take, 0 where not to take
        ma2dTakeInd = Numeric.logical_not(
            Numeric.add.reduce(Numeric.equal(cellCount, 0), 1))
        if Numeric.add.reduce(ma2dTakeInd) != ma2dTakeInd.shape[0]:
            removedPos = Numeric.compress(
                ma2dTakeInd == 0, Numeric.arange(ma2dTakeInd.shape[0]))
            # report the indices on the original time axis, not the positions
            # left after the removal done for all genes above
            removedInd = [tInd2keep[int(pos)] for pos in removedPos]
            print("Warning: removing time indices %s for gene %i"
                  % (str(removedInd), eIdx))
            ma2d = MA.compress(ma2dTakeInd, ma2d, 0)
        an = fAnova(ma2d, groupLens, addInteraction,
                    allowReductA=True, allowReductB=True)
        ps[:, eIdx] = an.ps
        if callback:
            callback()
    return ps
def distSpearmanW(x, y, w):
    """weighted distance corresponding to 1 - spearman's correlation
    coefficient for arrays x,y and weights w
    returns distance: 1 - spearman_r

    Dispatches to _distSpearmanW_MA as soon as one of x, y, w is a masked
    array that actually contains masked values, otherwise to
    _distSpearmanW_NU.
    """
    for var in (x, y, w):
        # instance test instead of exact type comparison, so that subclasses
        # of the masked array are recognized as well
        if MA.isMaskedArray(var) and \
           MA.count(var) != Numeric.multiply.reduce(var.shape):
            return _distSpearmanW_MA(x, y, w)
    return _distSpearmanW_NU(x, y, w)
def data(self, data):
    """Handle the examples input.

    data is an example table providing toNumpyMA("a"), or None.

    The table is kept in self._data and its masked array in self._dataMA.
    self.infoa is set to the number of profiles and points; self.infob
    summarizes the missing values.  The number of rows that miss some (but
    not all) values in the non-empty columns is stored in
    self.numRowsMissing.  Afterwards setGuiCommonExpChip() is called and,
    if self.commitOnChange is set, senddata(1).
    """
    # reset first, so that a count from a previous input cannot survive when
    # the new data has no missing values
    self.numRowsMissing = 0
    if data is not None:
        self._data = data
        self._dataMA = data.toNumpyMA("a")[0]
        # info text
        self.infoa.setText("Examples: %i profiles on %i points"
                           % (self._dataMA.shape[0], self._dataMA.shape[1]))
        numTotalMissing = int(
            Numeric.multiply.reduce(self._dataMA.shape)
            - MA.count(self._dataMA))
        if numTotalMissing > 0:
            numValsByCol = MA.count(self._dataMA, 0)
            numEmptyCol = int(Numeric.add.reduce(
                Numeric.where(numValsByCol == 0, 1, 0)))
            # rows are checked only on the columns that hold some value
            colNonEmpty = Numeric.compress(
                numValsByCol != 0, Numeric.arange(self._dataMA.shape[1]))
            dataRemEmptyCol = self._dataMA.take(colNonEmpty, 1)
            self.numRowsMissing = int(Numeric.add.reduce(
                Numeric.where(
                    MA.count(dataRemEmptyCol, 1) < dataRemEmptyCol.shape[1],
                    1, 0)))
            # plural "s" for every count except exactly 1
            s1 = ["", "s"][numEmptyCol != 1]
            s2 = ["", "s"][self.numRowsMissing != 1]
            self.infob.setText(
                "missing %i values, %i column%s completely, %i row%s partially"
                % (numTotalMissing, numEmptyCol, s1, self.numRowsMissing, s2))
        else:
            self.infob.setText("")
    else:
        self._data = None
        self._dataMA = None
        self.infoa.setText("No examples on input")
        self.infob.setText("")
    self.setGuiCommonExpChip()
    if self.commitOnChange:
        self.senddata(1)
def rankDataMA(m, inverse=False):
    """Returns ranks of 1D masked array; masked values ignored, range 1...#non-masked_values.

    m:       1D array (anything accepted by MA.asarray); it is not modified
    inverse: if true, the largest value gets rank 1 and the smallest
             rank #non-masked_values

    The result is a masked array with the mask of m.  Equal values get
    distinct consecutive ranks in the order returned by MA.argsort.
    """
    m = MA.asarray(m)
    assert len(m.shape) == 1
    numVals = m.shape[0]
    if MA.count(m) == 0:
        # empty or completely masked: there is nothing to rank (and no
        # maximum to derive a fill value from)
        return MA.array(MA.zeros(numVals, 'd'), mask=MA.getmaskarray(m))
    # sort masked values behind all known ones; the fill value is handed to
    # argsort directly instead of being set on (and restored from) m, so the
    # caller's array is never touched
    order = MA.argsort(m, fill_value=MA.maximum.reduce(m) + 1)
    # 'd': double precision floats
    r = MA.zeros(numVals, 'd')
    MA.put(r, order, Numeric.arange(numVals))
    r = MA.array(r, mask=MA.getmaskarray(m))
    if inverse:
        return -1*r + MA.count(m)
    else:
        return r + 1
def kNNimputeMA(arr2d, K=20, callback=None):
    """Returns a new 2D MA.array with missing values imputed from K nearest neighbours.

    Find K rows (axis 0) with the most similar values where similarity measure
    corresponds to weighted Euclidean distance.
    Imputed value = weighted average of the corresponding values of K nearest
    neighbours, where weights equal to tricubic distribution of distances to
    all rows; if all neighbours are at distance 0, they are weighted equally.
    Impute missing rows by average over all rows.
    Columns with no known value are skipped when imputing partially known rows.

    arr2d:    2D array (anything accepted by MA.asarray); it is not modified
    K:        maximal number of neighbours used for a single imputed value
    callback: optional; called without arguments after each imputed row
    """
    arr2d = MA.asarray(arr2d)
    assert len(arr2d.shape) == 2, "2D array expected"
    # make an explicit copy for imputation so that the input stays untouched
    # even where MA.array does not copy by default
    aImp2 = MA.array(arr2d, copy=True)
    # leave out columns with 0 known values (columnInd: non-zero columns)
    columnCond = Numeric.greater(MA.count(arr2d, axis=0), 0)
    columnIndAll = Numeric.arange(arr2d.shape[1])
    columnInd = Numeric.compress(columnCond, columnIndAll)
    rowIndAll = Numeric.arange(arr2d.shape[0])
    # impute the rows where 0 < #known_values < #non_zero_columns,
    # i.e. exclude the rows with 0 and all (non-zero-column) values
    countByRows = MA.count(arr2d, axis=1)
    partialRowInd = Numeric.compress(
        Numeric.logical_and(Numeric.greater(countByRows, 0),
                            Numeric.less(countByRows, columnInd.shape[0])),
        rowIndAll)
    for rowIdx in partialRowInd:
        rowResized = MA.resize(arr2d[rowIdx], arr2d.shape)
        diff = arr2d - rowResized
        distances = MA.sqrt(MA.add.reduce((diff)**2, 1) / MA.count(diff, axis=1))
        # nearest neighbours row indices; the current row is removed by its
        # index, because with other rows at distance 0 it is not guaranteed
        # to come first in the sorted order
        order = MA.argsort(distances)
        indSorted = Numeric.compress(Numeric.not_equal(order, rowIdx), order)
        distSorted = distances.take(indSorted)
        # number of distances different from MA.masked
        numNonMasked = MA.count(distSorted)
        # tricubic distribution of all weights, scaled by the largest known
        # distance; a largest distance of 0 would give 0/0, i.e. masked weights
        if numNonMasked > 1 and distSorted[numNonMasked-1] > 0:
            weightsSorted = MA.power(
                1 - MA.power(distSorted / distSorted[numNonMasked-1], 3), 3)
        else:
            weightsSorted = Numeric.ones(distSorted.shape[0])
        # compute average for each column separately in order to account for
        # K non-masked values
        colInd4CurrRow = Numeric.compress(
            Numeric.logical_and(MA.getmaskarray(arr2d[rowIdx]), columnCond),
            columnIndAll)
        for colIdx in colInd4CurrRow:
            # column values sorted by distances
            columnVals = arr2d[:, colIdx].take(indSorted)
            # take only those weights where columnVals does not equal MA.masked
            weightsSortedCompressed = MA.compress(
                1 - MA.getmaskarray(columnVals), weightsSorted)
            # impute from K (or possibly less) values
            aImp2[rowIdx, colIdx] = MA.average(
                columnVals.compressed()[:K],
                weights=weightsSortedCompressed[:K])
        if callback:
            callback()
    # impute the unknown rows with average profile
    avrgRow = MA.average(arr2d, 0)
    for rowIdx in Numeric.compress(Numeric.equal(countByRows, 0), rowIndAll):
        aImp2[rowIdx] = avrgRow
        if callback:
            callback()
    return aImp2