def pca(data, algorithm='eig'):
    """pca(data) -> positions, variances

    Perform Principal Components Analysis on a set of n data points in k
    dimensions. The data array must be of shape (n, k).

    Returns the transformed position of each data point along with the
    fraction of the total variance captured by each principal component
    (variances are normalized to sum to 1).

    The optional algorithm parameter can be either 'svd' to perform PCA with
    the singular value decomposition, or 'eig' to use a symmetric eigenvalue
    decomposition. Empirically, eig is faster on the datasets I have tested.

    Raises RuntimeError if algorithm is neither 'eig' nor 'svd'.

    NOTE(review): this file defines pca() twice; the later definition shadows
    this one at import time.
    """
    # (A disabled `if False:` per-column standardization block was removed as
    # dead code.)
    data = numpy.asarray(data)
    mean = data.mean(axis=0)
    centered = data - mean  # PCA operates on mean-centred data

    if algorithm == 'eig':
        pcs, variances, stds, positions, norm_positions = _pca_eig(centered)
    elif algorithm == 'svd':
        pcs, variances, stds, positions, norm_positions = _pca_svd(centered)
    else:
        raise RuntimeError('Algorithm %s not known.' % algorithm)

    # Normalize so each entry reports the fraction of total variance
    # explained; guard against a degenerate all-zero dataset.
    sumVariances = sum(variances)
    if sumVariances > 0:
        for i in xrange(0, len(variances)):
            variances[i] /= sumVariances

    return positions, variances
def pca(data, algorithm='eig'):
    """pca(data) -> positions, variances

    Perform Principal Components Analysis on a set of n data points in k
    dimensions. The data array must be of shape (n, k).

    Returns the transformed position of each data point along with the
    fraction of the total variance captured by each principal component
    (variances are normalized to sum to 1).

    The optional algorithm parameter can be either 'svd' to perform PCA with
    the singular value decomposition, or 'eig' to use a symmetric eigenvalue
    decomposition. Empirically, eig is faster on the datasets I have tested.

    Raises RuntimeError if algorithm is neither 'eig' nor 'svd'.

    NOTE(review): pca() is defined twice in this file; this second definition
    is the one that wins at import time.
    """
    # (A disabled `if False:` per-column standardization block was removed as
    # dead code.)
    data = numpy.asarray(data)
    mean = data.mean(axis=0)
    centered = data - mean  # PCA operates on mean-centred data

    if algorithm == 'eig':
        pcs, variances, stds, positions, norm_positions = _pca_eig(centered)
    elif algorithm == 'svd':
        pcs, variances, stds, positions, norm_positions = _pca_svd(centered)
    else:
        raise RuntimeError('Algorithm %s not known.' % algorithm)

    # Normalize so each entry reports the fraction of total variance
    # explained; guard against a degenerate all-zero dataset.
    sumVariances = sum(variances)
    if sumVariances > 0:
        for i in xrange(0, len(variances)):
            variances[i] /= sumVariances

    return positions, variances
def run(self, statTest, effectSizeMeasure, profile, progress=None):
    """Run a multi-group hypothesis test on every feature in a profile.

    Populates self.results.data with one row per feature: the feature name,
    the p-value stored twice (the second copy is the slot for a corrected
    p-value), the effect size, a note, per-group mean and standard deviation
    of relative frequencies (%), and per-sample counts / parent counts /
    relative frequencies. Rows are sorted by p-value.

    statTest -- object with .name and .hypothesisTest(data) -> (pValue, note)
    effectSizeMeasure -- object with .run(data) -> effect size
    profile -- multi-group profile supplying per-feature counts/proportions
    progress -- None, the string 'Verbose' (report features on stdout), or a
                Qt-style progress dialog with wasCanceled()/setValue()
    """
    self.results.data = []
    self.results.test = statTest.name
    self.results.profile = profile

    if progress == 'Verbose':
        print ' Processing feature:'

    index = 0
    for feature in profile.getFeatures():
        if progress == 'Verbose':
            print ' ' + feature
        elif progress != None:
            # abort cleanly if the user cancelled the progress dialog
            if progress.wasCanceled():
                self.results.data = []
                return
            index += 1
            progress.setValue(index)

        seqCount = profile.getActiveFeatureCounts(feature)
        parentCount = profile.getActiveParentCounts(feature)
        data = profile.getActiveFeatureProportions(feature)

        pValue, note = statTest.hypothesisTest(data)
        effectSize = effectSizeMeasure.run(data)

        # pValue stored twice: second copy is later replaced by the
        # multiple-test-corrected value
        row = [feature, float(pValue), float(pValue), effectSize, note]

        # per-group mean and standard deviation of relative frequencies (%)
        for i in xrange(0, len(seqCount)):
            propGroup = []
            for j in xrange(0, len(seqCount[i])):
                sc = seqCount[i][j]
                pc = parentCount[i][j]
                if pc > 0:
                    propGroup.append(sc * 100.0 / pc)
                else:
                    # no parent sequences: define relative frequency as 0
                    propGroup.append(0.0)

            meanGroup = mean(propGroup)
            row.append(meanGroup)
            row.append(stdDev(propGroup, meanGroup))

        # per-sample raw count, parent count, and relative frequency (%)
        for i in xrange(0, len(seqCount)):
            for j in xrange(0, len(seqCount[i])):
                sc = seqCount[i][j]
                pc = parentCount[i][j]
                row.append(sc)
                row.append(pc)
                if pc > 0:
                    row.append(sc * 100.0 / pc)
                else:
                    row.append(0.0)

        self.results.data.append(row)

    # column headings must match the per-sample row layout built above
    headingsSampleStats = []
    for i in xrange(0, len(profile.activeSamplesInGroups)):
        for sampleName in profile.activeSamplesInGroups[i]:
            headingsSampleStats.append(sampleName)
            headingsSampleStats.append(sampleName + ': parent seq. count')
            headingsSampleStats.append(sampleName + ': rel. freq. (%)')

    self.results.createTableHeadings(profile.activeGroupNames,
                                     headingsSampleStats)

    if len(self.results.data) >= 1:
        # sort results according to p-values
        self.results.data = TableHelper.SortTable(
            self.results.data, [self.results.dataHeadings['pValues']])

    if progress != None and progress != 'Verbose':
        # final tick so the progress dialog reaches completion
        index += 1
        progress.setValue(index)
def run(self, test, signLevel, statsResults, trials, bootstrapRep, progress):
    """Estimate the power of a two-sample hypothesis test by simulation.

    For each row [feature, seq1, seq2, parentSeq1, parentSeq2] in
    statsResults, repeatedly draws counts from binomial distributions with
    the observed proportions and measures how often the test's two-sided
    p-value falls below signLevel.

    test -- object with .hypothesisTest(c1, c2, parent1, parent2) ->
            (one-sided p-value, two-sided p-value)
    signLevel -- significance level used to count a rejection
    statsResults -- rows of observed counts (layout above)
    trials -- independent power estimates per feature
    bootstrapRep -- bootstrap replicates per trial
    progress -- '' for no reporting, otherwise a Qt-style progress dialog

    Returns a table with one row per feature: counts, proportions, mean and
    std. dev. of power over all trials, then the same summaries restricted
    to features with min(seq1, seq2) <= 5 and > 5 ('' when that subset is
    empty, detected via NaN from mean/stdDev).
    """
    tableData = []

    index = 0
    for row in statsResults:
        feature = row[0]
        seq1 = row[1]
        seq2 = row[2]
        parentSeq1 = row[3]
        parentSeq2 = row[4]

        p1 = float(seq1) / parentSeq1
        p2 = float(seq2) / parentSeq2

        powerList = []
        powerListLess5 = []
        powerListGreater5 = []
        for trial in xrange(0, trials):
            if progress != '':
                index += 1
                progress.setValue(index)
                progress.setLabelText(feature + ' - Trial = ' + str(trial))

            power = 0
            processedReplicates = 0
            for dummy in xrange(0, bootstrapRep):
                # draw c1 ~ Binomial(parentSeq1, p1), c2 ~ Binomial(parentSeq2, p2)
                # one Bernoulli trial at a time
                c1 = 0
                c2 = 0
                for dummy in xrange(0, parentSeq1):
                    rnd = random.random()
                    if rnd <= p1:
                        c1 += 1

                for dummy in xrange(0, parentSeq2):
                    rnd = random.random()
                    if rnd <= p2:
                        c2 += 1

                if c1 == 0 and c2 == 0:
                    # This is a special case that many hypothesis test will not handle correctly
                    # so we just ignore it. This will have little effect on the calculated power
                    # of a test.
                    continue

                processedReplicates += 1

                pValueOneSided, pValueTwoSided = test.hypothesisTest(
                    c1, c2, parentSeq1, parentSeq2)
                if pValueTwoSided < signLevel:
                    power += 1

            if processedReplicates > 0:
                # bucket the estimate by whether the observed counts are small
                if min([seq1, seq2]) <= 5:
                    powerListLess5.append(
                        float(power) / processedReplicates)
                else:
                    powerListGreater5.append(
                        float(power) / processedReplicates)

                powerList.append(float(power) / processedReplicates)

        # NOTE: rebinds the loop variable 'row'; iteration is unaffected
        row = []
        row.append(feature)
        row.append(seq1)
        row.append(seq2)
        row.append(parentSeq1)
        row.append(parentSeq2)
        row.append(float(seq1) / parentSeq1)
        row.append(float(seq2) / parentSeq2)

        row.append(mean(powerList))
        row.append(stdDev(powerList))

        # empty buckets yield NaN from mean/stdDev; report them as ''
        if math.isnan(mean(powerListLess5)):
            row.append('')
        else:
            row.append(mean(powerListLess5))

        if math.isnan(stdDev(powerListLess5)):
            row.append('')
        else:
            row.append(stdDev(powerListLess5))

        if math.isnan(mean(powerListGreater5)):
            row.append('')
        else:
            row.append(mean(powerListGreater5))

        if math.isnan(stdDev(powerListGreater5)):
            row.append('')
        else:
            row.append(stdDev(powerListGreater5))

        tableData.append(row)

    return tableData
def run(self, statTest, effectSizeMeasure, profile, progress = None): self.results.data = [] self.results.test = statTest.name self.results.profile = profile if progress == 'Verbose': print ' Processing feature:' index = 0 for feature in profile.getFeatures(): if progress == 'Verbose': print ' ' + feature elif progress != None: if progress.wasCanceled(): self.results.data = [] return index += 1 progress.setValue(index) seqCount = profile.getActiveFeatureCounts(feature) parentCount = profile.getActiveParentCounts(feature) data = profile.getActiveFeatureProportions(feature) pValue, note = statTest.hypothesisTest(data) effectSize = effectSizeMeasure.run(data) row = [feature, float(pValue), float(pValue), effectSize, note] for i in xrange(0, len(seqCount)): propGroup = [] for j in xrange(0, len(seqCount[i])): propGroup.append(seqCount[i][j] * 100.0 / parentCount[i][j]) meanGroup = mean(propGroup) row.append(meanGroup) row.append(stdDev(propGroup, meanGroup)) for i in xrange(0, len(seqCount)): for j in xrange(0, len(seqCount[i])): row.append(seqCount[i][j]) row.append(parentCount[i][j]) row.append(seqCount[i][j] * 100.0 / parentCount[i][j]) self.results.data.append(row) headingsSampleStats = [] for i in xrange(0, len(profile.activeSamplesInGroups)): for sampleName in profile.activeSamplesInGroups[i]: headingsSampleStats.append(sampleName) headingsSampleStats.append(sampleName + ': parent seq. count') headingsSampleStats.append(sampleName + ': rel. freq. (%)') self.results.createTableHeadings(profile.activeGroupNames, headingsSampleStats) if len(self.results.data) >= 1: # sort results according to p-values self.results.data = TableHelper.SortTable(self.results.data, [self.results.dataHeadings['pValues']]) if progress != None and progress != 'Verbose': index += 1 progress.setValue(index)
def plot(self, profile, statsResults):
    """Draw a scatter plot of per-group mean relative frequencies.

    Each feature in the two-group profile becomes one point: x = group 1
    mean proportion, y = group 2 mean proportion. Optionally draws spread
    bars (std. dev. / percentiles / min-max per self.spreadMethod), marginal
    histograms, a y=x reference line, and an R^2 annotation; installs a
    tooltip handler showing per-feature statistics.
    """
    if len(profile.profileDict) <= 0:
        self.emptyAxis()
        return

    # warn before rendering very large profiles; let the user bail out
    if len(profile.profileDict) > 10000:
        QtGui.QApplication.instance().setOverrideCursor(
            QtGui.QCursor(QtCore.Qt.ArrowCursor))
        reply = QtGui.QMessageBox.question(
            self, 'Continue?', 'Profile contains ' +
            str(len(profile.profileDict)) + ' features. ' +
            'It may take several seconds to generate this plot. Exploring the data at a higher hierarchy level is recommended. ' +
            'Do you wish to continue?',
            QtGui.QMessageBox.Yes, QtGui.QMessageBox.No)
        QtGui.QApplication.instance().restoreOverrideCursor()

        if reply == QtGui.QMessageBox.No:
            self.emptyAxis()
            return

    # *** Colour of plot elements
    axesColour = str(self.preferences['Axes colour'].name())
    group1Colour = str(
        self.preferences['Group colours'][profile.groupName1].name())
    group2Colour = str(
        self.preferences['Group colours'][profile.groupName2].name())

    # *** Set sample names
    self.groupName1 = profile.groupName1
    self.groupName2 = profile.groupName2

    # *** Create lists for each quantity of interest and calculate spread of data
    groupData1, groupData2 = profile.getFeatureProportionsAll()
    features = profile.getFeatures()

    field1 = []
    field2 = []
    xSpread = []
    ySpread = []
    for i in xrange(0, len(groupData1)):
        mean1 = mean(groupData1[i])
        mean2 = mean(groupData2[i])
        field1.append(mean1)
        field2.append(mean2)

        # each spread entry is a [low, high] pair clamped to [0, 100]
        if self.spreadMethod == 'standard deviation':
            xSpread.append([
                max(mean1 - stdDev(groupData1[i], mean1), 0),
                min(mean1 + stdDev(groupData1[i], mean1), 100)
            ])
            ySpread.append([
                max(mean2 - stdDev(groupData2[i], mean2), 0),
                min(mean2 + stdDev(groupData2[i], mean2), 100)
            ])
        elif self.spreadMethod == '2 * standard deviation':
            xSpread.append([
                max(mean1 - 2 * stdDev(groupData1[i], mean1), 0),
                min(mean1 + 2 * stdDev(groupData1[i], mean1), 100)
            ])
            ySpread.append([
                max(mean2 - 2 * stdDev(groupData2[i], mean2), 0),
                min(mean2 + 2 * stdDev(groupData2[i], mean2), 100)
            ])
        elif self.spreadMethod == '25th and 75th percentile':
            spread1 = mquantiles(groupData1[i], prob=[0.25, 0.75])
            spread2 = mquantiles(groupData2[i], prob=[0.25, 0.75])
            xSpread.append([max(spread1[0], 0), min(spread1[1], 100)])
            ySpread.append([max(spread2[0], 0), min(spread2[1], 100)])
        elif self.spreadMethod == '9th and 91st percentile':
            spread1 = mquantiles(groupData1[i], prob=[0.09, 0.91])
            spread2 = mquantiles(groupData2[i], prob=[0.09, 0.91])
            xSpread.append([max(spread1[0], 0), min(spread1[1], 100)])
            ySpread.append([max(spread2[0], 0), min(spread2[1], 100)])
        elif self.spreadMethod == '2nd and 98th percentile':
            spread1 = mquantiles(groupData1[i], prob=[0.02, 0.98])
            spread2 = mquantiles(groupData2[i], prob=[0.02, 0.98])
            xSpread.append([max(spread1[0], 0), min(spread1[1], 100)])
            ySpread.append([max(spread2[0], 0), min(spread2[1], 100)])
        elif self.spreadMethod == 'minimum and maximum':
            # NOTE(review): unlike the other branches this appends
            # [max, min] rather than [min, max]; harmless for the line
            # segments drawn below, but looks inconsistent — confirm intent.
            xSpread.append([max(groupData1[i]), min(groupData1[i])])
            ySpread.append([max(groupData2[i]), min(groupData2[i])])

    # *** Set figure size
    self.fig.clear()
    self.fig.set_size_inches(self.figWidth, self.figHeight)

    if self.bShowHistograms:
        histogramSizeX = self.histogramSize / self.figWidth
        histogramSizeY = self.histogramSize / self.figHeight
    else:
        histogramSizeX = 0.0
        histogramSizeY = 0.0

    padding = 0.1  # inches
    xOffsetFigSpace = (0.4 + padding) / self.figWidth
    yOffsetFigSpace = (0.3 + padding) / self.figHeight
    # main scatter axes fills the figure minus label/histogram margins
    axesScatter = self.fig.add_axes([
        xOffsetFigSpace, yOffsetFigSpace,
        1.0 - xOffsetFigSpace - histogramSizeX - (2 * padding) / self.figWidth,
        1.0 - yOffsetFigSpace - histogramSizeY - (2 * padding) / self.figHeight
    ])

    if self.bShowHistograms:
        axesTopHistogram = self.fig.add_axes([
            xOffsetFigSpace,
            1.0 - histogramSizeY - padding / self.figHeight,
            1.0 - xOffsetFigSpace - histogramSizeX - (2 * padding) / self.figWidth,
            histogramSizeY
        ])

        axesRightHistogram = self.fig.add_axes([
            1.0 - histogramSizeX - padding / self.figWidth,
            yOffsetFigSpace, histogramSizeX,
            1.0 - yOffsetFigSpace - histogramSizeY - (2 * padding) / self.figHeight
        ])

    # *** Handle mouse events
    tooltips = []
    for i in xrange(0, len(field1)):
        tooltip = features[i] + '\n\n'
        tooltip += (self.groupName1 + ' mean proportion: %.3f' % field1[i]) + '\n'
        tooltip += (self.groupName2 + ' mean proportion: %.3f' % field2[i]) + '\n\n'
        tooltip += 'Difference between mean proportions (%): ' + (
            '%.3f' % (field1[i] - field2[i])) + '\n'
        if field2[i] != 0:
            tooltip += 'Ratio of mean proportions: %.3f' % (field1[i] / field2[i])
        else:
            tooltip += 'Ratio of mean proportions: undefined'

        # append p-values only when a statistical test has been run
        if statsResults.profile != None:
            pValue = statsResults.getFeatureStatisticAsStr(
                features[i], 'pValues')
            pValueCorrected = statsResults.getFeatureStatisticAsStr(
                features[i], 'pValuesCorrected')
            tooltip += '\n\n'
            tooltip += 'p-value: ' + pValue + '\n'
            tooltip += 'Corrected p-value: ' + pValueCorrected
        tooltips.append(tooltip)

    self.plotEventHandler = PlotEventHandler(field1, field2, tooltips)
    self.mouseEventCallback(self.plotEventHandler)

    # *** Calculate R^2 value
    slope, intercept, r_value, p_value, std_err = linregress(
        field1, field2)

    # *** Plot data

    # set visual properties of all points: colour a point by whichever
    # group has the larger mean proportion
    colours = []
    highlightedField1 = []
    highlightedField2 = []
    highlighColours = []
    for i in xrange(0, len(field1)):
        if field1[i] > field2[i]:
            colours.append(group1Colour)
        else:
            colours.append(group2Colour)

        if features[i] in self.preferences['Highlighted group features']:
            highlightedField1.append(field1[i])
            highlightedField2.append(field2[i])
            highlighColours.append(colours[i])

    # scatter plot
    axesScatter.scatter(field1, field2, c=colours, s=self.markerSize, zorder=5)
    if len(highlightedField1) > 0:
        # re-plot highlighted features on top with a red outline
        axesScatter.scatter(highlightedField1, highlightedField2,
                            c=highlighColours, s=self.markerSize,
                            edgecolors='red', linewidth=2, zorder=10)

    # plot CIs
    if self.spreadMethod != 'None':
        # build one polyline with None separators so all spread bars are
        # drawn in a single plot() call
        xlist = []
        ylist = []
        for i in xrange(0, len(field1)):
            # horizontal CIs
            xlist.append(xSpread[i][0])
            xlist.append(xSpread[i][1])
            xlist.append(None)
            ylist.append(field2[i])
            ylist.append(field2[i])
            ylist.append(None)

            # vertical CIs
            xlist.append(field1[i])
            xlist.append(field1[i])
            xlist.append(None)
            ylist.append(ySpread[i][0])
            ylist.append(ySpread[i][1])
            ylist.append(None)

        axesScatter.plot(xlist, ylist, '-', color='gray', antialiased=False)

    # plot y=x line
    maxProportion = max(max(field1), max(field2)) * 1.05
    axesScatter.plot([0, maxProportion], [0, maxProportion],
                     color=axesColour, linestyle='dashed', marker='',
                     zorder=1)

    axesScatter.set_xlabel(self.groupName1 + ' (%)')
    axesScatter.set_ylabel(self.groupName2 + ' (%)')

    if self.bShowR2:
        axesScatter.text(0.02, 0.98, r'R$^2$ = ' + ('%0.3f' % r_value**2),
                         horizontalalignment='left',
                         verticalalignment='top',
                         transform=axesScatter.transAxes)

    axesScatter.set_xlim(0, maxProportion)
    axesScatter.set_ylim(0, maxProportion)

    # *** Prettify scatter plot
    for line in axesScatter.yaxis.get_ticklines():
        line.set_color(axesColour)

    for line in axesScatter.xaxis.get_ticklines():
        line.set_color(axesColour)

    for loc, spine in axesScatter.spines.iteritems():
        spine.set_color(axesColour)

    # plot histograms
    if not self.bShowHistograms:
        # no histograms: show ticks only on the left/bottom and hide the
        # top/right spines
        for a in axesScatter.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axesScatter.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axesScatter.yaxis.get_ticklines():
            line.set_color(axesColour)

        for line in axesScatter.xaxis.get_ticklines():
            line.set_color(axesColour)

        for loc, spine in axesScatter.spines.iteritems():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(axesColour)
    else:  # show histograms
        # plot top histogram
        axesTopHistogram.xaxis.set_major_formatter(NullFormatter())
        pdf, bins, patches = axesTopHistogram.hist(field1,
                                                   bins=self.numBins,
                                                   facecolor=group1Colour)
        axesTopHistogram.set_xlim(axesScatter.get_xlim())
        axesTopHistogram.set_yticks([0, max(pdf)])
        axesTopHistogram.set_ylim([0, max(pdf) * 1.05])

        # plot right histogram
        axesRightHistogram.yaxis.set_major_formatter(NullFormatter())
        pdf, bins, patches = axesRightHistogram.hist(
            field2, bins=self.numBins, orientation='horizontal',
            facecolor=group2Colour)
        axesRightHistogram.set_ylim(axesScatter.get_ylim())
        axesRightHistogram.set_xticks([0, max(pdf)])
        axesRightHistogram.set_xlim([0, max(pdf) * 1.05])

        # *** Prettify histogram plot
        for a in axesTopHistogram.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axesTopHistogram.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axesTopHistogram.yaxis.get_ticklines():
            line.set_color(axesColour)

        for line in axesTopHistogram.xaxis.get_ticklines():
            line.set_color(axesColour)

        for loc, spine in axesTopHistogram.spines.iteritems():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(axesColour)

        for a in axesRightHistogram.yaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for a in axesRightHistogram.xaxis.majorTicks:
            a.tick1On = True
            a.tick2On = False

        for line in axesRightHistogram.yaxis.get_ticklines():
            line.set_color(axesColour)

        for line in axesRightHistogram.xaxis.get_ticklines():
            line.set_color(axesColour)

        for loc, spine in axesRightHistogram.spines.iteritems():
            if loc in ['right', 'top']:
                spine.set_color('none')
            else:
                spine.set_color(axesColour)

    self.updateGeometry()
    self.draw()
# Summarize coverage and CI-length statistics for each confidence-interval
# method and write them to the output file.
# NOTE(review): the coverage/length lists and `fout` are defined earlier in
# this script (outside this view); this span assumes they are fully
# populated and that `fout` is an open writable file.
results = [
    coverageListDP, coverageListDPCC, coverageListNW, coverageListWoolf,
    coverageListGart, coverageListRP
]
lengths = [
    ciLengthDP, ciLengthDPCC, ciLengthNW, ciLengthWoolf, ciLengthGart,
    ciLengthRP
]
# NOTE(review): 'Asympototic' typo preserved — it is a runtime output label.
methodNames = [
    'DP: Asymptotic', 'DP: Asymptotic-CC', 'Newcombe-Wilson', 'Woolf',
    'Gart', 'RP: Asympototic'
]

for i in xrange(0, len(results)):
    # coverage reported as mean +/- sd [min; max]
    coverageMeanStr = '%.2f' % mean(results[i])
    coverageSdStr = '%.2f' % stdDev(results[i])
    coverageMinStr = '%.2f' % min(results[i])
    coverageMaxStr = '%.2f' % max(results[i])

    # CI length reported as mean +/- sd
    lengthMeanStr = '%.2f' % mean(lengths[i])
    lengthSdStr = '%.2f' % stdDev(lengths[i])

    fout.write(methodNames[i] + '\n')
    fout.write(coverageMeanStr + '+/-' + coverageSdStr + '[' +
               coverageMinStr + ';' + coverageMaxStr + ']\n')
    fout.write(lengthMeanStr + '+/-' + lengthSdStr + '\n')
    fout.write('\n')

fout.close()
def run(self, statTest, testType, confIntervMethod, coverage, profile, progress=None):
    """Run a two-group hypothesis test on every feature in a profile.

    Populates self.results.data with one row per feature: feature name,
    per-group mean and standard deviation of relative frequencies (%), the
    p-value twice (second copy is the slot for a corrected value), effect
    size, CI bounds, a note, and per-sample counts / parent counts /
    relative frequencies. Rows are sorted by p-value.

    statTest -- test object; if .bSingleFeatureInterface, features are
                processed one at a time via .run(...), otherwise all at once
                via .runAll(...)
    testType -- 'One-sided' or 'Two-sided' (selects which p-value is kept)
    confIntervMethod -- confidence interval method passed through to the test
    coverage -- desired CI coverage; results.alpha is set to 1 - coverage
    profile -- two-group profile supplying per-feature counts
    progress -- None, 'Verbose' (report to stdout), or a Qt-style progress
                dialog with wasCanceled()/setValue()
    """
    self.results.test = statTest.name
    self.results.testType = testType
    self.results.alpha = 1.0 - coverage
    self.results.confIntervMethod = confIntervMethod
    self.results.profile = profile

    if progress == 'Verbose':
        print ' Processing feature:'

    self.results.data = []
    index = 0

    # calculate statistics
    seqsGroup1 = []
    seqsGroup2 = []
    parentSeqsGroup1 = []
    parentSeqsGroup2 = []
    pValues = []
    lowerCIs = []
    upperCIs = []
    effectSizes = []
    notes = []
    if statTest.bSingleFeatureInterface:
        # process features one at a time
        for feature in profile.getFeatures():
            if progress == 'Verbose':
                print ' ' + feature
            elif progress != None:
                # abort cleanly if the user cancelled the progress dialog
                if progress.wasCanceled():
                    self.results.data = []
                    return
                index += 1
                progress.setValue(index)

            # get statistics
            seqGroup1, seqGroup2 = profile.getFeatureCounts(feature)
            parentSeqGroup1, parentSeqGroup2 = profile.getParentFeatureCounts(feature)
            results = statTest.run(seqGroup1, seqGroup2,
                                   parentSeqGroup1, parentSeqGroup2,
                                   confIntervMethod, coverage)
            pValueOneSided, pValueTwoSided, lowerCI, upperCI, effectSize, note = results

            if testType == 'One-sided':
                pValue = pValueOneSided
            elif testType == 'Two-sided':
                pValue = pValueTwoSided
            else:
                print 'Error: Unknown test type.'

            # record results
            seqsGroup1.append(seqGroup1)
            seqsGroup2.append(seqGroup2)
            parentSeqsGroup1.append(parentSeqGroup1)
            parentSeqsGroup2.append(parentSeqGroup2)
            pValues.append(pValue)
            lowerCIs.append(lowerCI)
            upperCIs.append(upperCI)
            effectSizes.append(effectSize)
            notes.append(note)

        if progress != None and progress != 'Verbose':
            # final tick so the progress dialog reaches completion
            index += 1
            progress.setValue(index)
    else:
        # process all features at once
        seqsGroup1, seqsGroup2 = profile.getFeatureCountsAll()
        parentSeqsGroup1, parentSeqsGroup2 = profile.getParentFeatureCountsAll()
        pValuesOneSided, pValuesTwoSided, lowerCIs, upperCIs, effectSizes, notes = statTest.runAll(
            seqsGroup1, seqsGroup2, parentSeqsGroup1, parentSeqsGroup2,
            confIntervMethod, coverage, progress)

        if progress == 'Verbose':
            print ' Processing all features...'
        elif progress != None and progress.wasCanceled():
            self.results.data = []
            return

        if testType == 'One-sided':
            pValues = pValuesOneSided
        elif testType == 'Two-sided':
            pValues = pValuesTwoSided
        else:
            print 'Error: Unknown test type.'

    # record statistics
    features = profile.getFeatures()
    for i in xrange(0, len(features)):
        # per-sample relative frequencies (%) for each group; samples with
        # no parent sequences contribute 0 rather than dividing by zero
        propGroup1 = []
        for j in xrange(0, len(seqsGroup1[i])):
            sg1 = seqsGroup1[i][j]
            psg1 = parentSeqsGroup1[i][j]
            if psg1 > 0:
                propGroup1.append(sg1 * 100.0 / psg1)
            else:
                propGroup1.append(0.0)

        propGroup2 = []
        for j in xrange(0, len(seqsGroup2[i])):
            sg2 = seqsGroup2[i][j]
            psg2 = parentSeqsGroup2[i][j]
            if psg2 > 0:
                propGroup2.append(sg2 * 100.0 / psg2)
            else:
                propGroup2.append(0.0)

        meanGroup1 = mean(propGroup1)
        meanGroup2 = mean(propGroup2)

        # pValue appears twice: the second copy is the (to be) corrected value
        row = [features[i], meanGroup1, stdDev(propGroup1, meanGroup1),
               meanGroup2, stdDev(propGroup2, meanGroup2),
               float(pValues[i]), float(pValues[i]), float(effectSizes[i]),
               float(lowerCIs[i]), float(upperCIs[i]), notes[i]]

        for j in xrange(0, len(seqsGroup1[i])):
            row.append(seqsGroup1[i][j])
            row.append(parentSeqsGroup1[i][j])
            if parentSeqsGroup1[i][j] > 0:
                row.append(seqsGroup1[i][j] * 100.0 / parentSeqsGroup1[i][j])
            else:
                row.append(0.0)

        for j in xrange(0, len(seqsGroup2[i])):
            row.append(seqsGroup2[i][j])
            row.append(parentSeqsGroup2[i][j])
            if parentSeqsGroup2[i][j] > 0:
                row.append(seqsGroup2[i][j] * 100.0 / parentSeqsGroup2[i][j])
            else:
                row.append(0.0)

        self.results.data.append(row)

    # column headings must match the per-sample row layout built above
    headingsSampleStats = []
    for sampleName in (profile.samplesInGroup1 + profile.samplesInGroup2):
        headingsSampleStats.append(sampleName)
        headingsSampleStats.append(sampleName + ': parent seq. count')
        headingsSampleStats.append(sampleName + ': rel. freq. (%)')

    self.results.createTableHeadings(profile.groupName1, profile.groupName2,
                                     headingsSampleStats)

    # sort results according to p-values
    if len(self.results.data) >= 1:
        self.results.data = TableHelper.SortTable(
            self.results.data, [self.results.dataHeadings['pValues']])
def run(self, confIntervMethod, coverage, tables, trials, bootstrapRep, progress):
    """Estimate the coverage of a confidence interval method by simulation.

    For each row [feature, seq1, seq2, parentSeq1, parentSeq2] in tables,
    repeatedly draws bootstrap counts from binomial distributions with the
    observed proportions and measures how often the recomputed CI contains
    the effect size observed for the original table.

    confIntervMethod -- object with .run(c1, c2, parent1, parent2, coverage)
                        -> (lower CI, upper CI, effect size)
    coverage -- nominal coverage passed to the CI method
    tables -- rows of observed counts (layout above)
    trials -- independent coverage estimates per feature
    bootstrapRep -- bootstrap replicates per trial
    progress -- '' for no reporting, otherwise a Qt-style progress dialog

    Returns a table with one row per feature: counts, proportions, mean and
    std. dev. of coverage over all trials, then the same summaries
    restricted to features with min(seq1, seq2) <= 5 and > 5 ('' when that
    subset is empty, detected via NaN from mean/stdDev).
    """
    tableData = []

    index = 0
    for row in tables:
        feature = row[0]
        seq1 = row[1]
        seq2 = row[2]
        parentSeq1 = row[3]
        parentSeq2 = row[4]

        # effect size observed for the original table; the CI bounds
        # computed here are discarded (lowerCI/upperCI are reused below)
        lowerCI, upperCI, obsEffectSize = confIntervMethod.run(
            seq1, seq2, parentSeq1, parentSeq2, coverage)

        p1 = float(seq1) / parentSeq1
        p2 = float(seq2) / parentSeq2

        coverageList = []
        coverageListLess5 = []
        coverageListGreater5 = []
        for trial in xrange(0, trials):
            if progress != '':
                index += 1
                progress.setValue(index)
                progress.setLabelText(feature + ' - Trial = ' + str(trial))

            containedRep = 0
            for dummy in xrange(0, bootstrapRep):
                c1 = binomial(parentSeq1, p1)
                c2 = binomial(parentSeq2, p2)

                lowerCI, upperCI, effectSize = confIntervMethod.run(
                    c1, c2, parentSeq1, parentSeq2, coverage)

                if obsEffectSize >= lowerCI and obsEffectSize <= upperCI:
                    containedRep += 1

            # bucket the estimate by whether the observed counts are small
            if min([seq1, seq2]) <= 5:
                coverageListLess5.append(
                    float(containedRep) / bootstrapRep)
            else:
                coverageListGreater5.append(
                    float(containedRep) / bootstrapRep)

            coverageList.append(float(containedRep) / bootstrapRep)

        # NOTE: rebinds the loop variable 'row'; iteration is unaffected
        row = []
        row.append(feature)
        row.append(seq1)
        row.append(seq2)
        row.append(parentSeq1)
        row.append(parentSeq2)
        row.append(float(seq1) / parentSeq1)
        row.append(float(seq2) / parentSeq2)

        row.append(mean(coverageList))
        row.append(stdDev(coverageList))

        # empty buckets yield NaN from mean/stdDev; report them as ''
        if math.isnan(mean(coverageListLess5)):
            row.append('')
        else:
            row.append(mean(coverageListLess5))

        if math.isnan(stdDev(coverageListLess5)):
            row.append('')
        else:
            row.append(stdDev(coverageListLess5))

        if math.isnan(mean(coverageListGreater5)):
            row.append('')
        else:
            row.append(mean(coverageListGreater5))

        if math.isnan(stdDev(coverageListGreater5)):
            row.append('')
        else:
            row.append(stdDev(coverageListGreater5))

        tableData.append(row)

    return tableData
def run(self, test, signLevel, statsResults, trials, bootstrapRep, progress):
    """Estimate the power of a two-sample hypothesis test via simulation.

    For each feature row [feature, seq1, seq2, parentSeq1, parentSeq2] in
    statsResults, counts are repeatedly resampled from binomial
    distributions with the observed proportions; power is the fraction of
    replicates whose two-sided p-value falls below signLevel.

    Returns one row per feature: the observed counts and proportions, the
    mean and standard deviation of power over all trials, and the same
    summaries restricted to small-count (min <= 5) and large-count (> 5)
    features ('' when a subset is empty).
    """
    tableData = []

    index = 0
    for statRow in statsResults:
        feature = statRow[0]
        seq1 = statRow[1]
        seq2 = statRow[2]
        parentSeq1 = statRow[3]
        parentSeq2 = statRow[4]

        prob1 = float(seq1) / parentSeq1
        prob2 = float(seq2) / parentSeq2

        allPower = []
        smallCountPower = []
        largeCountPower = []
        for trial in xrange(trials):
            if progress != '':
                index += 1
                progress.setValue(index)
                progress.setLabelText(feature + ' - Trial = ' + str(trial))

            rejections = 0
            validReplicates = 0
            for _ in xrange(bootstrapRep):
                # resample counts one Bernoulli draw at a time
                c1 = sum(1 for _ in xrange(parentSeq1) if random.random() <= prob1)
                c2 = sum(1 for _ in xrange(parentSeq2) if random.random() <= prob2)

                if c1 == 0 and c2 == 0:
                    # Many hypothesis tests mishandle the all-zero table, so
                    # skip it; this has little effect on the estimated power.
                    continue

                validReplicates += 1

                pValueOneSided, pValueTwoSided = test.hypothesisTest(
                    c1, c2, parentSeq1, parentSeq2)
                if pValueTwoSided < signLevel:
                    rejections += 1

            if validReplicates > 0:
                estimate = float(rejections) / validReplicates
                if min([seq1, seq2]) <= 5:
                    smallCountPower.append(estimate)
                else:
                    largeCountPower.append(estimate)
                allPower.append(estimate)

        resultRow = [feature, seq1, seq2, parentSeq1, parentSeq2,
                     float(seq1) / parentSeq1, float(seq2) / parentSeq2,
                     mean(allPower), stdDev(allPower)]

        # empty buckets yield NaN from mean/stdDev; report them as ''
        for summary in (mean(smallCountPower), stdDev(smallCountPower),
                        mean(largeCountPower), stdDev(largeCountPower)):
            resultRow.append('' if math.isnan(summary) else summary)

        tableData.append(resultRow)

    return tableData
# Record the RP CI length and, for each CI method, the fraction of bootstrap
# replicates whose interval contained the true value.
# NOTE(review): these appends reference containedRep*/replicates/lowerCI/
# upperCI computed earlier in this script (outside this view) and appear to
# belong inside the enclosing simulation loop — confirm against the full file.
ciLengthRP.append(upperCI - lowerCI)

coverageListDP.append(float(containedRepDP) / replicates)
coverageListDPCC.append(float(containedRepDPCC) / replicates)
coverageListNW.append(float(containedRepNW) / replicates)
coverageListWoolf.append(float(containedRepWoolf) / replicates)
coverageListGart.append(float(containedRepGart) / replicates)
coverageListRP.append(float(containedRepRP) / replicates)

# Summarize coverage and CI-length statistics for each method and write them
# to the (already open) output file `fout`.
results = [coverageListDP, coverageListDPCC, coverageListNW,
           coverageListWoolf, coverageListGart, coverageListRP]
lengths = [ciLengthDP, ciLengthDPCC, ciLengthNW, ciLengthWoolf,
           ciLengthGart, ciLengthRP]
# NOTE(review): 'Asympototic' typo preserved — it is a runtime output label.
methodNames = ['DP: Asymptotic', 'DP: Asymptotic-CC', 'Newcombe-Wilson',
               'Woolf', 'Gart', 'RP: Asympototic']

for i in xrange(0, len(results)):
    # coverage reported as mean +/- sd [min; max]
    coverageMeanStr = '%.2f' % mean(results[i])
    coverageSdStr = '%.2f' % stdDev(results[i])
    coverageMinStr = '%.2f' % min(results[i])
    coverageMaxStr = '%.2f' % max(results[i])

    # CI length reported as mean +/- sd
    lengthMeanStr = '%.2f' % mean(lengths[i])
    lengthSdStr = '%.2f' % stdDev(lengths[i])

    fout.write(methodNames[i] + '\n')
    fout.write(coverageMeanStr + '+/-' + coverageSdStr + '[' +
               coverageMinStr + ';' + coverageMaxStr + ']\n')
    fout.write(lengthMeanStr + '+/-' + lengthSdStr + '\n')
    fout.write('\n')

fout.close()
def run(self, statTest, testType, confIntervMethod, coverage, profile, progress=None):
    """Run a two-group hypothesis test on every feature in a profile.

    Populates self.results.data with one row per feature: feature name,
    per-group mean and standard deviation of relative frequencies (%), the
    p-value twice (second copy is the slot for a corrected value), effect
    size, CI bounds, a note, and per-sample counts / parent counts /
    relative frequencies. Rows are sorted by p-value.

    statTest -- test object; if .bSingleFeatureInterface, features are
                processed one at a time via .run(...), otherwise all at once
                via .runAll(...)
    testType -- 'One-sided' or 'Two-sided' (selects which p-value is kept)
    confIntervMethod -- confidence interval method passed through to the test
    coverage -- desired CI coverage; results.alpha is set to 1 - coverage
    profile -- two-group profile supplying per-feature counts
    progress -- None, 'Verbose' (report to stdout), or a Qt-style progress
                dialog with wasCanceled()/setValue()
    """
    self.results.test = statTest.name
    self.results.testType = testType
    self.results.alpha = 1.0 - coverage
    self.results.confIntervMethod = confIntervMethod
    self.results.profile = profile

    if progress == 'Verbose':
        print ' Processing feature:'

    self.results.data = []
    index = 0

    # calculate statistics
    seqsGroup1 = []
    seqsGroup2 = []
    parentSeqsGroup1 = []
    parentSeqsGroup2 = []
    pValues = []
    lowerCIs = []
    upperCIs = []
    effectSizes = []
    notes = []
    if statTest.bSingleFeatureInterface:
        # process features one at a time
        for feature in profile.getFeatures():
            if progress == 'Verbose':
                print ' ' + feature
            elif progress != None:
                # abort cleanly if the user cancelled the progress dialog
                if progress.wasCanceled():
                    self.results.data = []
                    return
                index += 1
                progress.setValue(index)

            # get statistics
            seqGroup1, seqGroup2 = profile.getFeatureCounts(feature)
            parentSeqGroup1, parentSeqGroup2 = profile.getParentFeatureCounts(
                feature)
            results = statTest.run(seqGroup1, seqGroup2,
                                   parentSeqGroup1, parentSeqGroup2,
                                   confIntervMethod, coverage)
            pValueOneSided, pValueTwoSided, lowerCI, upperCI, effectSize, note = results

            if testType == 'One-sided':
                pValue = pValueOneSided
            elif testType == 'Two-sided':
                pValue = pValueTwoSided
            else:
                print 'Error: Unknown test type.'

            # record results
            seqsGroup1.append(seqGroup1)
            seqsGroup2.append(seqGroup2)
            parentSeqsGroup1.append(parentSeqGroup1)
            parentSeqsGroup2.append(parentSeqGroup2)
            pValues.append(pValue)
            lowerCIs.append(lowerCI)
            upperCIs.append(upperCI)
            effectSizes.append(effectSize)
            notes.append(note)

        if progress != None and progress != 'Verbose':
            # final tick so the progress dialog reaches completion
            index += 1
            progress.setValue(index)
    else:
        # process all features at once
        seqsGroup1, seqsGroup2 = profile.getFeatureCountsAll()
        parentSeqsGroup1, parentSeqsGroup2 = profile.getParentFeatureCountsAll(
        )
        pValuesOneSided, pValuesTwoSided, lowerCIs, upperCIs, effectSizes, notes = statTest.runAll(
            seqsGroup1, seqsGroup2, parentSeqsGroup1, parentSeqsGroup2,
            confIntervMethod, coverage, progress)

        if progress == 'Verbose':
            print ' Processing all features...'
        elif progress != None and progress.wasCanceled():
            self.results.data = []
            return

        if testType == 'One-sided':
            pValues = pValuesOneSided
        elif testType == 'Two-sided':
            pValues = pValuesTwoSided
        else:
            print 'Error: Unknown test type.'

    # record statistics
    features = profile.getFeatures()
    for i in xrange(0, len(features)):
        # per-sample relative frequencies (%) for each group; samples with
        # no parent sequences contribute 0 rather than dividing by zero
        propGroup1 = []
        for j in xrange(0, len(seqsGroup1[i])):
            sg1 = seqsGroup1[i][j]
            psg1 = parentSeqsGroup1[i][j]
            if psg1 > 0:
                propGroup1.append(sg1 * 100.0 / psg1)
            else:
                propGroup1.append(0.0)

        propGroup2 = []
        for j in xrange(0, len(seqsGroup2[i])):
            sg2 = seqsGroup2[i][j]
            psg2 = parentSeqsGroup2[i][j]
            if psg2 > 0:
                propGroup2.append(sg2 * 100.0 / psg2)
            else:
                propGroup2.append(0.0)

        meanGroup1 = mean(propGroup1)
        meanGroup2 = mean(propGroup2)

        # pValue appears twice: the second copy is the (to be) corrected value
        row = [
            features[i], meanGroup1,
            stdDev(propGroup1, meanGroup1), meanGroup2,
            stdDev(propGroup2, meanGroup2),
            float(pValues[i]),
            float(pValues[i]),
            float(effectSizes[i]),
            float(lowerCIs[i]),
            float(upperCIs[i]), notes[i]
        ]

        for j in xrange(0, len(seqsGroup1[i])):
            row.append(seqsGroup1[i][j])
            row.append(parentSeqsGroup1[i][j])
            if parentSeqsGroup1[i][j] > 0:
                row.append(seqsGroup1[i][j] * 100.0 / parentSeqsGroup1[i][j])
            else:
                row.append(0.0)

        for j in xrange(0, len(seqsGroup2[i])):
            row.append(seqsGroup2[i][j])
            row.append(parentSeqsGroup2[i][j])
            if parentSeqsGroup2[i][j] > 0:
                row.append(seqsGroup2[i][j] * 100.0 / parentSeqsGroup2[i][j])
            else:
                row.append(0.0)

        self.results.data.append(row)

    # column headings must match the per-sample row layout built above
    headingsSampleStats = []
    for sampleName in (profile.samplesInGroup1 + profile.samplesInGroup2):
        headingsSampleStats.append(sampleName)
        headingsSampleStats.append(sampleName + ': parent seq. count')
        headingsSampleStats.append(sampleName + ': rel. freq. (%)')

    self.results.createTableHeadings(profile.groupName1, profile.groupName2,
                                     headingsSampleStats)

    # sort results according to p-values
    if len(self.results.data) >= 1:
        self.results.data = TableHelper.SortTable(
            self.results.data, [self.results.dataHeadings['pValues']])
def run(self, confIntervMethod, coverage, tables, trials, bootstrapRep, progress):
    """Estimate the coverage of a confidence interval method via simulation.

    For each table row [feature, seq1, seq2, parentSeq1, parentSeq2],
    bootstrap counts are repeatedly drawn from binomial distributions with
    the observed proportions; coverage is the fraction of replicates whose
    recomputed CI contains the effect size observed for the original table.

    Returns one row per feature: the observed counts and proportions, the
    mean and standard deviation of coverage over all trials, and the same
    summaries restricted to small-count (min <= 5) and large-count (> 5)
    features ('' when a subset is empty).
    """
    tableData = []

    index = 0
    for table in tables:
        feature = table[0]
        seq1 = table[1]
        seq2 = table[2]
        parentSeq1 = table[3]
        parentSeq2 = table[4]

        # effect size observed for the original table (its CI is discarded)
        lowerCI, upperCI, obsEffectSize = confIntervMethod.run(
            seq1, seq2, parentSeq1, parentSeq2, coverage)

        prob1 = float(seq1) / parentSeq1
        prob2 = float(seq2) / parentSeq2

        allCoverage = []
        smallCountCoverage = []
        largeCountCoverage = []
        for trial in xrange(trials):
            if progress != '':
                index += 1
                progress.setValue(index)
                progress.setLabelText(feature + ' - Trial = ' + str(trial))

            containedRep = 0
            for _ in xrange(bootstrapRep):
                c1 = binomial(parentSeq1, prob1)
                c2 = binomial(parentSeq2, prob2)

                lowerCI, upperCI, effectSize = confIntervMethod.run(
                    c1, c2, parentSeq1, parentSeq2, coverage)

                if lowerCI <= obsEffectSize <= upperCI:
                    containedRep += 1

            fraction = float(containedRep) / bootstrapRep
            if min([seq1, seq2]) <= 5:
                smallCountCoverage.append(fraction)
            else:
                largeCountCoverage.append(fraction)
            allCoverage.append(fraction)

        resultRow = [feature, seq1, seq2, parentSeq1, parentSeq2,
                     float(seq1) / parentSeq1, float(seq2) / parentSeq2,
                     mean(allCoverage), stdDev(allCoverage)]

        # empty buckets yield NaN from mean/stdDev; report them as ''
        for summary in (mean(smallCountCoverage), stdDev(smallCountCoverage),
                        mean(largeCountCoverage), stdDev(largeCountCoverage)):
            resultRow.append('' if math.isnan(summary) else summary)

        tableData.append(resultRow)

    return tableData