def findRange(fld):
    """
    Return the spread of a field across the samples.

    All values of fld are collected through samples.getAllFlds and the
    result is the largest of those values minus the smallest.
    """
    values = samples.getAllFlds(fld)
    highest = max(values)
    lowest = min(values)
    return highest - lowest
def isLinearGrowth(fld, minRange):
    """
    Judge how linearly fld grows across the samples.

    The samples are sorted by fld and a straight line is fitted through the
    values plotted against their position in that order (0, 1, 2, ...). The
    closer the line comes to passing through every value, the higher the
    confidence.

    fld      -- name of the sample field to examine
    minRange -- smallest acceptable difference between the last and the
                first sorted value; below this no fit is attempted

    Returns a SimResult. As a side effect samples.sampleList is left sorted
    by fld.
    """
    claim = "even distribution of sample property '" + fld + "'"

    # With no samples at all there is no first/last element to compare, so
    # indexing below would raise IndexError. Report the same result the
    # function already uses for "not enough samples" instead.
    if not samples.sampleList:
        return SimResult(
            confidence.Confidence(confidence.Applic.df,
                                  confidence.Validity.prob),
            claim, 'fewer than 2 samples', '')

    samples.sampleList.sort(key=lambda x: samples.extractField(x, fld))
    plot = __getPlot('id', fld)

    spread = samples.sampleList[-1][fld] - samples.sampleList[0][fld]
    if spread < minRange:
        return SimResult(
            confidence.Confidence(confidence.Applic.cf,
                                  confidence.Validity.sound),
            claim, 'insufficient distribution of samples', plot)

    fldList = samples.getAllFlds(fld)
    if len(fldList) < 3:
        # Too few values to check for *even* distribution.
        if len(fldList) == 2:
            app = -observations.neareq(fldList[0], fldList[1])
            # The previous "app.isTrue() and '' or 'not '" always produced
            # 'not ' because the empty string is falsy.
            # NOTE(review): assumes unary minus negates the neareq result,
            # so a true app means the two values are NOT about equal --
            # confirm against observations/confidence.
            negation = 'not ' if app.isTrue() else ''
            return SimResult(
                confidence.Confidence(app, confidence.Validity.plaus),
                claim, '2 samples ' + negation + 'about equal', plot)
        return SimResult(
            confidence.Confidence(confidence.Applic.df,
                                  confidence.Validity.prob),
            claim, 'fewer than 2 samples', '')

    # Fit the values against their rank in the sorted order.
    line = stats.linregress(range(len(fldList)), fldList)
    slope = line[0]
    intercept = line[1]
    fit = line[2]           # how closely the line fits the values
    significance = line[3]  # statistical significance of the fit

    qual = __getQuality(significance)
    if len(fldList) < 5:
        # A fit through so few points deserves one step less trust.
        qual -= 1
    conf = __getConfidence((.8, .85, .9, .95, .99), fit, qual)
    plot.plotLine(slope, intercept)

    verdict = 'not ' if fit < .9 else ''
    return SimResult(
        conf, claim,
        "'" + fld + "' is " + verdict +
        "evenly distributed among all samples",
        plot)
def correlated(fldA, fldB, dir):
    """
    Look for a trend between fldA and fldB in the requested direction.

    fldA, fldB -- names of the two sample fields to compare
    dir        -- positive to ask for a direct correlation, otherwise an
                  inverse correlation is asked for

    The correlation of the two fields is multiplied by dir before it is
    turned into a confidence, so a trend in the requested direction scores
    high. Returns a SimResult whose description states the correlation that
    was actually observed together with its significance.
    """
    correlation = stats.pearsonr(samples.getAllFlds(fldA),
                                 samples.getAllFlds(fldB))
    coefficient = correlation[0]
    significance = correlation[1]

    directed = coefficient * dir
    conf = __getConfidence((-.1, .2, .5, .7, .85), directed,
                           __getQuality(significance / 2))
    plot = __getPlot(fldA, fldB)

    # What the caller asked for.
    if dir > 0:
        expected = "positive"
    else:
        expected = "negative"

    # Both messages end with the same field pair.
    pair = " correlation between " + fldA + " and " + fldB

    # What the data actually shows.
    if abs(coefficient) < .5:
        observed = 'minimal'
    elif coefficient > 0:
        observed = "positive"
    else:
        observed = "negative"

    return SimResult(conf,
                     expected + pair,
                     observed + pair + '; significance: ' + str(significance),
                     plot)
def skewsField(sample, field):
    """
    Check whether one sample's value for field stands apart from the rest.

    The sample is removed from samples.sampleList while the mean and the
    standard deviation of field are computed over what remains; afterwards
    samples.sampleList is rebound to the copy taken on entry, even if the
    computation fails. The distance of the sample's value from that mean,
    counted in standard deviations, is turned into a confidence.

    sample -- the sample under suspicion; must be in samples.sampleList
    field  -- name of the field to compare

    Returns a SimResult with a plot showing the mean, one standard deviation
    either side of it, and the sample's own value. As a side effect
    samples.sampleList is left sorted by field.
    """
    backup = samples.sampleList[:]
    samples.sampleList.remove(sample)
    try:
        others = samples.getAllFlds(field)
        mean = stats.mean(others)
        stddev = stats.std(others)
        own = sample[field]
        # No spread among the others: report zero deviations rather than
        # divide by zero.
        # NOTE(review): this also reports 0 when the sample differs from
        # otherwise identical values -- confirm that is intended.
        devs = 0 if stddev == 0 else abs(own - mean) / stddev
    finally:
        # Put the full sample list back no matter what happened above.
        samples.sampleList = backup

    # The more samples there are, the more the statistics can be trusted.
    count = len(samples.sampleList)
    if count >= 6:
        qual = confidence.Validity.sound
    elif count >= 3:
        qual = confidence.Validity.prob
    else:
        qual = confidence.Validity.plaus

    conf = __getConfidence((.5, 1, 2, 3, 5), devs, qual)

    samples.sampleList.sort(key=lambda x: samples.extractField(x, field))
    plot = __getPlot('id', field)
    # Horizontal reference lines: mean, mean -/+ one deviation, the sample.
    for level in (mean, mean - stddev, mean + stddev, sample[field]):
        plot.plotLine(0, level)

    label = str(sample)
    return SimResult(conf,
                     label + " has a different " + field +
                     " from other samples",
                     label + "'s value for " + field + ' is ' + str(devs) +
                     ' standard deviations from the mean',
                     plot)