def fishers(ocean, conn, interval, zone, zoneExpected, zoneObserved, globalCount): """ The null hypothesis is that the relative proportions of one variable are independent of the second variable. For example, if you counted the number of male and female mice in two barns, the null hypothesis would be that the proportion of male mice is the same in the two barns. http://udel.edu/~mcdonald/statfishers.html inputs: zone expected, zone observed, not zone expected, not zone observed in zone not in zone observed a b expected c d """ notZoneExpected = globalCount - zoneExpected notZoneObserved = globalCount - zoneObserved if zoneExpected < 1 and zoneObserved == 0: print("** values too low") else: odds, pval = fisher_exact(np.array([[zoneObserved, notZoneObserved], [zoneExpected, notZoneExpected]])) fishers = robjects.r['fisher.test'] res_r = fishers(np.array([[zoneObserved, notZoneObserved], [zoneExpected, notZoneExpected]])) r_p = res_r[0][0] r_odds = res_r[2][0] sql = "insert into fisherResults (ocean, zone, period, fisher, sig) values('" + ocean + "', '" + zone.get('name') + "', '" + str(interval) + "', " + formatOdds(r_odds) + ", " + str(r_p) + ")" utils.executeMysql_All(conn, sql) print("** Fishers Exact: %s: odds: %s, r_odds: %s, r_p: %s, p: %s" % (zone.get('name'), formatOdds(odds), r_odds, r_p, pval))
def getCountOfExtraCoords(conn, zone, ocean, interval): coordinatesList = zone.get('extraCoords') totalCount = 0 for t in coordinatesList: rec = utils.executeMysql_All(conn, buildQuery(ocean, newZone("na", t[0], t[1], t[2], t[3]), 0)) totalCount += len(rec) return totalCount
def exportCohorts(cohorts, ocean='pacific'): conn = connection.new() if ocean.lower() == 'atlantic': sql = ATL_TEMPLATE else: sql = PAC_TEMPLATE for cohort in cohorts: startDate = cohort[0] endDate = cohort[1] timeRange = str(cohort[2]) query = sql.replace('$START_DATE$', startDate).replace('$END_DATE$', endDate).replace('$RANGE$', timeRange).replace('$PATH$', "/tmp") #print query utils.executeMysql_All(conn, query)
def main(): utils.mkdir("../data/drifters") conn = connection.new() sqlDrifterExport = """ SELECT id, obsDateTime, obsDate, obsTime, latitude, longitude, longitudeWest, hasDrogue from gdpAll where obsTime = '6:00' and obsDate = '$$date' and latitude between 15 and 40 and longitude <= 82 and longitude >= 31;""" start_date = date(2008, 12, 4) end_date = date(2009, 4, 30) for single_date in daterange(start_date, end_date): exportDate = strftime("%Y-%m-%d", single_date.timetuple()) sql = sqlDrifterExport.replace("$$date", exportDate) drifters = utils.executeMysql_All(conn, sql) writeDriftersToFile(exportDate, drifters)
def adjust(dataPath, basin): conn = connection.new() fileName = os.path.join(dataPath, "gdp" + basin + "All_adjusted.csv") rFile = open(fileName, 'w') dbTable = 'gdp' + basin + 'All' sql = """ SELECT g.id, g.obsDate, g.obsTime, latitude, longitude from (SELECT id, ObsDate, max(ObsTime) as ObsTime from $dbTable$ group by id, obsDate) as i join $dbTable$ g on i.id = g.id and i.obsDate = g.obsDate and i.obsTime = g.obsTime where g.obsDate >= '2000-01-01' order by g.id, g.ObsDate """ print sql sql = sql.replace('$dbTable$', dbTable) print "running query" results = utils.executeMysql_All(conn, sql) print "query complete. processing" prevId = None prevDate = None seriesList = ['x', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'] for row in results: id = row['id'] obsDate = row['obsDate'] obsTime = row['obsTime'] Lat = row['latitude'] Long = row['longitude'] # reset on each new Id if not id == prevId: prevId = id prevDate = None newId = id seriesIndex = 0 # look for gaps if (not prevDate is None): diff = obsDate - prevDate if diff.days > MAX_GAP: print("gap! %i days") % (diff.days) seriesIndex += 1 newId = str(id) + seriesList[seriesIndex] prevDate = obsDate rFile.write('%s, %s, %s, %s, %s\n' % (newId, obsDate, obsTime, Lat, Long)) sql = "alter table " + dbTable + " add index(id)" utils.executeMysql_Command(conn, sql)
def compare(zoneList, ocean='pacific'): """ creates a one-way chi-squared table of all zones for each interval 1. for each zone, get the percentage of all drifters that exist in it for the baseline a. find the total number b. loop through all zones and compute ratios c. this is the expected ratio 2. do the same for each time period a. find the total number of drifters alive b. compute the ratio of drifters in each zone c. this is the observed ratio d. use the expected ratio and observed ratio to calculate expected and observed counts for each zone this results in a 9x2 table z1 | z2 | z3 | ... obs | x | ... exp | y | ... Null Hypothesis: Each zone has an evenly distributed probability of occuring Alt Hypothesis: At least one of the proportions is different from predicted """ conn = connection.new() #lifetable = buildLifeTable(ocean.lower()) if ocean.lower() == 'atlantic': ocean = 'gdpAtlAdj' else: ocean = 'gdpPacAdj' # clean fishers results table sql = "delete from fisherResults where ocean = '" + ocean + "';" utils.executeMysql_All(conn, sql) sql = "delete from chiSquareResults where ocean = '" + ocean + "';" utils.executeMysql_All(conn, sql) sql = "delete from chiResiduals where ocean = '" + ocean + "';" utils.executeMysql_All(conn, sql) sql = "delete from percentages where ocean = '" + ocean + "';" utils.executeMysql_All(conn, sql) # get baseline data for each zone for zone in zoneList: zone["baselineCount"] = len(utils.executeMysql_All(conn, buildQuery(ocean, zone, 0))) if zone.get('extraCoords'): zone["baselineCount"] = zone.get("baselineCount", 0) + getCountOfExtraCoords(conn, zone, ocean, 0) # get total drifter count baselineDrifterCount = 0 for zone in zoneList: baselineDrifterCount = baselineDrifterCount + zone.get("baselineCount") # define baseline ratios for zone in zoneList: zone["baselineRatio"] = zone.get("baselineCount") / float(baselineDrifterCount) # loop through intervals, running chi-square at each interval for interval in INTERVALS: print("********************************") print("** %s Day Interval Summary **" % (interval)) expected = [] observed = [] # get interval zone counts for zone in zoneList: zone["intervalObserved"] = len(utils.executeMysql_All(conn, buildQuery(ocean, zone, interval))) if zone.get('extraCoords'): zone["intervalObserved"] = zone.get("intervalObserved", 0) + getCountOfExtraCoords(conn, zone, ocean, interval) # get total interval count totalObserved = 0 for zone in zoneList: totalObserved = totalObserved + zone.get("intervalObserved") # collect obs & exp data into list for each zone for zone in zoneList: intervalExpected = totalObserved * zone.get("baselineRatio") zone["intervalExpected"] = intervalExpected #if observedValue >= 5 and expectedValue >= 5: expected.append(intervalExpected) observed.append(zone.get("intervalObserved")) print('%s: baseline ratio: %f, no. drifters: %i, expected: %f, observed: %f' % (zone.get('name'), zone.get("baselineRatio"), totalObserved, intervalExpected, zone.get("intervalObserved"))) sql = "insert into chiResiduals (ocean, period, periodN, zone, residual) values('" + ocean + "', '" + str(interval) + "', " + str(totalObserved) + ", '" + zone.get("name").strip() + "', " + str(zone.get("intervalObserved") - intervalExpected) + ")" utils.executeMysql_All(conn, sql) sql = "insert into percentages (ocean, period, periodN, zone, percent) values('" + ocean + "', '" + str(interval) + "', " + str(totalObserved) + ", '" + zone.get("name").strip() + "', " + str((zone.get("intervalObserved") / float(totalObserved))) + ")" utils.executeMysql_All(conn, sql) # perform and report on chi-square if len(expected) == 0: print("** No data for interval %s available" % (interval)) else: chi = chisquare(np.array(observed), np.array(expected)) print("** Chi-Squared Statistic: %f, p=%s" % (chi[0], chi[1])) sql = "insert into chiSquareResults (ocean, period, n, chiStat, sig, sigStr) values('" + ocean + "', '" + str(interval) + "', " + str(totalObserved) + ", " + str(chi[0]) + ", " + str("{0:.2f}".format(float(chi[1]))) + ", '" + str(float(chi[1])) + "')" utils.executeMysql_All(conn, sql) print("**") for zone in zoneList: # perform Fisher's Exact Test on each zone to determine differnce with rest of data fishers(ocean, conn, interval, zone, zone.get("intervalExpected"), zone.get("intervalObserved"), baselineDrifterCount) print("\n\n")