예제 #1
0
def fishers(ocean, conn, interval, zone, zoneExpected, zoneObserved, globalCount):
    """ Run Fisher's exact test for one zone against the rest of the data.

        The null hypothesis is that the relative proportions of one variable are independent
        of the second variable. For example, if you counted the number of male and female mice
        in two barns, the null hypothesis would be that the proportion of male mice is the
        same in the two barns.
        http://udel.edu/~mcdonald/statfishers.html
        inputs: zone expected, zone observed, not zone expected, not zone observed
                in zone     not in zone
    observed    a           b
    expected    c           d

    Results are computed twice (scipy and R's fisher.test via rpy2) as a
    cross-check; the R values are what get persisted to fisherResults.
    """
    notZoneExpected = globalCount - zoneExpected
    notZoneObserved = globalCount - zoneObserved

    if zoneExpected < 1 and zoneObserved == 0:
        # Counts too small for a meaningful test; skip persisting anything.
        print("** values too low")
        return

    # Build the 2x2 contingency table once (the original built it twice).
    table = np.array([[zoneObserved, notZoneObserved], [zoneExpected, notZoneExpected]])
    odds, pval = fisher_exact(table)
    # Renamed from `fishers`, which shadowed this function's own name.
    r_fisher = robjects.r['fisher.test']
    res_r = r_fisher(table)
    r_p = res_r[0][0]      # R result slot 0: p-value
    r_odds = res_r[2][0]   # R result slot 2: odds ratio estimate
    # NOTE(review): SQL built by string concatenation; acceptable only because
    # zone names/intervals are trusted internal data — parameterized queries
    # would be safer if that ever changes.
    sql = ("insert into fisherResults (ocean, zone, period, fisher, sig) values('"
           + ocean + "', '" + zone.get('name') + "', '" + str(interval) + "', "
           + formatOdds(r_odds) + ", " + str(r_p) + ")")
    utils.executeMysql_All(conn, sql)
    print("** Fishers Exact: %s: odds: %s, r_odds: %s, r_p: %s, p: %s"
          % (zone.get('name'), formatOdds(odds), r_odds, r_p, pval))
예제 #2
0
def getCountOfExtraCoords(conn, zone, ocean, interval):
    """ Return the summed record count over every extra coordinate box attached to the zone.

    Each entry in the zone's 'extraCoords' list is a 4-tuple of bounding
    coordinates; a throwaway zone named "na" is built for each box and queried.
    NOTE(review): the `interval` parameter is accepted but never used — every
    query is issued with interval 0 (the baseline); confirm this is intended.
    """
    total = 0
    for coords in zone.get('extraCoords'):
        extraZone = newZone("na", coords[0], coords[1], coords[2], coords[3])
        records = utils.executeMysql_All(conn, buildQuery(ocean, extraZone, 0))
        total += len(records)
    return total
예제 #3
0
def exportCohorts(cohorts, ocean='pacific'):
    """ Run the export query template once per cohort.

    cohorts: iterable of (startDate, endDate, timeRange) records; the three
             values are substituted into the ocean-specific SQL template.
    ocean:   'atlantic' selects ATL_TEMPLATE, anything else PAC_TEMPLATE.
    """
    conn = connection.new()

    template = ATL_TEMPLATE if ocean.lower() == 'atlantic' else PAC_TEMPLATE

    for cohort in cohorts:
        # Token -> value map; tokens are disjoint so substitution order is irrelevant.
        substitutions = {
            '$START_DATE$': cohort[0],
            '$END_DATE$': cohort[1],
            '$RANGE$': str(cohort[2]),
            '$PATH$': "/tmp",
        }
        query = template
        for token, value in substitutions.items():
            query = query.replace(token, value)
        utils.executeMysql_All(conn, query)
예제 #4
0
파일: main.py 프로젝트: jimclouse/drifters
def main():
    """ Export one file of drifter observations per day for the fixed date window.

    Pulls the 6:00 observation for each drifter inside the lat/long box from
    gdpAll, one query per day between 2008-12-04 and 2009-04-30, and hands the
    rows to writeDriftersToFile.
    """
    utils.mkdir("../data/drifters")

    conn = connection.new()

    sqlDrifterExport = """ SELECT id, obsDateTime, obsDate, obsTime, latitude, longitude, longitudeWest, hasDrogue
                        from gdpAll where obsTime = '6:00' and obsDate = '$$date'
                        and latitude between 15 and 40 and longitude <= 82 and longitude >= 31;"""

    firstDay = date(2008, 12, 4)
    lastDay = date(2009, 4, 30)

    for day in daterange(firstDay, lastDay):
        dayString = strftime("%Y-%m-%d", day.timetuple())
        drifters = utils.executeMysql_All(conn, sqlDrifterExport.replace("$$date", dayString))
        writeDriftersToFile(dayString, drifters)
예제 #5
0
def adjust(dataPath, basin):
    """ Split each drifter track into gap-free series and export them to CSV.

    Reads one observation per drifter per day (the day's latest obsTime) from
    the gdp<basin>All table, ordered by id and date. Whenever consecutive
    observations for the same drifter are more than MAX_GAP days apart, the
    remainder of that track is written under a new id (original id plus a
    letter suffix) so each exported series is contiguous.

    dataPath: directory the CSV file is written into
    basin:    basin name fragment used in both the file name and table name

    Fixes vs. original:
      - `print("gap! %i days") % (diff.days)` applied % to print()'s return
        value (TypeError under Python 3); formatting moved inside the call.
      - the `alter table ... add index(id)` statement ran once PER ROW inside
        the loop; it now runs exactly once, after processing.
      - output file is closed via a context manager; prints use function form
        consistent with the rest of the file; `id` no longer shadows builtin.
    """
    conn = connection.new()
    fileName = os.path.join(dataPath, "gdp" + basin + "All_adjusted.csv")
    dbTable = 'gdp' + basin + 'All'

    sql = """ SELECT g.id, g.obsDate, g.obsTime, latitude, longitude
                from (SELECT id, ObsDate, max(ObsTime) as ObsTime from $dbTable$
                group by id, obsDate) as i join $dbTable$ g
                on i.id = g.id and i.obsDate = g.obsDate and i.obsTime = g.obsTime
                where g.obsDate >= '2000-01-01'
                order by g.id, g.ObsDate
        """
    sql = sql.replace('$dbTable$', dbTable)
    # Print the final query (the original printed the unsubstituted template).
    print(sql)
    print("running query")
    results = utils.executeMysql_All(conn, sql)
    print("query complete. processing")

    prevId = None
    prevDate = None
    # Suffixes for successive series of one drifter; index 0 ('x') is the
    # unsuffixed first series. NOTE(review): supports at most 9 gaps per
    # drifter before IndexError — confirm that is enough for the data.
    seriesList = ['x', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']

    with open(fileName, 'w') as rFile:
        for row in results:
            rowId = row['id']
            obsDate = row['obsDate']
            obsTime = row['obsTime']
            lat = row['latitude']
            lon = row['longitude']

            # reset on each new drifter id
            if rowId != prevId:
                prevId = rowId
                prevDate = None
                newId = rowId
                seriesIndex = 0

            # look for gaps: start a new suffixed series past each one
            if prevDate is not None:
                diff = obsDate - prevDate
                if diff.days > MAX_GAP:
                    print("gap! %i days" % diff.days)
                    seriesIndex += 1
                    newId = str(rowId) + seriesList[seriesIndex]
            prevDate = obsDate
            rFile.write('%s, %s, %s, %s, %s\n' % (newId, obsDate, obsTime, lat, lon))

    # Add the id index once the export is complete (was inside the row loop).
    sql = "alter table " + dbTable + " add index(id)"
    utils.executeMysql_Command(conn, sql)
예제 #6
0
def compare(zoneList, ocean='pacific'):
    """ creates a one-way chi-squared table of all zones for each interval
        1. for each zone, get the percentage of all drifters that exist in it for the baseline
            a. find the total number
            b. loop through all zones and compute ratios
            c. this is the expected ratio
        2. do the same for each time period
            a. find the total number of drifters alive
            b. compute the ratio of drifters in each zone
            c. this is the observed ratio
            d. use the expected ratio and observed ratio to calculate expected and observed counts for each zone

        this results in a 9x2 table
                z1 | z2 | z3 | ...
        obs |   x  | ...
        exp |   y  | ...

        Null Hypothesis: Each zone has an evenly distributed probability of occuring
        Alt Hypothesis: At least one of the proportions is different from predicted

        Fixes vs. original: the four copy-pasted delete statements are a loop;
        running totals use sum(); an interval with zero observed drifters is
        skipped explicitly instead of raising ZeroDivisionError in the
        percentages insert.
    """
    conn = connection.new()
    #lifetable = buildLifeTable(ocean.lower())
    # Map the ocean name onto the adjusted-data table name used everywhere below.
    if ocean.lower() == 'atlantic':
        ocean = 'gdpAtlAdj'
    else:
        ocean = 'gdpPacAdj'

    # clean previous results for this ocean from all result tables
    for table in ('fisherResults', 'chiSquareResults', 'chiResiduals', 'percentages'):
        utils.executeMysql_All(conn, "delete from " + table + " where ocean = '" + ocean + "';")

    # get baseline data for each zone (interval 0 = baseline)
    for zone in zoneList:
        count = len(utils.executeMysql_All(conn, buildQuery(ocean, zone, 0)))
        if zone.get('extraCoords'):
            count += getCountOfExtraCoords(conn, zone, ocean, 0)
        zone["baselineCount"] = count

    # total drifter count across all zones at baseline
    baselineDrifterCount = sum(zone.get("baselineCount") for zone in zoneList)

    # define baseline ratios (expected proportion of drifters per zone)
    for zone in zoneList:
        zone["baselineRatio"] = zone.get("baselineCount") / float(baselineDrifterCount)

    # loop through intervals, running chi-square at each interval
    for interval in INTERVALS:
        print("********************************")
        print("** %s Day Interval Summary **" % (interval))
        expected = []
        observed = []

        # get interval zone counts
        for zone in zoneList:
            count = len(utils.executeMysql_All(conn, buildQuery(ocean, zone, interval)))
            if zone.get('extraCoords'):
                count += getCountOfExtraCoords(conn, zone, ocean, interval)
            zone["intervalObserved"] = count

        # total observed drifters for this interval
        totalObserved = sum(zone.get("intervalObserved") for zone in zoneList)
        if totalObserved == 0:
            # Nothing observed anywhere: ratios/percentages would divide by
            # zero and the tests are meaningless, so skip this interval.
            print("** No data for interval %s available" % (interval))
            continue

        # collect obs & exp data into list for each zone
        for zone in zoneList:
            intervalExpected = totalObserved * zone.get("baselineRatio")
            zone["intervalExpected"] = intervalExpected
            #if observedValue >= 5 and expectedValue >= 5:
            expected.append(intervalExpected)
            observed.append(zone.get("intervalObserved"))
            print('%s: baseline ratio: %f, no. drifters: %i, expected: %f, observed: %f'
                  % (zone.get('name'), zone.get("baselineRatio"), totalObserved, intervalExpected, zone.get("intervalObserved")))
            sql = ("insert into chiResiduals (ocean, period, periodN, zone, residual) values('"
                   + ocean + "', '" + str(interval) + "', " + str(totalObserved) + ", '"
                   + zone.get("name").strip() + "', "
                   + str(zone.get("intervalObserved") - intervalExpected) + ")")
            utils.executeMysql_All(conn, sql)

            sql = ("insert into percentages (ocean, period, periodN, zone, percent) values('"
                   + ocean + "', '" + str(interval) + "', " + str(totalObserved) + ", '"
                   + zone.get("name").strip() + "', "
                   + str((zone.get("intervalObserved") / float(totalObserved))) + ")")
            utils.executeMysql_All(conn, sql)

        # perform and report on chi-square
        if len(expected) == 0:
            print("** No data for interval %s available" % (interval))
        else:
            chi = chisquare(np.array(observed), np.array(expected))
            print("** Chi-Squared Statistic: %f, p=%s" % (chi[0], chi[1]))
            sql = ("insert into chiSquareResults (ocean, period, n, chiStat, sig, sigStr) values('"
                   + ocean + "', '" + str(interval) + "', " + str(totalObserved) + ", "
                   + str(chi[0]) + ", " + str("{0:.2f}".format(float(chi[1]))) + ", '"
                   + str(float(chi[1])) + "')")
            utils.executeMysql_All(conn, sql)
        print("**")

        for zone in zoneList:
            # perform Fisher's Exact Test on each zone to determine differnce with rest of data
            fishers(ocean, conn, interval, zone, zone.get("intervalExpected"), zone.get("intervalObserved"), baselineDrifterCount)

        print("\n\n")