# grid_analysis.py
# 113 lines (92 loc) · 3.26 KB
import csv
import Bio.Phylo as bp
import metrics
import numpy as np
import random
import multiprocessing
import cPickle as pkl
import sys
import time
from data import *
# grid size for grouping communities, in degrees
# NOTE(review): this constant is not referenced below -- the 1-degree cell
# size is hard-coded via int() truncation when building `grid`; confirm
# whether GRID_SIZE was meant to parameterize that
GRID_SIZE = 1
# number of pairwise route comparisons to perform per grid cell
COMPARISONS = 1000
# ignore grid cells unless they have at least this many routes
MIN_SITES = 10
# read in route/species abundance information from FIA data file
# builds: grids[(int_lat, int_lon)][(lat, lon)][species_node] = count
grids = {}
all_species = set()
with open(input_file) as data_file:
    reader = csv.reader(data_file)
    # skip header row
    next(reader)
    for lat, lon, genus, species, count in reader:
        lat, lon = float(lat), float(lon)
        count = int(count)
        if species == 'unknown': continue
        species_name = '%s %s' % (genus, species)
        # match the record against the phylogenetic tree; records with no
        # matching tree node are dropped
        species_node = find_species(species_name, tree)
        if not species_node: continue
        all_species.add(species_node)
        route = (lat, lon)
        # 1-degree grid cell
        # NOTE(review): int() truncates toward zero, so cells straddling
        # 0 degrees are 2 degrees wide; math.floor may have been intended
        grid = (int(lat), int(lon))
        routes = grids.setdefault(grid, {})
        site = routes.setdefault(route, {})
        site[species_node] = count
# drop sparsely-sampled cells (.items() behaves the same as the original
# .iteritems() here and also works on Python 3)
grids = {x: y for x, y in grids.items() if len(y) >= MIN_SITES}
print('%d total grids' % len(grids))
# get the range of lat/lon values across every remaining route
# BUG FIX: the original read the loop-leftover `routes` variable and so
# computed the range over the last grid cell only, not over all routes
all_routes = [route for routes in grids.values() for route in routes]
lats = [route[0] for route in all_routes]
lons = [route[1] for route in all_routes]
lat_range = (min(lats), max(lats))
lon_range = (min(lons), max(lons))
# NOTE(review): this module-level dict is shadowed by a local in analyze()
# and reassigned under __main__; kept only for backward compatibility
results = {}
def analyze(arg):
    """Run pairwise community comparisons for one grid cell.

    arg: a (grid, routes) tuple, where routes maps (lat, lon) route keys to
    {species_node: count} abundance dicts.
    Returns (grid, cell_results) where cell_results maps each distinct
    metrics.process() result to the percentage of comparisons producing it.
    """
    grid, routes = arg
    # flatten every route's abundances into one weighted species pool
    species_pool = []
    for route in routes.values():
        for sp, count in route.items():
            species_pool += [sp] * count
    n = len(routes)
    # compare all combinations of routes if n choose 2 < COMPARISONS,
    # otherwise compare random combinations until you reach COMPARISONS
    # total comparisons
    comparisons = n * (n - 1) // 2  # explicit floor division: unordered pairs
    if comparisons < COMPARISONS:
        # BUG FIX: the original generated ordered pairs (r1 != r2), yielding
        # each unordered pair twice; combined with the n*(n-1)/2 cap below,
        # some pairs were compared twice and others never. Restricting to
        # r1 < r2 yields each unordered pair exactly once.
        to_compare = ((r1, r2) for r1 in routes for r2 in routes if r1 < r2)
    else:
        def random_comparison():
            # sorted() canonicalizes the pair so (a, b) == (b, a)
            route_list = list(routes.keys())
            while True:
                yield tuple(sorted(random.sample(route_list, 2)))
        to_compare = random_comparison()
    # compare pairs of communities
    comms = []
    while len(comms) < min(COMPARISONS, comparisons):
        try:
            r1, r2 = next(to_compare)
        except StopIteration:
            break
        # skip routes with fewer than two species -- not enough structure
        # for a pairwise comparison
        if len(routes[r1]) < 2 or len(routes[r2]) < 2:
            continue
        try:
            result = metrics.process(routes[r1], routes[r2], tree, species_pool)
            if not result:
                continue
            comms.append(result)
        except IndexError:
            # this means a species wasn't found in our tree
            pass
    print('** %s %s' % (grid, time.strftime('%D %T')))
    # tally each distinct result as a percentage of all comparisons made
    # (renamed from `results` to avoid shadowing the module-level dict)
    cell_results = {}
    total = float(len(comms))
    for result in sorted(set(comms)):
        percent = 100 * comms.count(result) / total
        print('%s: %s%%' % (result, percent))
        cell_results[result] = percent
    return grid, cell_results
if __name__ == '__main__':
    # fan the grid cells out across worker processes; analyze() always
    # returns a (grid, results) tuple, so the truthiness filter only guards
    # against a hypothetical empty return
    pool = multiprocessing.Pool()
    try:
        results = dict(
            result for result in pool.map(analyze, grids.items())
            if result
        )
    finally:
        # BUG FIX (resource leak): the original never shut down the pool
        pool.close()
        pool.join()
    # BUG FIX: protocol -1 (highest) is a binary pickle format, so the file
    # must be opened in binary mode -- text mode 'w' corrupts the output on
    # Windows under Python 2 and raises a TypeError under Python 3
    with open('grid_results.pkl', 'wb') as results_file:
        pkl.dump(results, results_file, -1)