/
analyze_subs.py
81 lines (53 loc) · 1.79 KB
/
analyze_subs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import numpy as np
import pandas as pd
def thrensemble(scores_arr, dets_arr, threshes):
    """Pick, for every row, the first entry whose score beats its threshold.

    ``scores_arr`` and ``dets_arr`` are parallel, row-major 2-D sequences:
    ``scores_arr[i][j]`` is the score that belongs to ``dets_arr[i][j]``, and
    ``threshes[j]`` is the cut-off applied to column ``j``. Columns are tried
    left to right, so earlier columns take priority over later ones.

    Args:
        scores_arr: Sequence of rows of numeric scores.
        dets_arr: Sequence of rows of values to choose from, same layout as
            ``scores_arr``.
        threshes: One threshold per column.

    Returns:
        A list with one element per row: the chosen ``dets_arr[i][j]``, or
        ``np.nan`` when no score in the row is strictly greater than its
        threshold. Empty input yields an empty list.
    """
    final = []
    # Iterate over rows. Sizing this loop by len(scores_arr[0]) (the number of
    # columns) only visits every row when the input happens to be square.
    for row_scores, row_dets in zip(scores_arr, dets_arr):
        chosen = np.nan
        for score, thresh, det in zip(row_scores, threshes, row_dets):
            if score > thresh:
                chosen = det
                break
        final.append(chosen)
    return final
def get_nth_largest_proba(ser, n=300):
    """Return the n-th largest leading probability found in ``ser``.

    Each entry of ``ser`` is reduced to a float by ``get_top_proba``. When
    ``ser`` holds fewer than ``n`` entries the smallest of them is returned,
    because ``nlargest`` simply returns everything it has.

    Raises:
        IndexError: if there is nothing to select from (empty ``ser``).
    """
    top_n = get_top_proba(ser).nlargest(n)
    # nlargest is sorted in descending order, so the last item is the n-th.
    return top_n.iloc[-1]
def thresh_to_nth_largest_proba(ser, n):
    """Blank out every entry whose leading probability is below the n-th largest.

    The cut-off is the value returned by ``get_nth_largest_proba(ser, n)``.
    Entries strictly below it are replaced with NaN; entries equal to the
    cut-off are kept, so ties can leave more than ``n`` entries standing.

    Returns:
        A new series; ``ser`` itself is not modified.
    """
    threshold = get_nth_largest_proba(ser, n)
    below_threshold = get_top_proba(ser) < threshold
    # Work on a copy so the caller's series stays intact.
    trimmed = ser.copy()
    trimmed.loc[below_threshold] = np.nan
    return trimmed
def get_top_proba(yday):
    """Parse the first space-separated token of every string as a float.

    Missing entries are first replaced with the placeholder ``'.00 '`` and so
    come out as ``0.0``. Surrounding whitespace is stripped before the string
    is cut at its first space.

    Args:
        yday: pandas Series of strings (presumably prediction strings whose
            first token is a confidence; confirm against the caller).

    Returns:
        A float Series with the same index as ``yday``.
    """
    filled = yday.fillna('.00 ')
    trimmed = filled.str.strip()
    # partition() yields three columns (head, separator, tail); keep the head.
    first_token = trimmed.str.partition(' ')[0]
    return first_token.astype(float)
def save_sub(ser, path):
    """Write ``ser`` to ``path`` as a two-column CSV and return ``path``.

    The index is labelled with ``PATIENT_ID`` and the values are placed in a
    column named ``P``; both names are read from module scope.
    """
    labelled = ser.rename_axis(PATIENT_ID)
    labelled.to_frame(P).to_csv(path, index=True)
    return path
def lmap(fn, coll):
    """Apply ``fn`` to every item of ``coll`` and return the results as a list."""
    return [fn(item) for item in coll]
def run_thrensemble(sub_df, threshes=(.15, .15)):
    """Run ``thrensemble`` over the columns of a DataFrame of prediction strings.

    Every column of ``sub_df`` is converted to per-row scores with
    ``get_top_proba``; the raw cell values serve as the candidates to pick
    from.

    Args:
        sub_df: DataFrame whose columns each hold prediction strings
            (presumably one column per submission; confirm with the caller).
        threshes: One threshold per column, in column order. The default is
            an immutable tuple so it cannot be altered between calls.

    Returns:
        The list produced by ``thrensemble``: chosen cell values, with
        ``np.nan`` where no column cleared its threshold.
    """
    scores_arr = sub_df.apply(get_top_proba).values
    dets_arr = sub_df.values
    return thrensemble(scores_arr, dets_arr, threshes)
import funcy
def get_1_or_2(x, cut1, cut2):
    """Keep the leading groups of a tokenised prediction that pass the cuts.

    ``x`` is read as consecutive groups of five tokens, and the first token
    of each group is compared against the cuts (presumably a confidence
    followed by four box values; confirm against the data format). Groups
    are kept from the start for as long as their first token is strictly
    greater than ``cut1``; the second group (index 1) is also accepted when
    it only exceeds ``cut2``. The first group that fails ends the scan.

    Note: the original author marked this helper as unused.

    Args:
        x: Sequence of tokens (strings or numbers). Empty-string tokens are
            ignored, so ``[]`` and ``['']`` both count as "no detections".
        cut1: Threshold applied to every group.
        cut2: Alternative threshold applied to the second group only.

    Returns:
        ``np.nan`` when ``x`` holds no tokens; otherwise the kept tokens
        joined by single spaces (``''`` if no group passed). Tokens are
        emitted as given, so string inputs keep their original formatting.

    Raises:
        ValueError: if the first token of an inspected group is not numeric.
    """
    group_len = 5
    # Splitting a blank string on ' ' leaves [''], so drop empty tokens
    # before deciding whether there is anything to look at.
    tokens = [tok for tok in x if tok != '']
    if not tokens:
        return np.nan
    kept = []
    for i, start in enumerate(range(0, len(tokens), group_len)):
        group = tokens[start:start + group_len]
        lead = float(group[0])
        if lead > cut1 or (i == 1 and lead > cut2):
            kept.extend(group)
        else:
            break
    # str.join only accepts strings, so convert in case numbers were passed.
    return ' '.join(map(str, kept))
def make_1_or_2_ser(ser, cut1, cut2):
    """Apply ``get_1_or_2`` to every prediction string in ``ser``.

    Args:
        ser: pandas Series of whitespace-separated prediction strings;
            missing values are allowed.
        cut1: Passed through to ``get_1_or_2``.
        cut2: Passed through to ``get_1_or_2``.

    Returns:
        A new Series with the same index as ``ser`` holding the value
        ``get_1_or_2`` returned for each row.
    """
    # Splitting on arbitrary whitespace (no explicit separator) turns a
    # missing or blank row into [] instead of [''], and copes with repeated
    # spaces between tokens.
    splat = ser.fillna('').str.split()
    dets = [get_1_or_2(tokens, cut1, cut2) for tokens in splat.values]
    return pd.Series(dets, index=ser.index)