-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
189 lines (133 loc) · 5.53 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
from textwrap import wrap
from collections import OrderedDict
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.lines import Line2D
from lifelines import KaplanMeierFitter
from lifelines.plotting import plot_lifetimes
from lifelines.utils import datetimes_to_durations
from utils import start_end_for_diagnosis, image_sequence
# Ordered mapping from diagnosis code to the human-readable label that is used
# as a plot title. It is built from a list of pairs so the iteration order is
# explicit and does not depend on the ordering of a plain dict literal.
diagnosis_list = OrderedDict([
    ('I63.0', 'I63.0: Cerebral infarct, large vessel disease with significant carotid stenosis'),
    ('I63.3', 'I63.3: Cerebral infarct, other large vessel disease'),
    ('I63.4', 'I63.4: Cerebral infarct, cardiac emboli'),
    ('I63.5', 'I63.5: Cerebral infarct, small vessel / lacunar'),
    ('I63.8', 'I63.8: Cerebral infarct, other / unusual cause'),
    ('I63.9', 'I63.9: Cerebral infarct, multiple / unknown cause'),
])
# Module-level iterator shared by all plotting functions: each of them calls
# next(GEN) to fill in the `image_{...}.pdf` file name when saving a figure.
# NOTE(review): image_sequence comes from the local utils module; presumably it
# yields successive image numbers - confirm against utils.
GEN = image_sequence()
def kaplan_meier_analysis(df, rows=2, columns=3):
    """
    Kaplan-Meier estimates for individual diagnoses

    One subplot is drawn for every entry of ``diagnosis_list``; the grid is
    filled row by row in the order of that mapping. The finished figure is
    shown and saved as a PDF under ``paper/img``.

    :param df: Pandas dataframe with data
    :param rows: number of rows for subplots
    :param columns: number of columns for subplots
    :return: Plots the data
    :raises ValueError: if the grid has fewer cells than there are diagnoses
    """
    if rows * columns < len(diagnosis_list):
        raise ValueError(
            f'a {rows}x{columns} grid cannot hold {len(diagnosis_list)} diagnosis plots'
        )

    # squeeze=False keeps `axes` two-dimensional even for a single row or column
    fig, axes = plt.subplots(rows, columns, figsize=[10, 6], squeeze=False)

    # the evaluation timeline is identical for every diagnosis
    timeline = np.linspace(0, 12)

    for pos, diagnosis in enumerate(diagnosis_list):
        # initialize start and end times
        start_times, end_times = start_end_for_diagnosis(df, diagnosis)

        # get data in the right format - t is time_span, e is event (1 is death)
        t, e = datetimes_to_durations(start_times, end_times, freq='M')  # M - months, D - days

        # fit the Kaplan-Meier estimator on the shared timeline
        kmf = KaplanMeierFitter()
        kmf.fit(t, event_observed=e, timeline=timeline)

        # row-major position: gives every diagnosis its own cell for any grid
        # shape (pos % rows, pos % columns only works when both are coprime)
        row, column = divmod(pos, columns)
        ax: plt.Axes = axes[row, column]

        # plot Kaplan-Meier
        kmf.plot(ax=ax)

        # the legend only reports the sample size of this diagnosis
        legend_elements = [Line2D([0], [0], color='b', lw=1, label=f'n = {len(start_times)}')]

        # format plot
        ax.set_ylim(0, 1)
        ax.set_xlabel('Time in months')
        ax.set_title('\n'.join(wrap(diagnosis_list[diagnosis], 30)))
        ax.legend(handles=legend_elements, loc='lower right')
        ax.margins(y=50)

    # show and save plot
    fig.show()
    fig.savefig(f'paper/img/image_{next(GEN)}.pdf', format='pdf')
def plot_lifetimes_for_diagnosis(df, diagnosis, current_time=50, subset_size=80):
    """
    Lifetime plot for the patients with a single diagnosis

    :param df: Pandas dataframe with 'diagnosis', 'start_date' and 'end_date' columns
    :param diagnosis: diagnosis code; also used as a key into diagnosis_list for the title
    :param current_time: not used by this function
    :param subset_size: when at least this many rows match, exactly this many are sampled
    :return: Plots the data
    """
    selected = df[df['diagnosis'] == diagnosis]

    figure, axis = plt.subplots()

    # groups of subset_size rows or more are reduced to a fixed-seed random sample
    if len(selected) >= subset_size:
        selected = selected.sample(n=subset_size, random_state=1)

    # durations are expressed in months
    durations, observed = datetimes_to_durations(
        selected['start_date'], selected['end_date'], freq='M'
    )
    plot_lifetimes(durations=durations, event_observed=observed, ax=axis)

    axis.set_title(diagnosis_list[diagnosis])
    axis.set_xlabel('Čas od začátku sledování po vznik události v měsících')
    axis.set_ylabel('Sledovaná osoba')

    # display the figure, then write it out as the next numbered PDF
    figure.show()
    figure.savefig(f'paper/img/image_{next(GEN)}.pdf', format='pdf')
def plot_hist(df: pd.DataFrame, column, x_label, y_label, title='', x_lim=0, x_ticks: np.ndarray = None):
    """
    Bar chart of how often each distinct value of a column occurs

    :param df: Pandas dataframe with data
    :param column: name of the column whose values are counted
    :param x_label: label of the x axis
    :param y_label: label of the y axis
    :param title: title of the plot
    :param x_lim: lowest x value to show; the axis starts half a unit below it
    :param x_ticks: optional tick positions replacing the default one-tick-per-value
    :return: Plots the data
    """
    # create figure and axes
    fig, ax = plt.subplots()

    # count occurrences of every distinct value
    labels, counts = np.unique(df[column], return_counts=True)

    # draw on the explicit axes rather than on pyplot's implicit current axes
    ax.bar(labels, counts, align='center')
    ax.set_xticks(labels)

    # set properties
    if x_ticks is not None:
        ax.set_xticks(x_ticks)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(title)
    # only the left limit is set; the right one stays as it was
    ax.set_xlim(x_lim - 0.5)

    # show and save plot
    fig.show()
    fig.savefig(f'paper/img/image_{next(GEN)}.pdf', format='pdf')
def plot_comparison(df, x, x_label, y_label, title=''):
    """
    Logistic regression plot of ``mrs_1y == 6`` against a predictor

    :param df: Pandas dataframe with a 'mrs_1y' column
    :param x: predictor values plotted on the x axis
    :param x_label: label of the x axis
    :param y_label: label of the y axis
    :param title: title of the plot
    :return: Plots the data
    """
    figure, axis = plt.subplots()

    # the response is a boolean flag: True where mrs_1y equals 6
    is_mrs_six = df['mrs_1y'] == 6
    sns.regplot(
        x='x',
        y='y',
        data={'x': x, 'y': is_mrs_six},
        logistic=True,
        n_boot=100,
        ax=axis,
    )

    axis.set_xlabel(x_label)
    axis.set_ylabel(y_label)
    axis.set_title(title)

    # display the figure, then write it out as the next numbered PDF
    figure.show()
    figure.savefig(f'paper/img/image_{next(GEN)}.pdf', format='pdf')
def main():
    """
    Produce all figures from ``data.csv``

    Reads the cleaned data set and generates, in this order: histograms of
    age, mRS after one year and diagnosis; logistic comparisons of
    ``mrs_1y == 6`` against age, input TSS, TSS difference and sex; the
    lifetime plot for I63.3; and the Kaplan-Meier curves.
    """
    # read cleaned data
    df = pd.read_csv('data.csv')

    plot_hist(df, 'age', 'Věk', 'Počet pacientů', x_lim=25, x_ticks=np.arange(25, 100, 5))
    plot_hist(df, 'mrs_1y', 'mRS - 1Y', 'Počet pacientů')
    plot_hist(df, 'diagnosis', 'Diagnóza', 'Počet pacientů')

    # Age to mRS comparison
    plot_comparison(df, df['age'], 'Věk', 'mRS - 1Y')

    # TSS to mRS comparison
    plot_comparison(df, df['tss_in'], 'Vstupní TSS', 'mRS - 1Y')

    # TSS difference to mRS comparison
    plot_comparison(df, df['tss_in'] - df['tss_out'], 'Rozdíl vstupního a výstupního TSS', 'mRS - 1Y')

    # Sex to mRS comparison
    fig, ax = plt.subplots()
    fig: plt.Figure = fig
    ax: plt.Axes = ax

    # encode sex as 0 for 'f' and 1 for anything else; the comparison must use
    # == because `is` tests object identity, which is not reliable for strings
    # read from a file
    data = {'x': np.asarray([0 if sex == 'f' else 1 for sex in df['sex']]),
            'y': df['mrs_1y'] == 6}
    sns.regplot(x='x', y='y', data=data, logistic=True, n_boot=100, ax=ax)

    # ticks only at the two encoded values
    ax.set_xticks(np.arange(0, 1 + 0.1, 1))
    ax.set_xticklabels(['Žena', 'Muž'])
    ax.set_ylabel('mRS - 1Y')
    ax.set_xlabel('Pohlaví')

    fig.show()
    fig.savefig(f'paper/img/image_{next(GEN)}.pdf', format='pdf')

    # plot lifetimes graph
    plot_lifetimes_for_diagnosis(df, diagnosis='I63.3')

    # plot kaplan-meier curves
    kaplan_meier_analysis(df)


# run the analysis only when the file is executed as a script
if __name__ == '__main__':
    main()