-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataprocessing.py
116 lines (99 loc) · 4.97 KB
/
dataprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import collections
import itertools
import numpy as np
import pandas as pd
from database import db
def read_experiment(experiment, table):
    '''Read one of the measurement tables or notes for a given experiment as a dataframe.

    experiment -- value matched against the `experiment_name` column.
    table      -- name of the table to read: either 'notes' or one of the
                  tables whose name matches the GLOB pattern '*__*'.

    Returns a dataframe indexed by `timestamp`, ordered by ascending timestamp.
    Raises AssertionError('No such table.') when `table` is not a known table.
    '''
    with db:
        known_tables = [row[0] for row in
                        db.execute('''SELECT name FROM sqlite_master
                                      WHERE type='table'
                                      AND name GLOB '*__*' ''')]
        # The table name is spliced into the SQL text below (identifiers cannot
        # be bound as parameters), so this whitelist check is the only barrier
        # against SQL injection. It is an explicit raise rather than an
        # `assert` because asserts are stripped under `python -O`; the
        # exception type stays AssertionError for backward compatibility.
        if table not in known_tables + ['notes']:
            raise AssertionError('No such table.')
        df = pd.read_sql_query('SELECT * FROM %s WHERE experiment_name=? ORDER BY timestamp ASC' % table,
                               db,
                               index_col='timestamp',
                               params=(experiment,))
    return df
###############################################################################
# Tools for calculating variables of interest that depend on logged variables.
###############################################################################
def parse_formula(experiment, formula):
    '''Return a python function corresponding to the given formula and experiment's strain.

    experiment -- name of the experiment; its `strain_name` is looked up in
                  the `experiments` table.
    formula    -- name of the column of the `strains` table that stores the
                  formula text (an expression in the variable `x`).

    Returns a one-argument function evaluating the stored expression, with the
    numpy namespace available to it.
    Raises ValueError when `formula` is not a plain column identifier.
    NOTE(review): if the experiment or strain row is missing, `fetchone()`
    presumably returns None and the subscript fails with TypeError -- confirm.
    '''
    import re

    # A column name cannot be passed as a bound SQL parameter, so it is spliced
    # into the query text. Accept bare identifiers only, so that nothing else
    # can be smuggled into the statement.
    if not re.match(r'[A-Za-z_][A-Za-z0-9_]*\Z', formula):
        raise ValueError('Invalid formula column name: %r' % (formula,))
    with db:
        query = db.execute('''SELECT strain_name FROM experiments
                              WHERE name=?''',
                           (experiment,))
        strain = query.fetchone()[0]
        query = db.execute('''SELECT %s FROM strains
                              WHERE name=?''' % formula,
                           (strain,))
        expression = query.fetchone()[0]
    # NOTE(security): eval executes whatever text is stored in the strains
    # table. This is only acceptable while that table can be written by
    # trusted users exclusively; do not expose it to untrusted input.
    return eval('lambda x:' + expression, np.__dict__)
def read_OD(experiment):
    '''Prepare a dataframe of OD values.

    The `data` column holds the strain's `light_ratio_to_od_formula` applied
    to the ratio of outgoing to incoming light; the remaining columns are
    copied from the incoming-light table.
    '''
    incoming = read_experiment(experiment, 'light_in__uEm2s')
    outgoing = read_experiment(experiment, 'light_out__uEm2s')
    result = incoming.copy()
    # Fraction of the incoming light that made it through.
    result['data'] = outgoing['data'].div(incoming['data'])
    ratio_to_od = parse_formula(experiment, 'light_ratio_to_od_formula')
    result['data'] = result['data'].apply(ratio_to_od)
    return result
def read_cell_count(experiment):
    '''Prepare a dataframe of cell count values.

    The OD dataframe is read first and its `data` column is then converted
    with the strain's `od_to_cell_count_formula`.
    '''
    frame = read_OD(experiment)
    od_to_cell_count = parse_formula(experiment, 'od_to_cell_count_formula')
    frame['data'] = frame['data'].apply(od_to_cell_count)
    return frame
def read_biomass(experiment):
    '''Prepare a dataframe of biomass values.

    The OD dataframe is read first and its `data` column is then converted
    with the strain's `od_to_biomass_formula`.
    '''
    frame = read_OD(experiment)
    od_to_biomass = parse_formula(experiment, 'od_to_biomass_formula')
    frame['data'] = frame['data'].apply(od_to_biomass)
    return frame
###############################################################################
# Tools to access all variables that might be of interest (logged or calculated).
###############################################################################
# Record describing one plot: the `reader` callable that produces its data,
# plus a `min` and a `max` value (presumably the plot's default range).
PlotType = collections.namedtuple('PlotType', 'reader min max')
# Most plots only need the contents of a single table, so their reader
# functions are generated from the table name instead of written by hand.
def make_reader(table):
    '''Return a function that reads `table` for whatever experiment it is given.'''
    def reader(experiment):
        return read_experiment(experiment, table)
    return reader
# Every predefined plot, keyed by its display name and kept in display order.
# Each spec below is (name, reader, min, max).
possible_plots = collections.OrderedDict(
    (plot_name, PlotType(reader, lowest, highest))
    for plot_name, reader, lowest, highest in (
        ('light in', make_reader('light_in__uEm2s'), 0, 3),
        ('light out', make_reader('light_out__uEm2s'), 0, 3),
        ('temperature', make_reader('temperature__C'), 20, 40),
        ('added water', make_reader('water__ml'), 0, 5),
        ('added media', make_reader('media__ml'), 0, 5),
        ('drained volume', make_reader('drained__ml'), 0, 5),
        ('OD', read_OD, 0, 3),
        ('cell count', read_cell_count, 0, 3),
        ('biomass', read_biomass, 0, 3),
    )
)
def read_plottype(experiment, plot_type):
    '''Prepare a dataframe with all the data of interest for a given experiment and plot type.

    The frame produced by `plot_type.reader(experiment)` must carry an
    `experiment_name` column and a `data` column whose entries are 2D arrays
    indexable up to [3, 4]. Both columns are dropped from the result, which
    instead contains, per row:
      avg, median, min, max -- statistics over the whole array,
      r1..r4                -- mean of each of the first four rows,
      c1..c5                -- mean of each of the first five columns,
      11..45                -- the individual entries, named <row><column> (1-based).
    '''
    frame = plot_type.reader(experiment)
    del frame['experiment_name']
    readings = frame['data']

    # Whole-array statistics.
    summaries = (
        ('avg', lambda grid: grid.mean()),
        ('median', lambda grid: np.median(grid)),
        ('min', lambda grid: grid.min()),
        ('max', lambda grid: grid.max()),
    )
    for label, reduce_grid in summaries:
        frame[label] = readings.apply(reduce_grid)

    # Per-row and per-column means; labels are 1-based.
    for row in range(4):
        frame['r%d' % (row + 1)] = readings.apply(lambda grid: grid[row, :].mean())
    for col in range(5):
        frame['c%d' % (col + 1)] = readings.apply(lambda grid: grid[:, col].mean())

    # Individual entries, in row-major order.
    for row in range(4):
        for col in range(5):
            frame['%s%s' % (row + 1, col + 1)] = readings.apply(lambda grid: grid[row, col])

    del frame['data']
    return frame
def read_all_plottypes(experiment, interpolate=True):
    '''Like `read_plottype` but for all defined plot types. Interpolation is optional.

    experiment  -- name of the experiment to read.
    interpolate -- when true, gaps are filled by time-weighted interpolation
                   in both directions.

    Returns a dataframe whose columns form a two-level index named
    ('plot type', 'well'): the key from `possible_plots`, then the column
    name produced by `read_plottype`.
    '''
    frames = [read_plottype(experiment, plot) for plot in possible_plots.values()]
    # Stack the transposed frames under their plot name, then transpose back,
    # so the plot name becomes the outer level of the column index.
    df = pd.concat([frame.transpose() for frame in frames],
                   keys=list(possible_plots.keys())).transpose()
    df.columns.names = ['plot type', 'well']
    if interpolate:
        # `DataFrame.interpolate` returns a new frame unless `inplace=True`;
        # the result has to be kept or the interpolation is silently lost.
        df = df.interpolate(method='time', limit_direction='both')
    return df