/
reconcentrator.py
executable file
·392 lines (348 loc) · 17.1 KB
/
reconcentrator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
#!/usr/bin/env python2.7
import couchdb
import re
import math
from collections import defaultdict, Counter, OrderedDict
import unicodedata
import csv
import copy
import click
import pdb
from time import time
from datetime import datetime
from genologics.config import BASEURI, USERNAME, PASSWORD
from genologics.lims import Lims
from genologics.entities import Process
from __builtin__ import False
#Assumes individual sample concentration measurements have failed. As such, it relies on changing the relative volumes of already normalized samples while keeping the pool structure.
#The structure is retained because a failed concentration measurement means there is no way to know the concentration delta between samples from separate pools.
def connection(url=None):
    """Return a handle to the CouchDB server used by this script.

    The address is chosen, in order of preference, from the ``url`` argument,
    the ``COUCHDB_URL`` environment variable, and finally the legacy built-in
    address, so existing callers that pass nothing keep working.

    Args:
        url: Optional full server URL (may include credentials).

    Returns:
        The ``couchdb.Server`` object created for the selected URL.
    """
    import os

    if url is None:
        # NOTE(review): the fallback below keeps credentials in source control.
        # Prefer setting COUCHDB_URL; the embedded account should be rotated
        # and this default removed once every deployment provides the variable.
        url = (os.environ.get('COUCHDB_URL')
               or 'http://isak:Purpleplant89@tools-dev.scilifelab.se:5984')
    return couchdb.Server(url)
#Fetches the structure of a project
def proj_struct(couch, project, target_clusters):
    """Build the flowcell -> lane -> sample -> clusters structure for a project.

    The ``names/project_ids_list`` view of the ``x_flowcells`` database is read
    to find which flowcell documents list ``project``. For each of those
    documents the ``Barcode_lane_statistics`` rows are folded into nested
    dicts. Lanes that contain no sample whose name includes ``project`` are
    dropped.

    Args:
        couch: Server handle; ``couch['x_flowcells']`` must give the database.
        project: Project ID; matched as a substring of sample names.
        target_clusters: Value stored for samples whose name does not contain
            ``project`` (they are "auto-filled" instead of counted).

    Returns:
        dict: ``{flowcell: {lane: {sample: clusters}}}`` for ``project``.

    Raises:
        Exception: if no flowcell in the view lists ``project``.
        KeyError: if a flowcell document lacks one of the expected tables
            (an error line is printed first when ``'illumina'`` is missing).
    """
    def to_ascii(value):
        # View keys/ids/values are joined and reduced to plain ASCII exactly
        # as before, so they can be used as dict keys.
        text = ''.join(value)
        return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')

    db = couch['x_flowcells']
    view = db.view('names/project_ids_list')

    # Record flowcells for ALL projects; due to interactions between projects
    # it is easier than filtering while reading the view.
    fc_track = dict()
    for rec in view.rows:
        fc = to_ascii(rec.key)
        doc_id = to_ascii(rec.id)
        for projs in rec.value:
            fc_track.setdefault(to_ascii(projs), dict())[fc] = doc_id

    if project not in fc_track:
        raise Exception('Error: Project not logged in x_flowcells database!')
    flowcells = fc_track[project]

    # Add lanes and samples to each flowcell; samples from other projects are
    # included when they share a lane with this project.
    for fc, doc_id in list(flowcells.items()):
        # Fetch the document once; the previous code fetched it twice.
        document = db[doc_id]
        try:
            illumina = document['illumina']
        except KeyError:
            print("Error: Illumina table for db entry %s doesn't exist!" % doc_id)
            # Re-raise explicitly: the data is required, and the old handler
            # only fell through to a line that raised the same KeyError.
            raise
        rows = illumina['Demultiplex_Stats']['Barcode_lane_statistics']

        lanes = dict()
        for row in rows:
            lane = row['Lane']
            sample = row['Sample']
            if 'Clusters' in row:
                clusters = row['Clusters']
            else:
                clusters = row['PF Clusters']
            # Counts are stored with thousands separators, e.g. "1,234,567".
            clusters = int(re.sub(r",", "", clusters))
            samples = lanes.setdefault(lane, dict())
            # Only count samples of the given project; others are auto-filled.
            if project in sample:
                samples[sample] = clusters
            else:
                samples[sample] = target_clusters

        # Keep only lanes holding at least one sample of this project. The
        # flowcell entry always becomes a dict, even when the table is empty,
        # so later consumers never meet the raw document id.
        flowcells[fc] = dict(
            (lane, samples) for lane, samples in lanes.items()
            if any(project in name for name in samples))
    return flowcells
def parse_indata(struct, target_clusters):
    """Summarize a flowcell structure into lane layouts and per-sample counts.

    Args:
        struct: ``{flowcell: {lane: {sample: clusters}}}``.
        target_clusters: Starting value of the "remaining" counter per sample.

    Returns:
        list: ``[lane_maps, clusters_rem, clusters_expr]`` where
            ``lane_maps`` maps a running number (from 1) to each distinct
            reverse-sorted list of sample names found in a lane,
            ``clusters_rem`` maps sample -> ``target_clusters`` minus all
            clusters seen for it, and ``clusters_expr`` maps sample -> total
            clusters seen for it.
    """
    lane_maps = {}
    clusters_rem = {}
    clusters_expr = {}
    for lanes in struct.values():
        for samples in lanes.values():
            # A layout is the reverse-sorted list of names sharing a lane;
            # each distinct layout is stored once under the next free number.
            layout = sorted(samples, reverse=True)
            if layout not in lane_maps.values():
                lane_maps[len(lane_maps) + 1] = layout
            # Accumulate both the clusters obtained and what is still missing.
            for name, count in samples.items():
                clusters_rem[name] = clusters_rem.get(name, target_clusters) - count
                clusters_expr[name] = clusters_expr.get(name, 0) + count
    return [lane_maps, clusters_rem, clusters_expr]
#Creates a set where every sample uniquely appears once and only once
def simple_unique_set(lane_maps):
    """Pick lane layouts whose samples do not overlap, then validate the pick.

    Each layout in ``lane_maps`` (key -> list of sample names) is accepted
    into a new dict when (1) none of the layouts it shares a sample with
    contains an entry that occurs in no other layout, and (2) none of its
    samples is already in an accepted layout. 'Undetermined' is ignored when
    looking for shared samples.

    The accepted layouts are only passed to ``validate_template_struct``; the
    function returns None and the caller's ``lane_maps`` dict is not modified.
    NOTE(review): discarding the result looks unintended -- confirm whether
    callers were meant to receive the reduced set.

    Raises:
        Exception: propagated from ``validate_template_struct``.
    """
    unique_lane_maps = dict()
    for keyz, valz in lane_maps.items():
        #Fetch what lanes inclusion of given lane excludes
        excluded = list()
        for sample in valz:
            for k, v in lane_maps.items():
                for dupe in v:
                    if dupe == sample and keyz != k and sample != 'Undetermined':
                        # A layout is appended once per shared sample, so
                        # ``excluded`` may contain the same key several times.
                        excluded.append(k)
                        break
        #Check that none of the excluded lanes have uniquely present samples
        acceptable = True
        for key in excluded:
            total_duplicates = list()
            for values in lane_maps[key]:
                duplicate = 0
                for k, v in lane_maps.items():
                    for dupe in v:
                        # NOTE(review): ``sample`` here is the leftover value
                        # from the loop above (last entry of ``valz``), not
                        # ``values``; possibly a typo -- confirm before changing,
                        # since testing ``values`` would alter which layouts
                        # are accepted.
                        if dupe == values and k != key and sample != 'Undetermined':
                            duplicate +=1
                            break
                total_duplicates.append(duplicate)
            # A zero means some entry of the excluded layout exists nowhere else.
            if 0 in total_duplicates:
                acceptable = False
                break
        if acceptable:
            #Check that the lane doesn't have sample dupes in the accepted set already
            for entries in valz:
                for kuyz, vulz in unique_lane_maps.items():
                    for things in vulz:
                        if things == entries and entries != 'Undetermined':
                            acceptable = False
                            # Leaves only the innermost loop; the outer loops
                            # keep running but cannot reset the flag.
                            break
        if acceptable:
            unique_lane_maps[keyz] = valz
    # Rebinds the local name only; the dict passed in by the caller is untouched.
    lane_maps = unique_lane_maps
    #ALL SAMPLES PRESENT CHECK
    # summap = []
    # for k in lane_maps.keys():
    # summap += lane_maps[k]
    # print len(set(summap))
    validate_template_struct(lane_maps)
def aggregator(lane_maps,clusters_rem,clusters_per_lane):
    """Placeholder for pooling samples that occur in differing lane layouts.

    Planned approach (not implemented):
      * find all samples that are also expressed in another layout;
      * sort those layouts by duplication;
      * fill them to floor(dups), unless the fractional part exceeds some
        threshold, in which case use ceil(dups);
      * note what is still required;
      * handle the remaining layouts with ceil(dups).

    Args:
        lane_maps: Layout number -> list of sample names (unused for now).
        clusters_rem: Sample -> clusters still missing (unused for now).
        clusters_per_lane: Expected clusters per lane (unused for now).

    Raises:
        NotImplementedError: always. It is a subclass of ``Exception``, so
            callers catching ``Exception`` behave as before.
    """
    raise NotImplementedError('Error: Not yet implemented!')
#Gives how many percent of the lane should be allocated to a specific sample
def sample_distributor(lane_maps, clusters_rem, clusters_per_lane):
    """Compute each sample's share of a lane and the lanes needed per layout.

    Args:
        lane_maps: Layout key -> list of sample names.
        clusters_rem: Sample name -> clusters still missing; values that are
            zero or negative count as "nothing missing".
        clusters_per_lane: Clusters one lane is expected to yield.

    Returns:
        list: ``[ideal_ratios, req_lanes, total_lanes]`` where
            ``ideal_ratios[key]`` is a list parallel to ``lane_maps[key]``
            giving each sample's fraction of the layout's missing clusters
            (0.0 for samples with nothing missing), ``req_lanes[key]`` is the
            fractional number of lanes needed, and ``total_lanes`` is a list
            of the ``req_lanes`` values rounded up, ordered by ascending key
            so that position ``i`` belongs to the ``i+1``-th key.
    """
    ideal_ratios = dict()
    req_lanes = dict()
    for index in lane_maps:
        # Only positive deficits contribute; satisfied samples count as zero.
        deficits = [max(clusters_rem[entry], 0) for entry in lane_maps[index]]
        summ = sum(deficits)
        # ``summ`` is positive whenever some deficit is, so no zero division.
        ideal_ratios[index] = [
            deficit / float(summ) if deficit > 0 else 0.0
            for deficit in deficits
        ]
        # Minimal number of required lanes per pool.
        req_lanes[index] = summ / float(clusters_per_lane)
    # Has to be rounded up; rounding down when only using duplicates makes no
    # sense. Built as a real list in key order: consumers index it by
    # ``key - 1`` and mutate it, which neither dict ordering nor a lazy
    # ``map`` object guarantees.
    total_lanes = [math.ceil(req_lanes[index]) for index in sorted(req_lanes)]
    return [ideal_ratios, req_lanes, total_lanes]
#Crude way to check that no samples are in different TYPES of lanes
def validate_template_struct(lane_maps):
    """Reject layouts in which a sample appears more than once overall.

    Every name from position 1 onwards in each layout is counted, ignoring
    'Undetermined'. Position 0 is skipped, as before; other loops in this
    file also start at 1, presumably because that slot holds 'Undetermined'
    -- TODO confirm.

    Args:
        lane_maps: Layout key -> list of sample names.

    Raises:
        Exception: if any counted sample name occurs more than once.
    """
    occurrences = Counter()
    for layout in lane_maps.values():
        occurrences.update(
            name for name in layout[1:] if name != 'Undetermined')
    if any(count > 1 for count in occurrences.values()):
        # The two literals are concatenated; the trailing space keeps the
        # message from reading "a sampleis present".
        raise Exception('Error: This app does NOT handle situations where a sample '
                        'is present in lanes/well with differing structure!')
#Corrects volumes since conc is non-constant
#Also normalizes the numbers
#Finally translates float -> int without underexpressing anything
def correct_numbers(lane_maps, clusters_expr, ideal_ratios, req_lanes, total_lanes):
    """Convert ideal cluster shares into whole-percent shares per lane layout.

    Layouts are addressed by the integer keys 1..len(lane_maps). Three passes:
      1. each ideal ratio is multiplied by exp/act, where ``exp`` is the even
         share 1/(entries - 1) and ``act`` is the sample's fraction of the
         clusters obtained by the layout ('Undetermined' excluded from the total);
      2. the ratios of each layout are rescaled to sum to 100 (or set to 0
         when their sum is 0);
      3. the ratios are rounded up into a copy, then entries are decremented
         one percent at a time while the sum exceeds 100.

    Side effects: ``ideal_ratios`` is rewritten in place and entries of
    ``total_lanes`` may be incremented.

    Args:
        lane_maps: Layout key -> list of sample names.
        clusters_expr: Sample name -> clusters obtained so far.
        ideal_ratios: Layout key -> list of ratios parallel to the layout.
        req_lanes: Layout key -> fractional lanes required.
        total_lanes: List of whole lanes per layout, indexed by key - 1.

    Returns:
        dict: deep copy of ``ideal_ratios`` holding the rounded percentages.

    Raises:
        ZeroDivisionError: if a layout obtained no clusters at all, or a
            sample in it obtained none (``act`` becomes 0).
    """
    # Since some samples are strong and some weaksauce
    # 10% in ideal_ratios does not mean 10% of lane volume
    # As such, ideal_ratios need to be divided by actual_reads/expected_reads
    # Ignores undetermined clusters in calculation
    # Assumes sample conc cant be altered; aka only volume is modified
    for ind in xrange(1, len(lane_maps.keys())+1):
        #Bases w/o sample are not expected
        # One entry is subtracted before dividing -- presumably the
        # 'Undetermined' slot; TODO confirm.
        if len(lane_maps[ind]) != 1:
            exp = 1/float(len(lane_maps[ind])-1)
        else:
            exp = 1
        laneTypeExpr = 0
        counter = 0
        for sample in lane_maps[ind]:
            if not sample == 'Undetermined':
                laneTypeExpr += clusters_expr[sample]
        for sample in lane_maps[ind]:
            # 'Undetermined' is scaled here as well, although it was left out
            # of laneTypeExpr above.
            act = clusters_expr[sample]/float(laneTypeExpr)
            ideal_ratios[ind][counter] = ideal_ratios[ind][counter]*(exp/act)
            counter += 1
    #Normalizes numbers
    for index in xrange(1, len(ideal_ratios.keys())+1):
        curSum = sum(ideal_ratios[index])
        for sample in xrange(0, len(ideal_ratios[index])):
            if curSum == 0:
                ideal_ratios[index][sample] = 0
            else:
                ideal_ratios[index][sample] = (ideal_ratios[index][sample]/curSum)*100
    # Iteratively rounds to whole percent (min pipette for volume) to reach 100%
    # ideal_ratio * req_lanes.values() = needed
    # acc_ratio * total_lanes = current
    # means a sample can take any whole number between the two
    acc_ratios = copy.deepcopy(ideal_ratios)
    for index in xrange(1, len(ideal_ratios.keys())+1):
        for sample in xrange(0, len(ideal_ratios[index])):
            acc_ratios[index][sample] = math.ceil(ideal_ratios[index][sample])
        if sum(acc_ratios[index]) == 100:
            # NOTE(review): as nested here, this leaves the loop over layouts,
            # so every later layout keeps its un-rounded copy; ``continue``
            # may have been intended -- confirm.
            break
        else:
            while sum(acc_ratios[index]) > 100:
                stuck = True
                # Starts at 1, so entry 0 of the layout is never decremented.
                for sample in xrange(1, len(ideal_ratios[index])):
                    # Indexing dict.values() needs a list, i.e. Python 2; it
                    # also assumes values() is ordered like the keys 1..n.
                    need = ideal_ratios[index][sample]*req_lanes.values()[index-1]
                    cur = (acc_ratios[index][sample] - 1)*total_lanes[index-1]
                    # Drop one percent only if the sample still receives at
                    # least what it needs over the planned lanes.
                    if sum(acc_ratios[index]) > 100 and cur >= need:
                        acc_ratios[index][sample] -= 1
                        stuck = False
                    if sum(acc_ratios[index])== 100:
                        break
                # Nothing could be lowered: plan one more lane for the layout
                # so the next sweep has more slack.
                if(stuck):
                    total_lanes[index-1] += 1
    return acc_ratios
def generate_output(project, destid, total_lanes, req_lanes, lane_maps, acc_ratios):
    """Write the summary text file and the robot CSV for a repooling run.

    Looks up the project name and the source location of its samples in the
    LIMS, then writes ``<projName>_summary_<timestamp>.txt`` and
    ``<projName>_repool_<timestamp>.csv`` using relative paths.

    Args:
        project: Project ID; compared with LIMS project ids and matched as a
            substring of artifact names.
        destid: Indexable collection of destination plate IDs.
        total_lanes: Whole lanes (wells) per layout, indexed by key - 1.
        req_lanes: Layout key -> fractional lanes required.
        lane_maps: Layout key (1..n) -> list of sample names; entry 0 of each
            list is never written out.
        acc_ratios: Layout key -> whole-percent shares parallel to the layout.

    Returns:
        None.

    Raises:
        NameError: if no LIMS project has id ``project`` (``projName`` is
            then never assigned).
    """
    #Gathers the container id and well name for all samples in project
    #Cred to Denis for providing a base epp
    location = dict()
    lims = Lims(BASEURI, USERNAME, PASSWORD)
    allProjects = lims.get_projects()
    for proj in allProjects:
        if proj.id == project:
            projName = proj.name
            break
    #All normalization processes for project
    norms=['Library Normalization (MiSeq) 4.0', 'Library Normalization (Illumina SBS) 4.0','Library Normalization (HiSeq X) 1.0']
    pros=lims.get_processes(type=norms, projectname=projName)
    #For all processes
    for p in pros:
        #For all artifacts in process
        for o in p.all_outputs():
            #If artifact is analyte type and has project name in sample
            if o.type=="Analyte" and project in o.name:
                # Keyed by the first whitespace-separated token of the artifact
                # name; a later process overwrites an earlier entry for the
                # same key.
                location[o.name.split()[0]] = list()
                location[o.name.split()[0]].append(o.location[0].id)
                location[o.name.split()[0]].append(o.location[1])
    #PRINT section
    #Print stats including duplicates
    # The timestamp contains ':' and becomes part of both file names.
    timestamp = datetime.fromtimestamp(time()).strftime('%Y-%m-%d_%H:%M')
    sumName = projName, "_summary_", timestamp,".txt"
    sumName = ''.join(sumName)
    with open(sumName, "w") as summary:
        # OPT is planned lanes divided by ideally required lanes.
        if sum(req_lanes.values()) != 0:
            OPT = sum(total_lanes)/sum(req_lanes.values())
        else:
            OPT = 0
        output = "Ideal lanes (same schema): ", str(sum(req_lanes.values())) , ", Total lanes: ", str(sum(total_lanes)), ", OPT: ", str(round(OPT,3)),'\n'
        output = ''.join(output)
        summary.write( output )
        output = "Unique pools: ", str(len(total_lanes)), ", Average pool duplication: ", str(sum(total_lanes)/float(len(total_lanes))) ,'\n'
        output = ''.join(output)
        summary.write( output )
        # ``bin`` is the running well number at which each layout's range starts.
        bin = 0
        for index in xrange(1, len(lane_maps)+1):
            bin += 1
            summary.write('\n')
            output = "Wells ", str(bin) , '-' , str(bin+int(total_lanes[index-1])-1),':','\n'
            output = ''.join(output)
            summary.write( output )
            bin += int(total_lanes[index-1]-1)
            # Starts at 1: entry 0 of the layout is not listed.
            for counter in xrange(1, len(lane_maps[index])):
                output = str(lane_maps[index][counter]),' ', str(acc_ratios[index][counter]), "%",'\n'
                output = ''.join(output)
                summary.write( output )
    #Creates csv
    name = projName,"_repool_",timestamp,".csv"
    name = ''.join(name)
    wells = ['Empty','A','B','C','D','E','F','G','H']
    #Index 0 is number, index 1 is Letter
    wellIndex = [1, 1]
    destNo = 0
    with open(name, 'w') as csvfile:
        writer = csv.writer(csvfile)
        for index in xrange(1, len(lane_maps)+1):
            # One destination well per planned lane of this layout.
            for dupes in xrange(1, int(total_lanes[index-1])+1):
                # NOTE(review): lane_maps[index] is a list, so this comparison
                # with 0 looks like it can never be true -- confirm.
                if lane_maps[index] == 0:
                    raise Exception('Error: Project not logged in x_flowcells database!')
                for counter in xrange(1, len(lane_maps[index])):
                    #<source plate ID>,<source well>,<volume>,<destination plate ID>,<destination well>
                    #Destination well 200 microL, minimum pipette 2 microL; acc_ratios multiplied by 2.
                    sample = lane_maps[index][counter]
                    position = wells[wellIndex[1]],':',str(wellIndex[0])
                    position = ''.join(position)
                    try:
                        output = location[sample][0],location[sample][1],str(int(acc_ratios[index][counter]*2)),str(destid[destNo]),position
                    except KeyError:
                        # NOTE(review): after this message ``output`` still
                        # holds the previous row (or the last summary string),
                        # which is what gets written below -- confirm intent.
                        print "Error: Samples incorrectly parsed into database, thus causing sample name conflicts!"
                    if not acc_ratios[index][counter] == 0:
                        writer.writerow(output)
                #Increment wellsindex
                # NOTE(review): nesting assumed -- the well advances once per
                # destination well, gated on the share of the layout's last
                # sample (``counter`` keeps its final loop value). Confirm.
                if not acc_ratios[index][counter] == 0:
                    # Letters A-H first, then the number 1-8, then the next plate.
                    if not wellIndex[1] >= 8:
                        wellIndex[1] += 1
                    else:
                        wellIndex[1] = 1
                        if not wellIndex[0] >= 8:
                            wellIndex[0] += 1
                        else:
                            wellIndex[0] = 1
                            destNo += 1
                            # Probe only: warns as soon as the plates run out.
                            try:
                                destid[destNo]
                            except IndexError:
                                print "Critical error; not enough destination plates provided"
@click.command()
@click.option('--project_id', required=True,help='REQUIRED: ID of project to repool. Examples:P2652, P1312 etc.')
@click.option('--dest_plate_list', default=['dp_1','dp_2','dp_3','dp_4','dp_5'],
              help='List of destination plates for the robot\'s csv file. Include too many rather than too few; excess will be unused Default:[dp_1,dp_2,dp_3,dp_4,dp_5]')
@click.option('--target_clusters', default=320*1000000, help='Threshold of clusters per sample. \nDefault:320*1000000')
@click.option('--clusters_per_lane', default=380*1000000, help='Expected clusters generated by a single lane/well. \nDefault:380*1000000')
@click.option('--allow_non_dupl_struct', is_flag=True, help='Allow for samples to be present in different types of flowcells')
def main(target_clusters, clusters_per_lane, project_id, dest_plate_list, allow_non_dupl_struct):
    """Application that calculates samples under threshold for a project, then calculate the optimal composition for reaching the threshold
    without altering concentrations nor the structure of the pools. Outputs both a summary as well as a functional csv file."""
    # The docstring above is shown by click as the command's help text, so it
    # is kept verbatim.

    # Step 1: read the project's flowcell/lane/sample structure from the database.
    structure = proj_struct(connection(), project_id, target_clusters)

    # Step 2: reduce it to lane layouts plus per-sample cluster tallies.
    lane_maps, clusters_rem, clusters_expr = parse_indata(structure, target_clusters)

    # Step 3: check the layouts with the strategy selected on the command line.
    if not allow_non_dupl_struct:
        simple_unique_set(lane_maps)
    else:
        aggregator(lane_maps, clusters_rem, clusters_per_lane)

    # Step 4: work out the shares and lane counts, round them, write the files.
    ideal_ratios, req_lanes, total_lanes = sample_distributor(
        lane_maps, clusters_rem, clusters_per_lane)
    acc_ratios = correct_numbers(
        lane_maps, clusters_expr, ideal_ratios, req_lanes, total_lanes)
    generate_output(
        project_id, dest_plate_list, total_lanes, req_lanes, lane_maps, acc_ratios)
# Run the click command only when the file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()