/
repeated_reco.py
executable file
·1176 lines (999 loc) · 39.5 KB
/
repeated_reco.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/bin/sh /cvmfs/icecube.opensciencegrid.org/py2-v1/icetray-start
#METAPROJECT /storage/home/jll1062/build/pingusoft/trunk
"""
HybridReco run with various MultiNest parameters for accuracy comparisons.
"""
# TODO: Time remaining considering number of events in an I3 file is > 1
# TODO: Not tested on PINGU yet (see esp. geometry, srt_pulse_name, and
# segment_length)
# TODO: Use sqlite as more advanced jobs queue, where filesystem is not read
# every time but the sqlite db is updated manually by some process, but then
# worker threads have to tell the sqlite db when they've completed processing a
# file and that file is flagged as "done" and won't be re-processed
from __future__ import absolute_import, division, print_function
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from collections import Mapping, OrderedDict
from copy import deepcopy
from fcntl import flock, LOCK_EX, LOCK_NB
from functools import partial
import getpass
from glob import glob
import grp
from itertools import product
import operator
from os import environ, getpid, remove
from os.path import abspath, basename, dirname, getsize, isdir, isfile, join
import pwd
import random
import re
import signal
import socket
import sys
import time
from dateutil.parser import parse as date_parse
from dateutil.tz import tzlocal
import numpy as np
# Justin's personal scripts (from ~jll1062/mypy/bin)
from genericUtils import (expand, hrlist2list, list2hrlist, mkdir, chown_and_chmod,
timediffstamp, timestamp, wstderr, wstdout)
from smartFormat import lowPrec
# Names exposed by `from <this module> import *`.
__all__ = ['EXTENSION', 'LOCK_SUFFIX', 'LOCK_SEP', 'LOCK_FMT',
           'LOCK_ACQ_TIMEOUT', 'RECO_RE', 'NUM_LIVEPOINTS', 'GROUP', 'GID',
           'MODE',
           'TOLERANCES', 'FIT_FIELD_SUFFIX',
           'MN_DEFAULT_KW', 'RECOS', 'MIN_RECO_TIME',
           'get_process_info', 'EventCounter', 'construct_reco_name',
           'recos_from_path', 'path_from_recos', 'parse_args', 'main']
# Selects which hard-coded family of reconstruction configurations is built
# into `RECOS` below (the code there has branches for 1 and 2).
RECOS_SET = 2
# Python expression, stored as a string, that is handed to `if_proto` as
# `eval_criteria` and evaluated there with `frame` in scope. A frame passes
# when it carries a truthy 'IC86_Dunkman_L3', an 'IC86_Dunkman_L4' whose
# 'result' equals 1, and an 'IC86_Dunkman_L5' whose 'bdt_score' is >= 0.2.
DRAGON_L5_CRITERIA = '''(
frame.Has('IC86_Dunkman_L3') and frame['IC86_Dunkman_L3']
and frame.Has('IC86_Dunkman_L4') and (frame['IC86_Dunkman_L4']['result'] == 1)
and frame.Has('IC86_Dunkman_L5') and (frame['IC86_Dunkman_L5']['bdt_score'] >= 0.2)
)'''
# Filename extension of the I3 files this script globs for and writes.
EXTENSION = '.i3.bz2'
# Appended to an infile/outfile path to form the path of its lock file.
LOCK_SUFFIX = '.lock'
# Lock files hold one "key = value" line per item; `LOCK_SEP` is the separator
# used both when writing (`LOCK_FMT`) and when parsing (`read_lockfile`).
LOCK_SEP = ' = '
LOCK_FMT = '%s' + LOCK_SEP + '%s\n'
# Total time `acquire_lock` keeps retrying before giving up.
LOCK_ACQ_TIMEOUT = 0.01 # sec
# Matches the "_recos<list>" tag embedded in a filename, where <list> consists
# of digits, whitespace, commas, and hyphens (e.g. "_recos0-2,5").
RECO_RE = re.compile(r'_recos([\s0-9,\-]+)')
# Group whose id (when the group exists on this host) is passed to
# `chown_and_chmod` for lock files; see `acquire_lock`.
GROUP = 'dfc13_collab'
try:
    GID = grp.getgrnam(GROUP).gr_gid
except KeyError:
    # `GROUP` is unknown on this host: fall back to the primary group of the
    # user running the script.
    _user_entry = pwd.getpwnam(getpass.getuser())
    GID = _user_entry.pw_gid
# Permission bits passed alongside `GID` to `chown_and_chmod`.
MODE = 0o666
def get_process_info():
    """Collect metadata about the running process.

    Returns
    -------
    info : OrderedDict
        Keys, in order, are 'hostname', 'ip_address', 'pid', and 'user'.

    """
    hostname = socket.gethostname()
    ip_address = socket.gethostbyname(hostname)
    pid = getpid()

    # Prefer $USER when it is set: `getpass.getuser` was observed not to
    # return the desired value when running inside a PBS job.
    try:
        user = environ['USER']
    except KeyError:
        user = getpass.getuser()

    return OrderedDict([
        ('hostname', hostname),
        ('ip_address', ip_address),
        ('pid', pid),
        ('user', user),
    ])
def construct_reco_name(dims, numlive, tol, trial):
    """Build the canonical name of a HybridReco/MultiNest reconstruction from
    the handful of MultiNest parameters that distinguish it.

    The pieces 'MN<dims>D', 'nlive<numlive>', 'tol<tol>', and 'trial<trial>'
    are joined with underscores, and the result is made usable as a Python
    identifier by substituting

        * '.' -> 'd'
        * '+' -> 'p'
        * '-' -> 'm'

    Parameters
    ----------
    dims : int
    numlive : int
    tol : float
        Rendered through `lowPrec` before being embedded in the name.
    trial : int

    Returns
    -------
    reco_name : string

    """
    pieces = (
        'MN%dD' % dims,
        'nlive%d' % numlive,
        'tol' + lowPrec(tol),
        'trial%d' % trial,
    )
    reco_name = '_'.join(pieces)
    for unfriendly, stand_in in (('.', 'd'), ('+', 'p'), ('-', 'm')):
        reco_name = reco_name.replace(unfriendly, stand_in)
    return reco_name
# Appended to a reco name to get the frame key that `if_proto` checks for in
# order to decide whether that reco was already run on an event.
FIT_FIELD_SUFFIX = '_FitParams'
MN_CONFIG_PREFIX = 'MN_Full_'

# Keyword arguments common to every reconstruction. Each entry of `RECOS` gets
# its own deep copy of this dict, to which 'prefix', 'time_limit', 'numlive',
# 'tol', and 'If' are added. The remaining keys that were left commented out
# here ('input_pulses', 'base_geometry') are presumably supplied where the
# tray is configured -- not visible from this definition; verify in `main`.
MN_DEFAULT_KW = {
    'config_prefix': MN_CONFIG_PREFIX,
    'segment_length': 7, # meters
    'has_mc_truth': True,
    'fit_cascade_direction': False,
    'usecoszen': True,
    'mmodal': True,
    'consteff': False,
    'efr': 1.0,
    'track_zenith_bounds': [-1, 1],
    'cascade_zenith_bounds': [-1, 1],
    'show_feedback': 0,
    'store_llhp_values': True,
    'raw_output_base_name': './MN-default',
    'write_raw_output_files': 0,
}
def _make_reco(numlive, tol, trial, time_limit):
    """Build one entry of `RECOS`.

    Parameters
    ----------
    numlive : int
        Number of MultiNest live points.
    tol : float
        MultiNest tolerance.
    trial : int
        Repetition index; distinguishes otherwise-identical configurations.
    time_limit : numeric
        Time limit for the reco, in seconds.

    Returns
    -------
    reco : dict
        Keys are 'name', 'time_limit', and 'kwargs'; 'kwargs' is a private
        deep copy of `MN_DEFAULT_KW` with 'prefix', 'time_limit', 'numlive',
        and 'tol' filled in.

    """
    reco_name = construct_reco_name(dims=8, numlive=numlive, tol=tol,
                                    trial=trial)
    kwargs = deepcopy(MN_DEFAULT_KW)
    kwargs['prefix'] = '%s_' % reco_name
    kwargs['time_limit'] = time_limit
    kwargs['numlive'] = numlive
    kwargs['tol'] = tol
    return dict(name=reco_name, time_limit=time_limit, kwargs=kwargs)


# NOTE: recos are referred to elsewhere by their *index* into `RECOS` (the
# --recos command line option and the "_recos..." tag in file names), so the
# order in which entries are appended below must not change.
RECOS = []
if RECOS_SET == 1:
    NUM_LIVEPOINTS = [1000, 10000]
    TOLERANCES = [1e-2]

    # High-resolution MultiNest runs
    HIRES_TRIALS = 1
    TIME_LIMIT_FACTOR = 1000
    for _numlive, _tol, _trial in product(NUM_LIVEPOINTS, TOLERANCES,
                                          range(HIRES_TRIALS)):
        _time_limit = 60 * int(np.round(np.clip(
            _numlive*2/3 + 84,
            a_min=10,
            a_max=22*60
        ))) * TIME_LIMIT_FACTOR
        RECOS.append(_make_reco(numlive=_numlive, tol=_tol, trial=_trial,
                                time_limit=_time_limit))

    # Repeated low-resolution runs, as (number of trials, numlive, tol,
    # minutes before scaling by TIME_LIMIT_FACTOR):
    #   * "standard" HybridReco settings from PINGU, re-run 10 times
    #   * 50, 25, and 10 livepoints, each repeated 20 times
    for _n_trials, _numlive, _tol, _minutes in [(10, 75, 1.1, 150),
                                                (20, 50, 0.01, 100),
                                                (20, 25, 0.01, 50),
                                                (20, 10, 0.01, 30)]:
        _time_limit = _minutes * 60 * TIME_LIMIT_FACTOR
        for _trial in range(_n_trials):
            RECOS.append(_make_reco(numlive=_numlive, tol=_tol, trial=_trial,
                                    time_limit=_time_limit))

elif RECOS_SET == 2:
    _time_limit = 479 * 60 # 479 min = 7h 59min = 28740 sec

    NUM_LIVEPOINTS = [50, 75]
    TOLERANCES = [1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1]
    TRIALS = 1
    for _trial, _numlive, _tol in product(range(TRIALS), NUM_LIVEPOINTS,
                                          TOLERANCES):
        RECOS.append(_make_reco(numlive=_numlive, tol=_tol, trial=_trial,
                                time_limit=_time_limit))

    # We should run multiple recos with nlive={50, 75} and
    # tol={1e-3, 1e-2, 1e-1} and also run a bunch of nlive=25 with
    # tol={1e-3, 1e-2, 1e-1} repeated versions of the above (recos 12 on) and
    # attach these instead
    RECOS = RECOS[:12]

    NUM_LIVEPOINTS = [25]
    TOLERANCES = [1e-3, 1e-2, 1e-1]
    TRIALS = 1
    for _trial, _numlive, _tol in product(range(TRIALS), NUM_LIVEPOINTS,
                                          TOLERANCES):
        RECOS.append(_make_reco(numlive=_numlive, tol=_tol, trial=_trial,
                                time_limit=_time_limit))

    NUM_LIVEPOINTS = [25, 50, 75]
    TOLERANCES = [1e-3, 1e-2, 1e-1]
    # By now we've already done one trial of each (trial0), so do 9 more and
    # start here with trial1
    TRIALS = 9
    for _trial, _numlive, _tol in product(range(1, 1 + TRIALS),
                                          NUM_LIVEPOINTS, TOLERANCES):
        RECOS.append(_make_reco(numlive=_numlive, tol=_tol, trial=_trial,
                                time_limit=_time_limit))

else:
    # Without this, an unknown value would only surface below as an opaque
    # "min() arg is an empty sequence" error.
    raise ValueError('Unknown RECOS_SET: %r (expected 1 or 2)' % (RECOS_SET,))

# Shortest time limit of any configured reco
MIN_RECO_TIME = min(r['time_limit'] for r in RECOS)
def if_proto(frame, reco_base, eval_criteria=None):
    """Decide whether a reconstruction should be run on `frame`.

    Intended as the basis of an IceTray module's / segment's `If` kwarg.
    IceTray calls that function with a single argument (`frame`), so
    `reco_base` (and `eval_criteria`, if used) must first be bound with
    `functools.partial`.

    Parameters
    ----------
    frame
        Frame to test; must support `Has(key)`.
    reco_base : string
        Name of the reco; the frame key looked for is
        `reco_base` + FIT_FIELD_SUFFIX.
    eval_criteria : None or string
        Python expression evaluated in this function's scope (so it can refer
        to `frame`). If it evaluates falsy, the reco is not run.

    Returns
    -------
    run : bool
        True only if the criteria (if any) pass and the frame does not yet
        contain the reco's field. The decision is also reported on stdout.

    """
    # Function-level import: these names are part of the scope that
    # `eval_criteria` is evaluated in.
    from icecube import dataclasses, dataio, icetray, multinest_icetray # pylint: disable=unused-variable, import-error

    field_name = reco_base + FIT_FIELD_SUFFIX

    # NOTE: `eval` executes arbitrary code; only pass trusted expressions.
    if eval_criteria is not None and not eval(eval_criteria): # pylint: disable=eval-used
        run = False
        message = ('[NOT RUN] Not running reco %s (event did not pass'
                   ' `eval_criteria`)\n' % field_name)
    elif not frame.Has(field_name):
        run = True
        message = '[RUN ] Running missing reco %s\n' % field_name
    else:
        # Field already present. (Re-running recos that previously hit their
        # time limit is not implemented.)
        run = False
        message = '[NOT RUN] Not running reco %s\n' % field_name

    sys.stdout.write(message)
    sys.stdout.flush()
    return run
# Give every reco an `If` predicate so that a reconstruction already present
# in an event is not repeated, and so that only events passing
# DRAGON_L5_CRITERIA are reconstructed at all.
for reco in RECOS:
    if_func = partial(if_proto, reco_base=reco['name'],
                      eval_criteria=DRAGON_L5_CRITERIA)
    reco['kwargs'].update(If=if_func)
def recos_from_path(filepath):
    """Parse a file path for the recos (reported to have been) run on the
    file.

    Only the file name is inspected, not the directories leading to it. Every
    "_recos<list>" tag found there is expanded with `hrlist2list` and the
    results are combined.

    Parameters
    ----------
    filepath : string

    Returns
    -------
    recos : list of integers
        Sorted; empty if the file name carries no "_recos" tag. Numbers that
        appear in more than one tag are repeated.

    """
    filename = basename(filepath)
    # Flatten directly instead of `reduce(operator.add, ...)`: `reduce` is not
    # a builtin on Python 3, and repeated list concatenation is quadratic.
    return sorted(
        reco_num
        for hrlist in RECO_RE.findall(filename)
        for reco_num in hrlist2list(hrlist)
    )
def path_from_recos(orig_path, recos, ext=EXTENSION):
    """Construct a new filename given original path/filename and a list of
    reconstructions that have been run on the file.

    Any existing "_recos<list>" tag is removed from the file name and a single
    new tag describing `recos` is inserted just before the extension.
    Directory components of `orig_path` are left untouched, even if they
    happen to contain something that looks like a "_recos" tag.

    Parameters
    ----------
    orig_path : string
        Original filename or path to modify in order to construct a new
        filename/path

    recos : sequence of zero or more integers
        Which reconstructions were run on the file

    ext : string
        Extension of original file path; used also in new path

    Returns
    -------
    new_path : string

    Raises
    ------
    AssertionError
        If `orig_path` does not end with `ext`.

    """
    assert orig_path.endswith(ext)

    # Strip the extension. Slice by length rather than with `[:-len(ext)]`,
    # which would yield an empty string for an empty `ext`.
    stem = orig_path[:len(orig_path) - len(ext)]

    # Split off the file name without normalizing the directory part
    filename = basename(stem)
    directory = stem[:len(stem) - len(filename)]

    # Construct a concise string indicating recos run (or '' if none were run)
    if recos:
        reco_str = '_recos' + list2hrlist(sorted(recos))
    else:
        reco_str = ''

    # Put it all together
    return directory + RECO_RE.sub('', filename) + reco_str + ext
def acquire_lock(lock_path, lock_info=None):
    """Acquire a lock on the file at `lock_path` and record `lock_info` to
    that file.

    The file is created if it does not exist. Acquisition is retried (with a
    short random sleep between attempts) for up to `LOCK_ACQ_TIMEOUT` seconds.

    Parameters
    ----------
    lock_path : string

    lock_info : None or Mapping
        If not None, each item is written to the lock file as a line formatted
        with `LOCK_FMT`, replacing any previous contents.

    Returns
    -------
    lock_f : file object
        This holds an exclusive lock; close the file or use fcntl.flock to
        release the lock.

    Raises
    ------
    IOError: [Errno 11] Resource temporarily unavailable
        The lock could not be obtained before the timeout, i.e., it is held
        on the file by someone else.

    IOError (other errno), OSError, AssertionError
        Anything else that goes wrong while locking or writing `lock_info`
        (`lock_info` not being a Mapping raises AssertionError).

    In all failure cases the file opened here is closed again before the
    exception propagates.

    Notes
    -----
    See
      https://loonytek.com/2015/01/15/advisory-file-locking-differences-between-posix-and-bsd-locks
    for more info about locks. Locking is "cooperative," so another process
    can simply ignore the `flock` locking protocol altogether and
    read/write/delete the file.

    NOTE(review): an earlier version of this docstring described `flock` as
    POSIX (not BSD) locking and stated that one process can re-acquire the
    lock any number of times, with the first close of *any* descriptor on the
    file releasing it. `fcntl.flock` wraps flock(2), which is normally the
    BSD-style lock; which semantics actually apply (e.g. on NFS, where flock
    may be emulated with fcntl locks) should be confirmed for the target
    system. If the fcntl semantics apply, closing the second descriptor used
    below to write `lock_info` would also drop the lock -- verify.

    """
    import errno

    deadline = time.time() + LOCK_ACQ_TIMEOUT
    lock_f = open(lock_path, 'a')
    succeeded = False
    try:
        lock_acquired = False
        while time.time() <= deadline:
            try:
                flock(lock_f, LOCK_EX | LOCK_NB)
            except IOError as err:
                if err.errno != errno.EAGAIN:
                    raise
                # Held by someone else: back off briefly and retry
                wstdout('.')
                time.sleep(random.random()*LOCK_ACQ_TIMEOUT/100)
            else:
                lock_acquired = True
                # Stop as soon as the lock is ours instead of re-locking in a
                # busy loop until the deadline passes.
                break

        if not lock_acquired:
            exc = IOError('[Errno 11] Resource temporarily unavailable')
            exc.errno = errno.EAGAIN
            raise exc

        if lock_info is not None:
            assert isinstance(lock_info, Mapping)
            # Write info out to the lock through a new, write-able file
            # descriptor; the lock itself was taken through `lock_f`.
            with open(lock_path, 'w') as lock_w:
                for k, v in lock_info.items():
                    lock_w.write(LOCK_FMT % (k, v))
                # Done while `lock_w` is still open, so this works whether
                # `chown_and_chmod` uses the descriptor or the file's name.
                try:
                    chown_and_chmod(lock_w, gid=GID, mode=MODE)
                except OSError as err:
                    # Operation not permitted is tolerated (best effort)
                    if err.errno != errno.EPERM:
                        raise

        succeeded = True
        return lock_f
    finally:
        if not succeeded:
            # Don't leak the descriptor (or a half-set-up lock) on failure
            lock_f.close()
def read_lockfile(path):
    """Read the contents of a lockfile and convert into an OrderedDict.

    Each non-blank line is expected to look like "<key><LOCK_SEP><value>";
    only the first occurrence of `LOCK_SEP` splits the line, so values may
    themselves contain the separator. The line terminator is not part of the
    value.

    Parameters
    ----------
    path : string

    Returns
    -------
    lock_info : OrderedDict
        Items appear in file order. Values are strings, except:
          * 'acquired_at', 'expires_at' : int, seconds since the epoch
          * 'pid' : int

    Raises
    ------
    ValueError
        If a non-blank line does not contain `LOCK_SEP`, or a 'pid' value is
        not an integer.

    """
    import calendar

    with open(path, 'r') as f:
        lines = f.readlines()

    lock_info = OrderedDict()
    for line in lines:
        line = line.rstrip('\r\n')
        if not line.strip():
            continue
        k, v = line.split(LOCK_SEP, 1)
        if k in ('acquired_at', 'expires_at'):
            dt = date_parse(v)
            # Convert to integer seconds since the epoch, for comparison with
            # `time.time()`. Going through the UTC time tuple is independent
            # of the local timezone and avoids the non-portable
            # `strftime('%s')`. A timestamp without timezone info is taken to
            # be UTC (timestamps are expected to all be in UTC).
            v = int(calendar.timegm(dt.utctimetuple()))
        elif k == 'pid':
            v = int(v)
        lock_info[k] = v
    return lock_info
def cleanup_lock_f(lock_f, force_remove=False):
    """Remove a lock file and release the lock held on it (in that order, to
    ensure another process doesn't create a new lockfile after releasing lock
    and prior to this function removing the file). If that fails, though,
    release the lock and remove the file.

    Parameters
    ----------
    lock_f : open file object, or None
        If None, this simply returns without an exception.

    force_remove : bool
        Remove the lock file even if we don't hold a lock on it. WARNING! This
        is unsafe behavior in multi-threaded/multi-processing situations.

    Raises
    ------
    AssertionError
        If lock_f is not a file object

    ValueError
        If lock_f is a closed file object and `force_remove` is False

    OSError
        If the file cannot be removed for a reason other than it not existing
        (or, for the attempt made after the lock was released, the device or
        resource being busy).

    """
    import errno

    if lock_f is None:
        return

    try:
        file_types = (file,) # Python 2 builtin # pylint: disable=undefined-variable
    except NameError:
        import io
        file_types = (io.IOBase,)
    if not isinstance(lock_f, file_types):
        raise AssertionError('`lock_f` must be a file object; got %s'
                             % type(lock_f))

    if lock_f.closed:
        if not force_remove:
            raise ValueError(
                'Lock file is already closed; refusing to remove file.'
            )
        try:
            remove(lock_f.name)
        except OSError as err:
            # The file not existing is fine; otherwise, raise the exception
            # since the file could not be deleted.
            if err.errno != errno.ENOENT:
                raise
        return

    # First try to remove the file while still holding the lock
    removed = False
    for _ in range(10):
        try:
            remove(lock_f.name)
        except OSError as err:
            if err.errno == errno.ENOENT:
                removed = True
                break
            if err.errno == errno.EIO:
                time.sleep(0.01)
            # Any other error: simply try again
        else:
            # Stop right after a successful removal. Trying again could
            # delete a lock file that another process has created at the
            # same path in the meantime.
            removed = True
            break

    lock_f.close()

    # Last resort: remove after having released the lock
    if not removed:
        try:
            remove(lock_f.name)
        except OSError as err:
            if err.errno not in (errno.ENOENT, errno.EBUSY):
                raise
class EventCounter(object):
    """Count events (frames containing a given pulse series) and decide, from
    an event's position in that count, whether it should be processed.

    The `process_event` method is meant to be handed to IceTray as a module.

    Parameters
    ----------
    srt_pulse_name : string
        Name of the pulse series to look for in the frame. A frame counts as
        an event only if it contains this key.

    skip : int >= 0
        Number of events to skip over. If 0, no events will be skipped.

    n_events : int
        Total number of events to process, starting from `skip`. If <= 0,
        *all* events will be processed starting from `skip`.

    Attributes
    ----------
    event_number : int
        Zero-based index of the most recent event seen; -1 before the first.

    events_run : list of int
        Indices of the events for which `process_event` returned True.

    """
    def __init__(self, srt_pulse_name, skip, n_events):
        self.srt_pulse_name = srt_pulse_name
        self.skip = skip
        self.n_events = n_events
        self.event_number = -1
        self.events_run = []

    def process_event(self, *args, **kwargs):
        """Pass this method as an IceTray module.

        The frame is taken from the `frame` keyword argument if present,
        otherwise from the single positional argument.

        Returns
        -------
        process : bool
            False if no frame was given, the frame is not an event, or the
            event lies outside the requested range; True otherwise.

        Raises
        ------
        ValueError
            If more than one positional argument is given (and no `frame`
            keyword argument).

        """
        if 'frame' in kwargs:
            frame = kwargs['frame']
        else:
            if len(args) > 1:
                raise ValueError('Got %d frames, can only handle 1.'
                                 % len(args))
            frame = args[0] if args else None

        # Only frames carrying the pulse series count as events
        if frame is None or not frame.Has(self.srt_pulse_name):
            return False
        self.event_number += 1

        # Still within the events to be skipped?
        before_start = self.event_number < self.skip
        if before_start and (self.skip != 0 or self.event_number != -1):
            return False

        # Past the last event requested?
        stop_at = self.skip + self.n_events
        if self.n_events > 0 and self.event_number >= stop_at:
            return False

        wstdout('> Processing an event; total events processed will be %5d\n'
                % (self.event_number + 1))
        self.events_run.append(self.event_number)
        return True
class FileLister(object):
    """Hand out input file(s) one at a time.

    Either `infile` or `indir` must be specified, but not both. Note that no
    check for lock files is made here.

    Parameters
    ----------
    infile : string
        File to return (at most once).

    indir : string
        Directory whose files ending in `EXTENSION` are returned, one per
        call. The directory is listed once, at construction.

    """
    def __init__(self, infile=None, indir=None):
        self.infile = infile
        self.indir = indir
        self.used_file = False

        if infile is None and indir is None:
            raise ValueError('Either `infile` or `indir` must not be None.')

        if infile is not None:
            assert indir is None
            self.mode = 'infile'
            self.files = [infile]
        else:
            self.mode = 'indir'
            self.files = glob(join(indir, '*' + EXTENSION))

        self.next_file = None

    def get_next_file(self):
        """Retrieve the next file.

        Returns
        -------
        next_file : string or None
            None once there is nothing left to hand out.

        """
        if self.mode == 'infile':
            # The single file is handed out on the first call only
            self.next_file = None if self.used_file else self.infile
            self.used_file = True
        elif self.mode == 'indir':
            self.next_file = self._get_file_from_dir()
        return self.next_file

    def _get_file_from_dir(self):
        """Take the last remaining file off the list (None if exhausted)."""
        if not self.files:
            return None
        return self.files.pop()
def parse_args(descr=__doc__):
    """Parse and validate command line arguments.

    Parameters
    ----------
    descr : string
        Description shown in the command line help.

    Returns
    -------
    args : argparse.Namespace
        The parsed options, with paths expanded (`infile`, `indir`, and
        `outdir` also made absolute) and the following attributes added:
          * requested : list of int -- indices into `RECOS` to run
          * srt_pulse_name, geometry : string -- derived from --detector
          * seconds_remaining : int -- combined from --minutes-remaining and
            --hours-remaining; "no limit" is represented by 31556926

    Raises
    ------
    IOError
        If `infile` or `gcd` is not a file, or `indir` or `outdir` is not a
        directory.

    ValueError
        If --skip is negative; if not exactly one of --infile and --indir is
        given; if the output directory is the same as the input directory; or
        if both --minutes-remaining and --hours-remaining are given but
        disagree.

    Notes
    -----
    The output directory is created if it does not exist.

    """
    parser = ArgumentParser(
        description=descr,
        formatter_class=ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        '--infile',
        default=None,
        help='''Path to the input file. If specified, do not specify --indir'''
    )
    parser.add_argument(
        '--indir',
        default=None,
        help='''Path to the input directory, from which all I3 files will be
        processed. If specified, do not specify --infile'''
    )
    parser.add_argument(
        '--outdir',
        required=True,
        help='''Output directory; must not be same as indir (or infile's
        directory), as the source file may be corrupted or removed if an error
        is encountered.''',
    )
    parser.add_argument(
        '--gcd',
        required=True,
        help='Path to GCD file',
    )
    parser.add_argument(
        '--skip',
        type=int, default=0,
        help='''Number of events to skip. Note that an "event" is defined as a
        frame containing the SRT_PULSE_NAME.''',
    )
    parser.add_argument(
        '--n-events',
        type=int, default=0,
        help='''Total number of "events" to process (n-events <= 0 processes
        all events in the file(s) starting from --skip). Note that an
        event is defined as a frame containing the SRT_PULSE_NAME.''',
    )
    parser.add_argument(
        '--recos',
        type=str.lower, default='all',
        help='''Reco numbers to run. Specify "all" or a human-readable list,
        like "0-2,5" (which would perform steps 0, 1, 2, and 5). Note that
        indexing begins at 0.''',
    )
    parser.add_argument(
        '--detector',
        required=True, type=str.lower, choices=['deepcore', 'pingu'],
        help='''Detector for which the MC or data was produced (this selects an
        appropriate SRT_PULSE_NAME)'''
    )
    parser.add_argument(
        '--minutes-remaining',
        type=float, default=np.inf,
        help='''Minutes remaining in a job to run a reco; only those
        reconstructions with time limits less than this will run. Specify <= 0
        for no limit.'''
    )
    parser.add_argument(
        '--hours-remaining',
        type=float, default=np.inf,
        help='''Hours remaining in a job to run a reco; only those
        reconstructions with time limits less than this will run. Specify <= 0
        for no limit.'''
    )
    parser.add_argument(
        '--use-locks',
        action='store_true',
        help='''EXPERIMENTAL (and flaky): Use file locking to protect files
        from being processed by multiple separate processes.'''
    )

    args = parser.parse_args()

    # NOTE: user input is validated with explicit exceptions rather than
    # `assert`, since assertions are skipped when Python runs with -O.
    if args.skip < 0:
        raise ValueError('--skip must be >= 0; got %d' % args.skip)

    if args.recos == 'all':
        args.requested = list(range(len(RECOS)))
    else:
        args.requested = hrlist2list(args.recos)

    # Exactly one of --infile and --indir must be given
    num_inspecs = 0
    if args.infile is not None:
        args.infile = abspath(expand(args.infile))
        if not isfile(args.infile):
            raise IOError('`infile` "%s" is not a file.' % args.infile)
        num_inspecs += 1

    if args.indir is not None:
        args.indir = abspath(expand(args.indir))
        if not isdir(args.indir):
            raise IOError('`indir` "%s" is not a directory.' % args.indir)
        num_inspecs += 1

    if num_inspecs != 1:
        raise ValueError(
            'Either --infile or --indir must be specified but not both.'
        )

    # Output must go to a different directory than the input comes from
    if args.infile is not None:
        indir = abspath(dirname(expand(args.infile)))
    else:
        indir = abspath(expand(args.indir))

    args.outdir = abspath(expand(args.outdir))
    if args.outdir == indir:
        raise ValueError(
            'Outdir cannot be same as indir (or if infile is specified,'
            ' directory in which infile resides)'
        )
    mkdir(args.outdir, warn=False)
    if not isdir(args.outdir):
        raise IOError('`outdir` "%s" is not a directory.' % args.outdir)

    args.gcd = expand(args.gcd)
    if not isfile(args.gcd):
        raise IOError('`gcd` "%s" is not a file.' % args.gcd)

    if args.detector == 'pingu':
        args.srt_pulse_name = 'newSRT_TW_Cleaned_WavedeformPulses'
        args.geometry = 'pingu'
    elif args.detector == 'deepcore':
        args.srt_pulse_name = 'SRTTWOfflinePulsesDC'
        args.geometry = 'deepcore'

    # Combine the two ways of specifying the time remaining; an infinite
    # value means the respective option was not given (that is its default).
    if np.isinf(args.minutes_remaining):
        if np.isinf(args.hours_remaining):
            args.seconds_remaining = np.inf
        else:
            args.seconds_remaining = args.hours_remaining * 3600
    else:
        if (not np.isinf(args.hours_remaining)
                and args.minutes_remaining != args.hours_remaining*60):
            raise ValueError(
                '--minutes-remaining (%s) and --hours-remaining (%s) were'
                ' both specified but do not agree.'
                % (args.minutes_remaining, args.hours_remaining)
            )
        args.seconds_remaining = args.minutes_remaining * 60

    # <= 0 means "no limit"
    if args.seconds_remaining <= 0:
        args.seconds_remaining = np.inf

    # Make it a finite integer; the cap of 31556926 sec is about one year
    args.seconds_remaining = int(np.ceil(np.clip(args.seconds_remaining,
                                                 a_min=0, a_max=31556926)))

    return args
def main():
"""Main"""
start_time_sec = time.time()
args = parse_args()
def _sigint_handler(signal, frame): # pylint: disable=unused-argument, redefined-outer-name
wstderr('='*79 + '\n')
wstderr('*** CAUGHT CTL-C (sigint) *** ... attempting to cleanup!\n')
wstderr('='*79 + '\n')
raise KeyboardInterrupt
# Import IceCube things now
from I3Tray import I3Tray # pylint: disable=import-error
from icecube import dataclasses, dataio, icetray, multinest_icetray # pylint: disable=unused-variable, import-error
from cluster import get_spline_tables
lock_info = get_process_info()
wstdout('='*79 + '\n')
for d in [vars(args), lock_info]:
wstdout('\n')
wstdout(
'\n'.join([(('%20s'%k) + ' = %s'%d[k]) for k in sorted(d.keys())])
)
wstdout('\n'*2)
file_lister = FileLister(infile=args.infile, indir=args.indir)
event_counter = EventCounter(srt_pulse_name=args.srt_pulse_name,
skip=args.skip, n_events=args.n_events)
expiration = time.time() + args.seconds_remaining
expiration_timestamp = timestamp(at=expiration, utc=True)
while True:
infile_path = file_lister.get_next_file()
if infile_path is None:
wstdout('> No more files that can be processed. Quitting.\n')
break
# NOTE: cannot run on a file that has _all_ recos already run, since
# output file cannot be same as input file (which it will have same
# name, since the name is derived from recos run / etc.)
already_run = recos_from_path(infile_path)
# NOTE: now skipping a reco is determined ONLY by the "If" kwarg, and
# not by the filename at all (swap the comment on the next line for the
# line below to change behavior back)
#recos_not_run_yet = sorted(set(args.requested) - set(already_run))
recos_not_run_yet = sorted(set(args.requested))
if not recos_not_run_yet:
wstdout('> Nothing more to be done on file. Moving on. ("%s")\n'
% infile_path)
continue
# See if file still exists
if not isfile(infile_path):
wstdout('> File no longer exists. Moving on. ("%s")\n'
% infile_path)
continue
# Skip if empty input files
if getsize(infile_path) == 0:
wstdout('> Input file is 0-length. Moving on. ("%s")\n'
% infile_path)
continue
# NOTE: commenting out the following and forcing an extremely long
# timeout to allow all recos to run (of which many won't have to,
# becuase they've already been run). Uncomment the following three
# lines and comment out the "time_remaining =" line below to change the
# behavior back when most or all recos have to be run
#time_remaining = np.ceil(
# args.seconds_remaining - (time.time() - start_time_sec)
#)
time_remaining = 3600 * 24 * 10000
# See if any reco at all fits in the remaining time
if time_remaining <= MIN_RECO_TIME:
wstdout('Not enough time to run *any* reco. Quitting.\n')
break
# See if any of the recos needing to be run on *this* file fit in the
# remaining time; register all `reco_num`s that can be run
recos_to_run = []
after_proc_time_remaining = time_remaining
for reco_num in recos_not_run_yet:
time_limit = RECOS[reco_num]['time_limit']
if time_limit > after_proc_time_remaining:
continue
recos_to_run.append(reco_num)
after_proc_time_remaining -= time_limit
time_to_run_processing = time_remaining - after_proc_time_remaining
# Give the lock an extra minute beyond the strict time to run
expiration = time.time() + time_to_run_processing + 60
expiration_timestamp = timestamp(at=expiration, utc=True)
if not recos_to_run:
wstdout('Not enough time to run any remaining reco on file. Moving'
' on. ("%s")\n' % infile_path)
continue
infile_lock_f, outfile_lock_f = None, None
infile_lock_path = infile_path + LOCK_SUFFIX
outfile_lock_path = None
allrecos = set(recos_to_run).union(already_run)
outfile_name = basename(
path_from_recos(orig_path=infile_path, recos=allrecos)
)
outfile_path = abspath(expand(join(args.outdir, outfile_name)))
#print('args.outdir: "%s", outfile_name: "%s", outfile_path: "%s"'
# % (args.outdir, outfile_name, outfile_path))
#break # debug
outfile_lock_path = outfile_path + LOCK_SUFFIX
if outfile_name == infile_path or outfile_path == infile_path:
wstdout(
'Outfile is same as infile, which will lead to removal of'
' infile. Path = "%s" ; Moving on to next input file.\n'
% infile_path
)
continue
lock_info['acquired_at'] = timestamp(utc=True)
lock_info['expires_at'] = expiration_timestamp
lock_info['infile'] = infile_path
lock_info['outfile'] = outfile_path
if isfile(outfile_path):
wstdout('> Outfile path exists; will overwrite if both infile and'
' outfile locks can be obtained! ...\n'
'> "%s"\n' % outfile_path)
# NOTE:
# Create lockfiles (if they don't exist) for each of the infile and
# outfile, and try to acquire exclusive locks on these before
# working with either the infile or outfile.
#
# Also: write info to the lockfiles to know when it's okay to clean
# each up manually. Note that the `flock` will be removed by the OS
# as soon as the lock file is closed or when this process dies.
lock_info['type'] = 'infile_lock'
try:
if args.use_locks:
infile_lock_f = acquire_lock(infile_lock_path, lock_info)
except IOError:
wstdout(
'> infile lock failed to be obtained.'