def fix_interrupts(name):
    """Recover jobs that were wrongly marked as failed (e.g. by an
    interrupted run): any job in the 'fail' queue that actually produced
    output (per ``_has_output``) is moved to the matching 'success' queue.

    name: path to an existing fail-queue directory; must end with 'fail'.
          The sibling success queue is derived by replacing 'fail' with
          'success' in the path.

    Raises ValueError on a missing directory or a non-'fail' path.
    """
    # Validate with explicit raises: `assert` is stripped under `python -O`.
    if not (os.path.exists(name) and os.path.isdir(name)):
        raise ValueError("queue directory does not exist: %s" % name)
    if not name.endswith('fail'):
        raise ValueError("expected a 'fail' queue directory: %s" % name)

    queue_fail = QueueDir(name)
    queue_success = QueueDir(name.replace('fail', 'success'))
    restore_count = 0
    queue_fail_size = queue_fail.qsize()
    fail_files = queue_fail.list_files()

    # Index the success queue by job_id once, for O(1) duplicate checks.
    # (Only membership is needed, so a set suffices.)
    success_cache = set()
    for i in range(queue_success.qsize()):
        success_cache.add(queue_success.peek(i)['job_id'])

    # Walk the fail queue from the highest index down so that removals do
    # not shift the indices of entries not yet visited, and so fail_files
    # (captured above) stays aligned with the remaining entries.
    for i in range(queue_fail.qsize() - 1, -1, -1):
        jd = queue_fail.peek(i)
        if not _has_output(name, jd):
            continue
        if jd['job_id'] in success_cache:
            # Already recorded as successful; leave the duplicate in place
            # for manual inspection rather than silently dropping it.
            print("WARN: already in success (%s)" % fail_files[i])
            continue
        print("seemsOK: %d" % jd['job_id'])
        restore_count += 1
        queue_fail.remove(i)
        jd['ex_status'] = jd['status']  # keep the original status for audit
        jd['status'] = 'SUCCESS'
        queue_success.put(jd)
    print("restored %d JDs of %d" % (restore_count, queue_fail_size))
def check_success(start_id, stop_id):
    """Scan the success queues of groups mc01..mc20 in parallel for jobs
    that left no output, dump the offenders to ``no_output.dump`` (pickle),
    and move each one back to the corresponding fail queue.

    start_id, stop_id: currently unused; kept for interface compatibility
                       with existing callers.
    """
    group_names = ["mc%02d" % i for i in range(1, 21)]
    print(group_names)

    pool = multiprocessing.Pool(POOL_SIZE)
    try:
        # Each worker returns a mapping of "queue_name:index" -> job descriptor.
        unsuccessful = pool.map(_find_no_output, group_names)
    finally:
        # Fix: the pool was never shut down, leaking worker processes.
        pool.close()
        pool.join()

    unsuccessful = [x for x in unsuccessful if len(x) > 0]
    print(unsuccessful)
    with open("no_output.dump", 'w') as fh:
        cPickle.dump(unsuccessful, fh)

    for unx in unsuccessful:
        # Every key in one mapping comes from the same group, so any key
        # yields the same queue name ("<queue_name>:<index>").
        queue_succ_name = next(iter(unx)).split(':')[0]
        queue_succ = QueueDir(queue_succ_name)
        queue_fail = QueueDir(queue_succ_name.replace('success', 'fail'))
        # Remove from the highest index down so earlier removals do not
        # shift the indices of entries still pending removal.
        for key in sorted(unx, key=lambda k: int(k.split(':')[1]), reverse=True):
            idx = int(key.split(':')[1])  # renamed from `id`: shadowed builtin
            jd = unx[key]
            print("%s -> fail (%d)" % (key, jd['job_id']))
            queue_fail.put(jd)
            queue_succ.remove(idx)