示例#1
0
文件: qmgr.py 项目: anaderi/skygrid
def fix_interrupts(name):
    """Move jobs that did produce output from a fail queue to its success queue.

    The fail queue at *name* is walked from its last entry to its first.
    Every job description for which ``_has_output(name, jd)`` is true is
    stored in the sibling success queue with its previous status saved under
    ``'ex_status'`` and ``'status'`` set to ``'SUCCESS'``, and is then removed
    from the fail queue.  Jobs whose ``'job_id'`` is already present in the
    success queue are reported and left where they are.

    Args:
        name: Path of an existing queue directory whose name ends in 'fail'.

    Raises:
        AssertionError: If *name* is not a directory or does not end in 'fail'.
    """
    assert os.path.isdir(name), "not a directory: %s" % name
    assert name.endswith('fail'), "not a fail queue: %s" % name
    queue_fail = QueueDir(name)
    # Swap only the trailing 'fail'; str.replace would also rewrite any other
    # "fail" occurring earlier in the path (e.g. a parent directory name).
    queue_success = QueueDir(name[:-len('fail')] + 'success')
    restore_count = 0
    queue_fail_size = queue_fail.qsize()
    fail_files = queue_fail.list_files()

    # Only membership of job ids is ever tested, so a set is sufficient.
    success_ids = set(
        queue_success.peek(i)['job_id'] for i in range(queue_success.qsize())
    )

    # Iterate backwards so that removing entry i leaves the indices of the
    # entries still to be visited (and of fail_files) unchanged.
    for i in range(queue_fail.qsize() - 1, -1, -1):
        jd = queue_fail.peek(i)
        if not _has_output(name, jd):
            continue
        job_id = jd['job_id']
        if job_id in success_ids:
            print("WARN: already in success (%s)" % fail_files[i])
            continue
        print("seemsOK: %d" % job_id)
        jd['ex_status'] = jd['status']
        jd['status'] = 'SUCCESS'
        # Store in the success queue before removing from the fail queue so
        # that an error in between cannot lose the job description.
        queue_success.put(jd)
        queue_fail.remove(i)
        # Remember the id so a duplicate fail entry is not restored twice.
        success_ids.add(job_id)
        restore_count += 1
    print("restored %d JDs of %d" % (restore_count, queue_fail_size))
示例#2
0
文件: qmgr.py 项目: anaderi/skygrid
def check_success(start_id, stop_id):
    """Move jobs reported by ``_find_no_output`` from success to fail queues.

    ``_find_no_output`` is run in a process pool for the groups ``mc01`` to
    ``mc20``.  The non-empty results are printed and pickled to
    ``no_output.dump``.  For each result, every job description is put into
    the fail queue and removed from the success queue, highest index first.

    Args:
        start_id: Unused by this function; kept for interface compatibility.
        stop_id: Unused by this function; kept for interface compatibility.
    """
    def _key_index(key):
        # Keys are used as "<success queue name>:<index>" by the code below.
        return int(key.split(':')[1])

    group_names = ["mc%02d" % i for i in range(1, 21)]
    print(group_names)

    pool = multiprocessing.Pool(POOL_SIZE)
    try:
        unsuccessful = pool.map(_find_no_output, group_names)
    finally:
        # Always shut the workers down; otherwise the pool's processes are
        # left behind when this function returns or raises.
        pool.close()
        pool.join()

    unsuccessful = [x for x in unsuccessful if x]
    print(unsuccessful)
    with open("no_output.dump", 'w') as fh:
        cPickle.dump(unsuccessful, fh)

    for unx in unsuccessful:
        # The queue name is taken from the first key only.
        # NOTE(review): assumes all keys of one result share a queue - confirm.
        queue_succ_name = next(iter(unx)).split(':')[0]
        if queue_succ_name.endswith('success'):
            # Swap only the trailing 'success' so that a "success" earlier in
            # the path is not rewritten as well.
            queue_fail_name = queue_succ_name[:-len('success')] + 'fail'
        else:
            queue_fail_name = queue_succ_name.replace('success', 'fail')
        queue_succ = QueueDir(queue_succ_name)
        queue_fail = QueueDir(queue_fail_name)
        # Highest index first, so removing an entry does not shift the
        # indices of the entries that are still to be removed.
        for key in sorted(unx, key=_key_index, reverse=True):
            index = _key_index(key)
            jd = unx[key]
            print("%s -> fail (%d)" % (key, jd['job_id']))
            queue_fail.put(jd)
            queue_succ.remove(index)