Exemplo n.º 1
0
def work(server, task):
    """Run one solver on one problem and record the outcome in the database.

    Looks up the solver and problem rows referenced by ``task``, downloads the
    problem blob into a temporary directory, hands it to ``call_solver`` and
    inserts a single row into ``eval_problems``.

    Args:
        server: forwarded unchanged to ``call_solver``.
        task: object exposing ``eval_id``, ``problem_id``, ``solver_id`` and
            ``timeout_s``.

    Returns:
        None.

    Raises:
        AssertionError: if (with assertions enabled) the reported status is
            not one of "UNSAT", "SAT" or "UNKNOWN".
    """
    solver_row = util.db_lookup_one(table='eval_solvers',
                                    kvs={"solver_id": task.solver_id})
    problem_row = util.db_lookup_one(table='sat_problems',
                                     kvs={"problem_id": task.problem_id})
    blob_service = BlockBlobService(account_name=auth.store_name(),
                                    account_key=auth.store_key())

    # All on-disk artifacts live in a scratch directory that is removed when
    # the ``with`` block exits.
    with tempfile.TemporaryDirectory() as scratch_dir:
        dimacs_path = os.path.join(scratch_dir,
                                   "%s.dimacs" % str(uuid.uuid4()))
        results_path = os.path.join(scratch_dir, "results.out")
        blob_service.get_blob_to_path(problem_row['bcname'],
                                      problem_row['bname'],
                                      dimacs_path)
        outcome = call_solver(server=server,
                              sinfo=solver_row,
                              dimacs=dimacs_path,
                              timeout_s=task.timeout_s,
                              outfilename=results_path)
        status, n_secs_user, n_calls, n_fails, n_secs_gpu = outcome

    assert status in ("UNSAT", "SAT", "UNKNOWN")

    # Column values for the new ``eval_problems`` row, in insertion order.
    row = {
        "eval_id": task.eval_id,
        "problem_id": task.problem_id,
        "solver_id": task.solver_id,
        "timeout_s": task.timeout_s,
        "n_secs_user": n_secs_user,
        "n_calls": n_calls,
        "n_fails": n_fails,
        "n_secs_gpu": n_secs_gpu,
        "status": status,
    }
    util.db_insert(table="eval_problems", **row)
Exemplo n.º 2
0
    def process_task_result():
        """Consume finished tasks forever, recording and writing their datapoints.

        Blocks on ``task_result_q``. For each ``(task, task_result)`` pair it
        deletes the task's input blob, then for every entry of
        ``task_result.btfds`` loads it with ``from_blob`` (``delete=True``),
        inserts a ``gd_dps`` row and writes the datapoint through ``tftdw``.
        This function never returns.
        """
        while True:
            finished_task, outcome = task_result_q.get()
            delete_blob(opts, bbs, finished_task.bcnf)

            for blob_ref in outcome.btfds:
                datum = from_blob(opts, bbs, blob_ref, delete=True)
                assert datum.n_vars > 0
                assert datum.n_clauses > 0

                # Summary statistics stored alongside the datapoint.
                cell_count = np.shape(datum.CL_idxs)[0]
                core_var_fraction = float(
                    np.mean(datum.core_var_mask.astype(np.float32)))
                core_clause_fraction = float(
                    np.mean(datum.core_clause_mask.astype(np.float32)))

                task_id = finished_task.id
                new_dp_id = util.db_insert(
                    table='gd_dps',
                    gd_id=opts.gd_id,
                    problem_id=task_id.problem_id,
                    node_id=task_id.node_id,
                    node_depth=task_id.node_depth,
                    is_train=task_id.is_train,
                    n_vars=datum.n_vars,
                    n_clauses=datum.n_clauses,
                    n_cells=cell_count,
                    percent_vars_in_core=core_var_fraction,
                    percent_clauses_in_core=core_clause_fraction)

                record = tfd_to_tftd(dp_id=new_dp_id,
                                     is_train=finished_task.id.is_train,
                                     tfd=datum)
                tftdw.write_tftd(tftd=record)
Exemplo n.º 3
0
def work(server, task):
    """Solve one problem with a NeuroSAT-guided solver and record the result.

    Opens a Pyro4 proxy to ``server``, builds a ``NeuroSolver`` whose guess
    callback forwards queries to that proxy, loads the problem referenced by
    ``task`` from blob storage, runs the solver with the task's timeout and
    inserts one row into ``eval_problems``.

    The proxy is used as a context manager so its connection is released as
    soon as solving finishes instead of lingering until garbage collection.

    Args:
        server: Pyro URI of the NeuroSAT server to query.
        task: object exposing ``eval_id``, ``problem_id``, ``solver_id`` and
            ``timeout_s``.

    Returns:
        None.
    """
    util.log(kind='info', author='eval-work',
             msg='starting on %d:%d:%s' % (task.problem_id, task.solver_id, server))
    util.set_pyro_config()

    with Pyro4.Proxy(server) as proxy:
        util.log(kind='info', author='eval-work', msg='connected to %s' % server)

        def query(nm_args):
            """Ask the remote server for guesses; never raises.

            Any failure (remote error, ``None`` reply) is reported to the
            solver as ``neurosat_failed_to_guess()`` so solving can continue.
            """
            try:
                args = NeuroSATArgs(n_vars=nm_args.n_vars,
                                    n_clauses=nm_args.n_clauses,
                                    CL_idxs=nm_args.CL_idxs)
                guesses = proxy.query(args)
                if guesses is None:
                    return neurominisat.neurosat_failed_to_guess()
                return neurominisat.NeuroSATGuesses(
                    n_secs_gpu=guesses['n_secs_gpu'],
                    pi_core_var_logits=guesses['pi_core_var_logits'])
            except Exception as e:
                tb = traceback.format_exc()
                util.log(kind='error', author='query',
                         msg="TASK: %s\n%s\n%s" % (str(task), str(e), tb))
                return neurominisat.neurosat_failed_to_guess()

        sinfo = util.db_lookup_one(table='eval_solvers',
                                   kvs={"solver_id": task.solver_id})
        s = neurominisat.NeuroSolver(func=query, cfg=build_neurosat_config(sinfo))
        pinfo = util.db_lookup_one(table='sat_problems',
                                   kvs={"problem_id": task.problem_id})
        bbs = BlockBlobService(account_name=auth.store_name(),
                               account_key=auth.store_key())

        # The DIMACS file is only needed while the solver parses it; the
        # temporary directory is removed right after loading.
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpfilename = os.path.join(tmpdir, "%s.dimacs" % str(uuid.uuid4()))
            bbs.get_blob_to_path(pinfo['bcname'], pinfo['bname'], tmpfilename)
            s.from_file(filename=tmpfilename)

        # Must run while the proxy is still open: the solver calls ``query``.
        results = s.check_with_timeout_s(timeout_s=task.timeout_s)

    # Keeps the part after the first "." of str(status); presumably the
    # status is an enum rendered as "Type.NAME" -- TODO confirm.
    util.db_insert(table="eval_problems",
                   eval_id=task.eval_id,
                   problem_id=task.problem_id,
                   solver_id=task.solver_id,
                   timeout_s=task.timeout_s,
                   n_secs_user=results.n_secs_user,
                   n_secs_call=results.n_secs_call,
                   n_secs_gpu=results.n_secs_gpu,
                   status=str(results.status).split(".")[1])
    return None
Exemplo n.º 4
0
def evaluate_cdcl(opts):
    """Run every task selected by ``opts.task_query`` on remote workers.

    Registers a new row in ``eval_runs``, builds one ``Task`` per row returned
    by the task query, and keeps up to ``opts.n_workers`` ``work`` jobs in
    flight, always dispatching to the server with the fewest running jobs.
    A task whose job fails, or whose dispatch fails, is put back on the queue
    and retried; there is no retry limit.

    Args:
        opts: options object providing ``train_id``, ``checkpoint``,
            ``n_gpus``, ``n_workers``, ``timeout_s`` and ``task_query``.

    Returns:
        None, once the queue is empty and no jobs remain.
    """
    eval_id = util.db_insert(table='eval_runs', train_id=opts.train_id, checkpoint=opts.checkpoint,
                             git_commit=util.get_commit(), n_gpus=opts.n_gpus, n_workers=opts.n_workers)
    tasks = [Task(eval_id=eval_id, problem_id=task['problem_id'], solver_id=task['solver_id'], timeout_s=opts.timeout_s)
             for task in util.db_lookup_many(query=opts.task_query)]

    util.log(kind='info', author='eval-head', msg='found %d tasks' % len(tasks), expensive=True)

    q = Queue()
    for task in tasks:
        q.put(task)

    # NOTE(review): the host part of this URI looks like it was mangled by
    # e-mail obfuscation somewhere upstream -- confirm the real address.
    servers = ["%s:%s" % p
               for p in itertools.product(["PYRO:[email protected]%d" % i for i in range(1, 6)],
                                          ["909%d" % i for i in range(2, 6)])]

    util.log(kind='info', author='eval-head', msg="servers:\n%s" % str(servers))
    server_to_n_jobs = {server: 0 for server in servers}
    job_to_server = {}
    job_to_task = {}
    jobs = []

    while jobs or not q.empty():
        try:
            # Top up the set of in-flight jobs.
            while len(jobs) < opts.n_workers and not q.empty():
                task = q.get()
                try:
                    # Least-loaded server; ties resolve to the first in ``servers``.
                    server = min(servers, key=server_to_n_jobs.get)
                    job = work.remote(server=server, task=task)
                except Exception:
                    # The task has already left the queue; put it back so a
                    # failed dispatch does not silently drop it.
                    util.log(kind='error', author='eval-head', msg="RE-ENQUEUING TASK: %s" % str(task))
                    q.put(task)
                    raise
                server_to_n_jobs[server] += 1
                job_to_server[job] = server
                job_to_task[job] = task
                jobs.append(job)

            if jobs:
                # Block until one job finishes, then drop it from the bookkeeping.
                job = ray.wait(jobs, num_returns=1)[0][0]
                task = job_to_task.pop(job)
                server_to_n_jobs[job_to_server.pop(job)] -= 1
                jobs.remove(job)

                try:
                    _ = ray.get(job)
                except Exception as e:
                    util.log(kind='error', author='eval-head', msg="RE-ENQUEUING TASK: %s" % str(task))
                    q.put(task)
                    tb = traceback.format_exc()
                    util.log(kind='error', author='eval-head', msg="ERROR: %s" % str(e))
                    util.log(kind='error', author='eval-head', msg="TRACEBACK: %s" % tb)
        except Exception as e:
            # Best effort: log and keep scheduling the remaining work.
            util.log(kind='error', author='eval-head', msg="OUTER-EXCEPTION: %s" % str(e))
Exemplo n.º 5
0
def main(opts, cluster_spec, cfg):
    """Run one process of a distributed TensorFlow training job.

    A ``ps`` process starts its server and blocks in ``server.join()``.
    A ``worker`` process builds the input pipeline and the loss graph, opens a
    ``MonitoredTrainingSession`` and loops forever running one step at a time,
    pushing per-step statistics onto a queue consumed by ``TrainPostThread``.

    Errors raised by a step are logged and the loop continues.
    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not caught, so
    the process can be interrupted and the session closed cleanly.

    Args:
        opts: options providing ``job_name`` ("ps" or "worker") and
            ``task_index``.
        cluster_spec: value passed to ``tf.train.ClusterSpec``.
        cfg: configuration mapping; the keys read here are ``gd_id``,
            ``train_id``, ``n_parallel_reads``, ``n_parallel_calls``,
            ``max_n_nodes``, ``n_prefetch`` and the three ``*_loss_scale``
            entries (it is also passed whole to helper builders).

    Raises:
        Exception: if ``server.join()`` ever returns for a ``ps`` process.
        AssertionError: if ``opts.job_name`` is neither "ps" nor "worker".
    """
    util.log(author='%s:%d' % (opts.job_name, opts.task_index),
             msg='starting @ %s' % util.get_hostname(expensive=True))
    cluster = tf.train.ClusterSpec(cluster_spec)
    server = tf.train.Server(cluster,
                             job_name=opts.job_name,
                             task_index=opts.task_index)

    if opts.job_name == "ps":
        util.log(author='%s:%d' % (opts.job_name, opts.task_index),
                 msg='joining server')
        server.join()
        raise Exception("Expecting server.join() to block forever")

    assert (opts.job_name == "worker")
    is_chief = (opts.task_index == 0)

    # Statistics are posted by a separate thread fed through this queue.
    outqueue = Queue()
    train_post_thread = TrainPostThread(cfg, outqueue)
    train_post_thread.start()

    with tf.device("/job:ps/task:0"):
        params = NeuroSATParams(cfg=cfg)

    with tf.device(
            tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % opts.task_index,
                cluster=cluster)):
        # Input pipeline: every file in the TFRecord directory for this gd_id.
        filenames = [
            os.path.join(util.gd_tfr_dir(gd_id=cfg['gd_id']), x)
            for x in os.listdir(util.gd_tfr_dir(gd_id=cfg['gd_id']))
        ]
        dataset = tf.data.TFRecordDataset(
            filenames=filenames,
            compression_type="GZIP",
            num_parallel_reads=cfg['n_parallel_reads'])
        # TODO(dselsam): don't hardcode the number of shards
        # idea: extend cluster_spec to map (job, task) -> (n_shards, shard_idx)
        dataset = dataset.shard(num_shards=4, index=opts.task_index % 4)
        dataset = dataset.map(example_to_tftd,
                              num_parallel_calls=cfg['n_parallel_calls'])
        # Drop examples whose 2 * n_vars + n_clauses reaches max_n_nodes.
        dataset = dataset.filter(
            lambda tftd: 2 * tftd.n_vars + tftd.n_clauses < cfg['max_n_nodes'])
        dataset = dataset.repeat()
        dataset = dataset.prefetch(cfg['n_prefetch'])

        tftd = dataset.make_one_shot_iterator().get_next()

        args = NeuroSATArgs(n_vars=tftd.n_vars,
                            n_clauses=tftd.n_clauses,
                            CL_idxs=tftd.CL_idxs)
        guesses = apply_neurosat(cfg=cfg, params=params, args=args)

        # Targets are the core masks normalized to sum to one.
        # NOTE(review): an all-zero mask would divide by zero here -- confirm
        # the data generator never emits one.
        pi_v_targets = tf.cast(tftd.core_var_mask, tf.float32)
        pi_v_targets = pi_v_targets / tf.reduce_sum(pi_v_targets)

        pi_c_targets = tf.cast(tftd.core_clause_mask, tf.float32)
        pi_c_targets = pi_c_targets / tf.reduce_sum(pi_c_targets)

        cv_loss = cfg['cv_loss_scale'] * tfutil.kldiv(
            logits=guesses.pi_core_var_logits, labels=pi_v_targets)
        cc_loss = cfg['cc_loss_scale'] * tfutil.kldiv(
            logits=guesses.pi_core_clause_logits, labels=pi_c_targets)
        l2_loss = cfg['l2_loss_scale'] * tfutil.build_l2_loss()
        loss = cv_loss + cc_loss + l2_loss

        stats = Stats(dp_id=tftd.dp_id,
                      cv_loss=cv_loss,
                      cc_loss=cc_loss,
                      l2_loss=l2_loss)

        global_step = tf.train.get_or_create_global_step()
        learning_rate = tfutil.build_learning_rate(cfg, global_step)

        # Gradients are applied only for records flagged is_train; other
        # records take the constant branch and are evaluated for stats only.
        apply_grads = tf.cond(
            tftd.is_train, lambda: tfutil.build_apply_gradients(
                cfg, loss, learning_rate, global_step), lambda: True)

    util.log(author='%s:%d' % (opts.job_name, opts.task_index),
             msg='creating session (train_id=%d)...' % cfg['train_id'])
    with tf.train.MonitoredTrainingSession(
            master=server.target,
            is_chief=is_chief,
            checkpoint_dir=util.checkpoint_dir(
                train_id=cfg['train_id'])) as mon_sess:
        util.log(author='%s:%d' % (opts.job_name, opts.task_index),
                 msg='starting session loop')
        while True:
            try:
                (_, stats_v), n_secs = util.timeit(mon_sess.run,
                                                   [apply_grads, stats])
                outqueue.put(
                    tuple(map(util.de_numpify, stats_v)) +
                    (cfg['train_id'], n_secs))
            except tf.errors.ResourceExhaustedError as e:
                # Out-of-memory steps are additionally recorded in tune_ooms.
                tb = traceback.format_exc()
                util.log(kind='error',
                         author='train',
                         msg="RESOURCE_EXHAUSTED\n%s\n%s" % (str(e), tb))
                util.db_insert(table='tune_ooms', train_id=cfg['train_id'])
            except tf.errors.OpError as e:
                tb = traceback.format_exc()
                util.log(kind='error',
                         author='train',
                         msg="OP_ERROR\n%s\n%s" % (str(e), tb))
            except Exception as e:
                # Best effort: log and keep training. BaseException subclasses
                # such as KeyboardInterrupt/SystemExit propagate on purpose.
                tb = traceback.format_exc()
                util.log(kind='error',
                         author='train',
                         msg="EXCEPTION\n%s\n%s" % (str(e), tb))
Exemplo n.º 6
0
    # Path of the JSON cluster specification; the parsed content is handed to
    # main() as ``cluster_spec``.
    parser.add_argument(
        "--cluster_spec_path",
        action="store",
        dest='cluster_spec_path',
        type=str,
        default='/home/dselsam/neurocore/configs/train/cluster_spec.json')
    opts = parser.parse_args()
    with open(opts.cluster_spec_path, 'r') as f:
        cluster_spec = json.load(f)
    with open(opts.cfg_path, 'r') as f:
        cfg = json.load(f)
    # The TFRecord directory for this gd_id must already exist locally.
    assert (os.path.exists(util.gd_tfr_dir(gd_id=cfg['gd_id'])))

    # Exactly one process (ps task 0) registers the run; its row in
    # ``train_runs`` is built from every cfg entry plus the git commit.
    if opts.job_name == "ps" and opts.task_index == 0:
        cfg['train_id'] = util.db_insert(table='train_runs',
                                         **cfg,
                                         git_commit=util.get_commit())
    else:
        # Every other process waits five seconds and then adopts the largest
        # train_id in the table.
        # NOTE(review): this assumes ps:0 has inserted its row by then and
        # that no other run was registered in between -- confirm.
        time.sleep(5)
        cfg['train_id'] = util.db_query_one(
            'select max(train_id) as train_id from train_runs')['train_id']

    try:
        main(opts, cluster_spec=cluster_spec, cfg=cfg)
    except tf.errors.OpError as e:
        # TensorFlow op errors escaping main() are logged, not re-raised.
        tb = traceback.format_exc()
        util.log(kind='error',
                 author='train',
                 msg="OP_ERROR\n%s\n%s" % (str(e), tb))
    except Exception as e:
        tb = traceback.format_exc()
Exemplo n.º 7
0
def gen_all_data(opts):
    """Generate a dataset by running ``gen_data_for`` jobs over all problems.

    Registers a ``gd_runs`` row (its id is stored on ``opts.gd_id``), creates
    the scratch and TFRecord blob containers, then runs three activities:

    * a background thread that downloads every selected problem, serializes
      it to a blob and pushes a root task onto a priority queue;
    * a background thread that turns finished task results into ``gd_dps``
      rows and written datapoints;
    * the main loop, which keeps up to ``opts.n_jobs_at_once`` remote jobs
      running, re-queues failed tasks with a low priority and pushes child
      tasks for any new sub-problems a job returns.

    The main loop has no normal exit; it ends only when an exception
    propagates, after which the writer is finalized and the scratch container
    is deleted.

    Args:
        opts: options object; the attributes read here include
            ``wait_n_secs``, ``n_jobs_at_once``, ``n_tfrs_per_file``,
            ``max_n_nodes_train``, ``max_n_nodes_test``, ``find_max_tries``,
            ``find_percent_to_keep``, ``limit`` and ``timeout_ms``.
            ``gd_id`` is set on it as a side effect.
    """
    tftdw = TFTDWriter(opts)
    tc = TaskIDCounter()
    bbs = BlockBlobService(account_name=auth.store_name(),
                           account_key=auth.store_key())
    task_pq = PriorityQueue()
    jobs = []
    job_to_task = {}

    setattr(
        opts, 'gd_id',
        util.db_insert(table='gd_runs',
                       git_commit=util.get_commit(),
                       wait_n_secs=opts.wait_n_secs,
                       n_jobs_at_once=opts.n_jobs_at_once,
                       n_tfrs_per_file=opts.n_tfrs_per_file,
                       max_n_nodes_train=opts.max_n_nodes_train,
                       max_n_nodes_test=opts.max_n_nodes_test,
                       find_max_tries=opts.find_max_tries,
                       find_percent_to_keep=opts.find_percent_to_keep,
                       query_limit=opts.limit,
                       timeout_ms=opts.timeout_ms))

    # A fresh gd_id must not collide with existing containers.
    assert (not bbs.exists(util.gd_scratch_bcname(gd_id=opts.gd_id)))
    assert (not bbs.exists(util.gd_tfr_bcname(gd_id=opts.gd_id)))

    bbs.create_container(util.gd_scratch_bcname(gd_id=opts.gd_id))
    bbs.create_container(util.gd_tfr_bcname(gd_id=opts.gd_id))

    def launch_task(task):
        """Start a remote job for ``task`` and remember which task it runs."""
        job = gen_data_for.remote(opts, task)
        jobs.append(job)
        job_to_task[job] = task

    def push_task(task, prio=None):
        """Queue ``task``; the default priority is its node id (lower first).

        NOTE(review): entries with equal priority are ordered by comparing
        the Task objects themselves -- presumably Task is orderable; verify.
        """
        if prio is None: prio = task.id.node_id
        task_pq.put_nowait((prio, task))

    def reload_jobs():
        """Launch queued tasks until ``opts.n_jobs_at_once`` jobs are running."""
        while not task_pq.empty() and len(jobs) < opts.n_jobs_at_once:
            launch_task(task_pq.get_nowait()[1])

    def push_problems():
        """Fetch all train and test problems and push one root task for each."""
        util.log(author='push_problems', msg='starting')
        problem_infos = []
        for is_train in [True, False]:
            conn = util._connect()
            try:
                with conn.cursor() as cursor:
                    cursor.execute(mk_query(opts=opts, is_train=is_train))
                    # The cursor's iterator is consumed directly; no
                    # intermediate list is needed.
                    problem_infos.extend([
                        (is_train, result)
                        for result in cursor.fetchall_unbuffered()
                    ])
            finally:
                conn.close()
        util.log(author='push_problems',
                 msg='found %d problems' % len(problem_infos))

        for is_train, info in problem_infos:
            with tempfile.TemporaryDirectory() as tmpdir:
                tmpfilename = os.path.join(tmpdir,
                                           "%s.dimacs" % str(uuid.uuid4()))
                bbs.get_blob_to_path(info['bcname'], info['bname'],
                                     tmpfilename)
                s = solver.Solver(solver.Context(), solver.Options())
                s.from_file(tmpfilename)
                # The file is no longer needed once parsed; delete it without
                # going through a shell.
                os.remove(tmpfilename)
                task = Task(id=tc.fresh_id(info['problem_id'],
                                           is_train=is_train),
                            bcnf=to_blob(opts, bbs, s.serialize()))
                assert (task.id.problem_id == info['problem_id'])
                push_task(task)

        util.log(author='push_problems', msg='pushed all problems')

    push_problems_thread = threading.Thread(target=push_problems, args=())
    push_problems_thread.start()

    def get_ready_job():
        """Block until some running job has finished; return ``(job, task)``.

        Tops up the running jobs on every iteration and polls once per
        second while nothing is ready.
        """
        while True:
            reload_jobs()
            if jobs:
                ready_jobs, _ = ray.wait(jobs,
                                         num_returns=1,
                                         timeout=opts.wait_n_secs)
                if ready_jobs:
                    job = ready_jobs[0]
                    jobs.remove(job)
                    assert (job in job_to_task)
                    task = job_to_task[job]
                    del job_to_task[job]
                    return job, task
            time.sleep(1)

    task_result_q = Queue()

    def process_task_result():
        """Forever: record and write the datapoints of each finished task."""
        while True:
            task, task_result = task_result_q.get()
            # The task's input blob is no longer needed once it has a result.
            delete_blob(opts, bbs, task.bcnf)

            for btfd in task_result.btfds:
                tfd = from_blob(opts, bbs, btfd, delete=True)
                assert (tfd.n_vars > 0)
                assert (tfd.n_clauses > 0)
                dp_id = util.db_insert(
                    table='gd_dps',
                    gd_id=opts.gd_id,
                    problem_id=task.id.problem_id,
                    node_id=task.id.node_id,
                    node_depth=task.id.node_depth,
                    is_train=task.id.is_train,
                    n_vars=tfd.n_vars,
                    n_clauses=tfd.n_clauses,
                    n_cells=np.shape(tfd.CL_idxs)[0],
                    percent_vars_in_core=float(
                        np.mean(tfd.core_var_mask.astype(np.float32))),
                    percent_clauses_in_core=float(
                        np.mean(tfd.core_clause_mask.astype(np.float32))))
                tftdw.write_tftd(tftd=tfd_to_tftd(
                    dp_id=dp_id, is_train=task.id.is_train, tfd=tfd))

    # NOTE(review): this thread loops forever and is never joined or marked
    # as a daemon, so the process presumably stays alive after the main loop
    # ends, and finalization below may race with it -- confirm intent.
    process_results_thread = threading.Thread(target=process_task_result,
                                              args=())
    process_results_thread.start()

    try:
        while True:
            job, task = get_ready_job()
            try:
                task_result = ray.get(job)
            except Exception as e:
                tb = traceback.format_exc()
                util.log(kind='error',
                         author='remote-worker',
                         msg="TASK-ID: %s\n%s\n%s" %
                         (str(task.id), str(e), tb))
                # Retry the failed task, but behind every normally queued one.
                push_task(task, prio=1000000)
                continue

            if task_result.new_bcnfs:
                # Allocate all child ids first, then queue one task per child.
                child_ids = [
                    tc.next_child_id(task.id) for _ in task_result.new_bcnfs
                ]
                for child_id, child_bcnf in zip(child_ids,
                                                task_result.new_bcnfs):
                    push_task(Task(id=child_id, bcnf=child_bcnf))

            task_result_q.put((task, task_result))

    except Exception as e:
        tb = traceback.format_exc()
        util.log(kind='error',
                 author='master',
                 msg="FAILING\n%s\n%s" % (str(e), tb))
        print("Exception: ", e)
        print("Failing...")
    finally:
        print("Finally...")
        util.log(kind='info', author='master', msg="finalizing")
        tftdw.finalize()
        util.log(kind='info',
                 author='master',
                 msg="deleting scratch blob container")
        bbs.delete_container(util.gd_scratch_bcname(gd_id=opts.gd_id))
        util.log(kind='info', author='master', msg="finished")
        print("All done!")