def work(server, task):
    # Evaluation worker: fetch the problem's DIMACS file from blob storage, run the
    # configured solver against the given query server, and record the outcome.
    sinfo = util.db_lookup_one(table='eval_solvers', kvs={"solver_id": task.solver_id})
    pinfo = util.db_lookup_one(table='sat_problems', kvs={"problem_id": task.problem_id})
    bbs = BlockBlobService(account_name=auth.store_name(), account_key=auth.store_key())

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpfilename = os.path.join(tmpdir, "%s.dimacs" % str(uuid.uuid4()))
        bbs.get_blob_to_path(pinfo['bcname'], pinfo['bname'], tmpfilename)
        outfilename = os.path.join(tmpdir, "results.out")
        status, n_secs_user, n_calls, n_fails, n_secs_gpu = call_solver(
            server=server, sinfo=sinfo, dimacs=tmpfilename,
            timeout_s=task.timeout_s, outfilename=outfilename)

    assert status in ["UNSAT", "SAT", "UNKNOWN"]
    util.db_insert(table="eval_problems", eval_id=task.eval_id, problem_id=task.problem_id,
                   solver_id=task.solver_id, timeout_s=task.timeout_s,
                   n_secs_user=n_secs_user, n_calls=n_calls, n_fails=n_fails,
                   n_secs_gpu=n_secs_gpu, status=status)
    return None
def work(server, task):
    # Evaluation worker (neuro-guided solver): connect to a NeuroSAT query server over
    # Pyro, wrap it as the guess function for a NeuroSolver, solve the problem under a
    # timeout, and record the results.
    util.log(kind='info', author='eval-work',
             msg='starting on %d:%d:%s' % (task.problem_id, task.solver_id, server))
    util.set_pyro_config()
    proxy = Pyro4.Proxy(server)
    util.log(kind='info', author='eval-work', msg='connected to %s' % server)

    def query(nm_args):
        # Forward a query from the solver to the remote NeuroSAT server; on any
        # failure, fall back to "failed to guess" so the solver can proceed unguided.
        try:
            args = NeuroSATArgs(n_vars=nm_args.n_vars, n_clauses=nm_args.n_clauses,
                                CL_idxs=nm_args.CL_idxs)
            guesses = proxy.query(args)
            if guesses is None:
                return neurominisat.neurosat_failed_to_guess()
            else:
                return neurominisat.NeuroSATGuesses(
                    n_secs_gpu=guesses['n_secs_gpu'],
                    pi_core_var_logits=guesses['pi_core_var_logits'])
        except Exception as e:
            tb = traceback.format_exc()
            util.log(kind='error', author='query',
                     msg="TASK: %s\n%s\n%s" % (str(task), str(e), tb))
            return neurominisat.neurosat_failed_to_guess()

    sinfo = util.db_lookup_one(table='eval_solvers', kvs={"solver_id": task.solver_id})
    s = neurominisat.NeuroSolver(func=query, cfg=build_neurosat_config(sinfo))
    pinfo = util.db_lookup_one(table='sat_problems', kvs={"problem_id": task.problem_id})
    bbs = BlockBlobService(account_name=auth.store_name(), account_key=auth.store_key())

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpfilename = os.path.join(tmpdir, "%s.dimacs" % str(uuid.uuid4()))
        bbs.get_blob_to_path(pinfo['bcname'], pinfo['bname'], tmpfilename)
        s.from_file(filename=tmpfilename)

    results = s.check_with_timeout_s(timeout_s=task.timeout_s)
    util.db_insert(table="eval_problems", eval_id=task.eval_id, problem_id=task.problem_id,
                   solver_id=task.solver_id, timeout_s=task.timeout_s,
                   n_secs_user=results.n_secs_user, n_secs_call=results.n_secs_call,
                   n_secs_gpu=results.n_secs_gpu,
                   status=str(results.status).split(".")[1])
    return None
def evaluate_cdcl(opts):
    # Evaluation head: register the run, build one task per (problem, solver) pair,
    # and farm tasks out to Ray workers, always picking the least-loaded query server.
    # Failed tasks are re-enqueued.
    eval_id = util.db_insert(table='eval_runs', train_id=opts.train_id, checkpoint=opts.checkpoint,
                             git_commit=util.get_commit(), n_gpus=opts.n_gpus,
                             n_workers=opts.n_workers)
    tasks = [Task(eval_id=eval_id, problem_id=task['problem_id'], solver_id=task['solver_id'],
                  timeout_s=opts.timeout_s)
             for task in util.db_lookup_many(query=opts.task_query)]
    util.log(kind='info', author='eval-head', msg='found %d tasks' % len(tasks), expensive=True)

    q = Queue()
    for task in tasks:
        q.put(task)

    servers = ["%s:%s" % p for p in itertools.product(
        ["PYRO:[email protected]%d" % i for i in range(1, 6)],
        ["909%d" % i for i in range(2, 6)])]
    util.log(kind='info', author='eval-head', msg="servers:\n%s" % str(servers))

    server_to_n_jobs = {server: 0 for server in servers}
    job_to_server = {}
    job_to_task = {}
    jobs = []

    while jobs or not q.empty():
        try:
            # Launch new jobs until the worker pool is full or the queue is empty,
            # assigning each task to the server with the fewest outstanding jobs.
            while len(jobs) < opts.n_workers and not q.empty():
                task = q.get()
                server = min(servers, key=lambda server: server_to_n_jobs[server])
                job = work.remote(server=server, task=task)
                server_to_n_jobs[server] += 1
                job_to_server[job] = server
                job_to_task[job] = task
                jobs.append(job)
            if jobs:
                job = ray.wait(jobs, num_returns=1)[0][0]
                task = job_to_task[job]
                del job_to_task[job]
                server_to_n_jobs[job_to_server[job]] -= 1
                del job_to_server[job]
                jobs.remove(job)
                try:
                    _ = ray.get(job)
                except Exception as e:
                    util.log(kind='error', author='eval-head', msg="RE-ENQUEUING TASK: %s" % str(task))
                    q.put(task)
                    tb = traceback.format_exc()
                    util.log(kind='error', author='eval-head', msg="ERROR: %s" % str(e))
                    util.log(kind='error', author='eval-head', msg="TRACEBACK: %s" % tb)
        except Exception as e:
            util.log(kind='error', author='eval-head', msg="OUTER-EXCEPTION: %s" % str(e))
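# Hedged sketch (assumption, not part of the original source): evaluate_cdcl above
# dispatches tasks with work.remote(...), so `work` must be registered as a Ray
# remote function somewhere in this module, and Ray must be initialized on the
# driver. One minimal way to wire this up would be:
#
#     import ray
#     ray.init()                      # or ray.init(redis_address=...) on a cluster
#
#     @ray.remote
#     def work(server, task):
#         ...                         # body as defined above
#
# (equivalently, work = ray.remote(work) after the plain definition).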
def main(opts, cluster_spec, cfg):
    # Distributed training entry point: parameter servers block in server.join();
    # workers build the input pipeline and NeuroSAT graph, then run the training loop.
    util.log(author='%s:%d' % (opts.job_name, opts.task_index),
             msg='starting @ %s' % util.get_hostname(expensive=True))

    cluster = tf.train.ClusterSpec(cluster_spec)
    server = tf.train.Server(cluster, job_name=opts.job_name, task_index=opts.task_index)

    if opts.job_name == "ps":
        util.log(author='%s:%d' % (opts.job_name, opts.task_index), msg='joining server')
        server.join()
        raise Exception("Expecting server.join() to block forever")

    assert opts.job_name == "worker"
    is_chief = (opts.task_index == 0)

    outqueue = Queue()
    train_post_thread = TrainPostThread(cfg, outqueue)
    train_post_thread.start()

    with tf.device("/job:ps/task:0"):
        params = NeuroSATParams(cfg=cfg)

    with tf.device(tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % opts.task_index, cluster=cluster)):
        filenames = [os.path.join(util.gd_tfr_dir(gd_id=cfg['gd_id']), x)
                     for x in os.listdir(util.gd_tfr_dir(gd_id=cfg['gd_id']))]
        dataset = tf.data.TFRecordDataset(filenames=filenames,
                                          compression_type="GZIP",
                                          num_parallel_reads=cfg['n_parallel_reads'])
        # TODO(dselsam): don't hardcode the number of shards
        # idea: extend cluster_spec to map (job, task) -> (n_shards, shard_idx)
        dataset = dataset.shard(num_shards=4, index=opts.task_index % 4)
        dataset = dataset.map(example_to_tftd, num_parallel_calls=cfg['n_parallel_calls'])
        dataset = dataset.filter(lambda tftd: 2 * tftd.n_vars + tftd.n_clauses < cfg['max_n_nodes'])
        dataset = dataset.repeat()
        dataset = dataset.prefetch(cfg['n_prefetch'])
        tftd = dataset.make_one_shot_iterator().get_next()

        args = NeuroSATArgs(n_vars=tftd.n_vars, n_clauses=tftd.n_clauses, CL_idxs=tftd.CL_idxs)
        guesses = apply_neurosat(cfg=cfg, params=params, args=args)

        # Normalize the core masks into target distributions for the KL losses.
        pi_v_targets = tf.cast(tftd.core_var_mask, tf.float32)
        pi_v_targets = pi_v_targets / tf.reduce_sum(pi_v_targets)
        pi_c_targets = tf.cast(tftd.core_clause_mask, tf.float32)
        pi_c_targets = pi_c_targets / tf.reduce_sum(pi_c_targets)

        cv_loss = cfg['cv_loss_scale'] * tfutil.kldiv(logits=guesses.pi_core_var_logits,
                                                      labels=pi_v_targets)
        cc_loss = cfg['cc_loss_scale'] * tfutil.kldiv(logits=guesses.pi_core_clause_logits,
                                                      labels=pi_c_targets)
        l2_loss = cfg['l2_loss_scale'] * tfutil.build_l2_loss()
        loss = cv_loss + cc_loss + l2_loss

        stats = Stats(dp_id=tftd.dp_id, cv_loss=cv_loss, cc_loss=cc_loss, l2_loss=l2_loss)

        global_step = tf.train.get_or_create_global_step()
        learning_rate = tfutil.build_learning_rate(cfg, global_step)
        # Only apply gradients on training datapoints; test datapoints just compute stats.
        apply_grads = tf.cond(tftd.is_train,
                              lambda: tfutil.build_apply_gradients(cfg, loss, learning_rate, global_step),
                              lambda: True)

    util.log(author='%s:%d' % (opts.job_name, opts.task_index),
             msg='creating session (train_id=%d)...' % cfg['train_id'])

    with tf.train.MonitoredTrainingSession(
            master=server.target, is_chief=is_chief,
            checkpoint_dir=util.checkpoint_dir(train_id=cfg['train_id'])) as mon_sess:
        util.log(author='%s:%d' % (opts.job_name, opts.task_index), msg='starting session loop')
        step = 0
        while True:
            try:
                # Run one step and post the timed stats to the writer thread.
                (_, stats_v), n_secs = util.timeit(mon_sess.run, [apply_grads, stats])
                outqueue.put(tuple(map(util.de_numpify, stats_v)) + (cfg['train_id'], n_secs))
                step += 1
            except tf.errors.ResourceExhaustedError as e:
                tb = traceback.format_exc()
                util.log(kind='error', author='train',
                         msg="RESOURCE_EXHAUSTED\n%s\n%s" % (str(e), tb))
                util.db_insert(table='tune_ooms', train_id=cfg['train_id'])
            except tf.errors.OpError as e:
                tb = traceback.format_exc()
                util.log(kind='error', author='train', msg="OP_ERROR\n%s\n%s" % (str(e), tb))
            except Exception as e:
                tb = traceback.format_exc()
                util.log(kind='error', author='train', msg="EXCEPTION\n%s\n%s" % (str(e), tb))
            except:
                tb = traceback.format_exc()
                util.log(kind='error', author='train', msg="UNKNOWN\n%s" % tb)
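# Hedged sketch (assumption, not part of the original source): the session loop above
# unpacks (result, n_secs) from util.timeit(mon_sess.run, [apply_grads, stats]), so
# util.timeit presumably times a call and returns its result together with the elapsed
# wall-clock seconds. A minimal compatible helper would look like:

def timeit_sketch(f, *args, **kwargs):
    import time
    start = time.time()
    result = f(*args, **kwargs)      # e.g. mon_sess.run([apply_grads, stats])
    return result, time.time() - start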
parser.add_argument("--cluster_spec_path", action="store", dest='cluster_spec_path', type=str,
                    default='/home/dselsam/neurocore/configs/train/cluster_spec.json')
opts = parser.parse_args()

with open(opts.cluster_spec_path, 'r') as f:
    cluster_spec = json.load(f)
with open(opts.cfg_path, 'r') as f:
    cfg = json.load(f)

assert os.path.exists(util.gd_tfr_dir(gd_id=cfg['gd_id']))

if opts.job_name == "ps" and opts.task_index == 0:
    cfg['train_id'] = util.db_insert(table='train_runs', **cfg, git_commit=util.get_commit())
else:
    time.sleep(5)
    cfg['train_id'] = util.db_query_one(
        'select max(train_id) as train_id from train_runs')['train_id']

try:
    main(opts, cluster_spec=cluster_spec, cfg=cfg)
except tf.errors.OpError as e:
    tb = traceback.format_exc()
    util.log(kind='error', author='train', msg="OP_ERROR\n%s\n%s" % (str(e), tb))
except Exception as e:
    tb = traceback.format_exc()
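# Hedged sketch (assumption, not part of the original source): tf.train.ClusterSpec
# accepts a dict mapping job names to "host:port" lists, so cluster_spec.json is
# presumably of the form below (addresses are illustrative only); each process is then
# launched with a matching --job_name and --task_index.

example_cluster_spec = {
    "ps":     ["10.0.0.1:2222"],
    "worker": ["10.0.0.2:2222", "10.0.0.3:2222"],
}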
def gen_all_data(opts):
    # Data-generation master: register the run, stream problems from the database into
    # a priority queue, dispatch them to Ray workers (gen_data_for), write the resulting
    # datapoints to TFRecords, and re-enqueue each worker's child subproblems.
    tftdw = TFTDWriter(opts)
    tc = TaskIDCounter()
    bbs = BlockBlobService(account_name=auth.store_name(), account_key=auth.store_key())
    task_pq = PriorityQueue()
    jobs = []
    job_to_task = {}

    setattr(opts, 'gd_id',
            util.db_insert(table='gd_runs', git_commit=util.get_commit(),
                           wait_n_secs=opts.wait_n_secs,
                           n_jobs_at_once=opts.n_jobs_at_once,
                           n_tfrs_per_file=opts.n_tfrs_per_file,
                           max_n_nodes_train=opts.max_n_nodes_train,
                           max_n_nodes_test=opts.max_n_nodes_test,
                           find_max_tries=opts.find_max_tries,
                           find_percent_to_keep=opts.find_percent_to_keep,
                           query_limit=opts.limit,
                           timeout_ms=opts.timeout_ms))

    assert not bbs.exists(util.gd_scratch_bcname(gd_id=opts.gd_id))
    assert not bbs.exists(util.gd_tfr_bcname(gd_id=opts.gd_id))
    bbs.create_container(util.gd_scratch_bcname(gd_id=opts.gd_id))
    bbs.create_container(util.gd_tfr_bcname(gd_id=opts.gd_id))

    def launch_task(task):
        # Dispatch a task to a Ray worker and track it.
        job = gen_data_for.remote(opts, task)
        jobs.append(job)
        job_to_task[job] = task

    def push_task(task, prio=None):
        # Enqueue a task; by default shallower nodes (smaller node_id) come first.
        if prio is None:
            prio = task.id.node_id
        task_pq.put_nowait((prio, task))

    def reload_jobs():
        # Top up the pool of in-flight jobs from the priority queue.
        while not task_pq.empty() and len(jobs) < opts.n_jobs_at_once:
            launch_task(task_pq.get_nowait()[1])

    def push_problems():
        # Producer thread: query problems from the DB, load and serialize each CNF,
        # and push the root tasks onto the queue.
        util.log(author='push_problems', msg='starting')
        problem_infos = []
        for is_train in [True, False]:
            conn = util._connect()
            try:
                with conn.cursor() as cursor:
                    cursor.execute(mk_query(opts=opts, is_train=is_train))
                    problem_infos.extend([(is_train, result)
                                          for result in list(cursor.fetchall_unbuffered())])
            finally:
                conn.close()
        util.log(author='push_problems', msg='found %d problems' % len(problem_infos))
        for is_train, info in problem_infos:
            with tempfile.TemporaryDirectory() as tmpdir:
                tmpfilename = os.path.join(tmpdir, "%s.dimacs" % str(uuid.uuid4()))
                bbs.get_blob_to_path(info['bcname'], info['bname'], tmpfilename)
                s = solver.Solver(solver.Context(), solver.Options())
                s.from_file(tmpfilename)
                os.system('rm %s' % tmpfilename)
                task = Task(id=tc.fresh_id(info['problem_id'], is_train=is_train),
                            bcnf=to_blob(opts, bbs, s.serialize()))
                assert task.id.problem_id == info['problem_id']
                push_task(task)
        util.log(author='push_problems', msg='pushed all problems')

    push_problems_thread = threading.Thread(target=push_problems, args=())
    push_problems_thread.start()

    def get_ready_job():
        # Wait for any outstanding Ray job to finish, topping up the pool as slots free up.
        while True:
            reload_jobs()
            if jobs:
                ready_jobs, _ = ray.wait(jobs, num_returns=1, timeout=opts.wait_n_secs)
                if ready_jobs:
                    job = ready_jobs[0]
                    jobs.remove(job)
                    assert job in job_to_task
                    task = job_to_task[job]
                    del job_to_task[job]
                    return job, task
            time.sleep(1)

    task_result_q = Queue()

    def process_task_result():
        # Consumer thread: record each returned datapoint in the DB and write it to TFRecords.
        while True:
            task, task_result = task_result_q.get()
            delete_blob(opts, bbs, task.bcnf)
            for btfd in task_result.btfds:
                tfd = from_blob(opts, bbs, btfd, delete=True)
                assert tfd.n_vars > 0
                assert tfd.n_clauses > 0
                dp_id = util.db_insert(
                    table='gd_dps', gd_id=opts.gd_id,
                    problem_id=task.id.problem_id, node_id=task.id.node_id,
                    node_depth=task.id.node_depth, is_train=task.id.is_train,
                    n_vars=tfd.n_vars, n_clauses=tfd.n_clauses,
                    n_cells=np.shape(tfd.CL_idxs)[0],
                    percent_vars_in_core=float(np.mean(tfd.core_var_mask.astype(np.float32))),
                    percent_clauses_in_core=float(np.mean(tfd.core_clause_mask.astype(np.float32))))
                tftdw.write_tftd(tftd=tfd_to_tftd(dp_id=dp_id, is_train=task.id.is_train, tfd=tfd))

    process_results_thread = threading.Thread(target=process_task_result, args=())
    process_results_thread.start()

    try:
        while True:
            job, task = get_ready_job()
            try:
                task_result = ray.get(job)
            except Exception as e:
                # Failed tasks are re-enqueued at the lowest priority.
                tb = traceback.format_exc()
                util.log(kind='error', author='remote-worker',
                         msg="TASK-ID: %s\n%s\n%s" % (str(task.id), str(e), tb))
                push_task(task, prio=1000000)
                continue
            if task_result.new_bcnfs:
                child_ids = [tc.next_child_id(task.id) for _ in task_result.new_bcnfs]
                for child_id, child_bcnf in zip(child_ids, task_result.new_bcnfs):
                    push_task(Task(id=child_id, bcnf=child_bcnf))
            task_result_q.put((task, task_result))
    except Exception as e:
        tb = traceback.format_exc()
        util.log(kind='error', author='master', msg="FAILING\n%s\n%s" % (str(e), tb))
        print("Exception: ", e)
        print("Failing...")
    finally:
        print("Finally...")
        util.log(kind='info', author='master', msg="finalizing")
        tftdw.finalize()
        util.log(kind='info', author='master', msg="deleting scratch blob container")
        bbs.delete_container(util.gd_scratch_bcname(gd_id=opts.gd_id))
        util.log(kind='info', author='master', msg="finished")
        print("All done!")
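# Hedged sketch (assumption, not part of the original source): the fields accessed in
# gen_all_data suggest that the data-generation Task / TaskResult are simple record
# types along these lines (field meanings inferred from usage):
#
#     from collections import namedtuple
#
#     TaskID     = namedtuple('TaskID', ['problem_id', 'node_id', 'node_depth', 'is_train'])
#     Task       = namedtuple('Task', ['id', 'bcnf'])                 # bcnf: blob name of the serialized CNF
#     TaskResult = namedtuple('TaskResult', ['btfds', 'new_bcnfs'])   # datapoint blobs / child CNF blobs
#
# with gen_data_for being a ray.remote function taking (opts, task) and returning a
# TaskResult. (The evaluation Task in evaluate_cdcl is a different record with
# eval_id / problem_id / solver_id / timeout_s fields.)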