def dump(self, experiment_path):
    if exists(experiment_path):
        logger.warning("File exists: {}".format(experiment_path))

    experiment_name = basename(experiment_path)
    folder = abspath(dirname(experiment_path))
    problem_paths = []
    for i, prob in enumerate(self.problems):
        problem_filename = Experiment.PROBLEM_TEMPL.format(experiment_name, i)
        problem_path = join(folder, problem_filename)
        prob.dump(problem_path)
        problem_paths.append(relpath(problem_path, folder))

    index = {'problem_paths': problem_paths}
    if self.metadata is not None:
        index['metadata'] = self.metadata

    with open(experiment_path, 'wb') as f:
        pickle.dump(index, f)

    return experiment_path
def read(path):
    with open(path, 'rb') as f:
        index = pickle.load(f)

    folder = abspath(dirname(path))
    model = Model.read(join(folder, index['model_path']))
    datasets = {}
    for dataset_name, rel_path in index['dataset_paths'].items():
        datasets[dataset_name] = Dataset.read(join(folder, rel_path))

    bounds = index['bounds'] if 'bounds' in index else None
    learned_supports = []
    metadata = None
    if 'support_paths' in index:
        for support_path in index['support_paths']:
            try:
                learned_supports.append(
                    read_smtlib(join(folder, support_path)))
            except FileNotFoundError:
                logger.warning("Couldn't read: {}".format(
                    join(folder, support_path)))

    if 'metadata' in index:
        metadata = index['metadata']

    problem = Problem(model,
                      datasets,
                      bounds=bounds,
                      learned_supports=learned_supports,
                      metadata=metadata)
    problem.original_path = path
    return problem
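
# A hedged usage sketch of the dump/read round trip above. The module path
# (wmilearn.experiment) and the file names are assumptions made for illustration;
# only Experiment.read, Experiment.dump and the Problem fields set above are taken
# from the code in this listing.
from wmilearn.experiment import Experiment  # assumed module path

experiment = Experiment.read("synthetic.exp")     # hypothetical experiment file
experiment.dump("synthetic_copy.exp")             # writes the index plus one file per problem
for problem in experiment.problems:
    print(problem.original_path, problem.metadata)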
def dump(self, dataset_path):
    if exists(dataset_path):
        logger.warning("File exists: {}".format(dataset_path))

    dataset_name = basename(dataset_path)
    feats_filename = Dataset.FEATS_TEMPL.format(dataset_name)
    data_filename = Dataset.DATA_TEMPL.format(dataset_name)
    constr_filename = Dataset.CONSTRAINTS_TEMPL.format(dataset_name)

    folder = abspath(dirname(dataset_path))
    feats_path = join(folder, feats_filename)
    data_path = join(folder, data_filename)
    constr_path = join(folder, constr_filename)

    self.dump_feats(feats_path)
    self.dump_data(data_path)

    index = {
        'feats_path': relpath(feats_path, folder),
        'data_path': relpath(data_path, folder)
    }

    if self.constraints is not None:
        write_smtlib(self.constraints, constr_path)
        index['constr_path'] = relpath(constr_path, folder)

    with open(dataset_path, 'wb') as f:
        pickle.dump(index, f)
def dump_data(self, data_path):
    if exists(data_path):
        logger.warning("File exists: {}".format(data_path))

    with open(data_path, 'w') as f:
        for row in self.data:
            str_row = ",".join(map(str, row))\
                .replace("True", "1")\
                .replace("False", "0") + "\n"
            f.write(str_row)
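
# Illustration of the row serialization used by dump_data: values are written
# comma-separated and booleans become 1/0. The sample row is made up.
row = [True, 3.5, False]
str_row = ",".join(map(str, row)).replace("True", "1").replace("False", "0")
print(str_row)   # -> "1,3.5,0"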
def run_experiment(experiment,
                   learner,
                   output_path,
                   seed,
                   n_samples,
                   global_norm,
                   timeout=None,
                   discard_missing=True):
    if timeout is None:
        timeout = DEF_TIMEOUT

    results = []
    n_discarded = 0
    for i, problem in enumerate(experiment.problems):
        learned_models, evaluation = run_problem(problem, learner, seed,
                                                 n_samples, timeout,
                                                 global_norm)

        missing_data = (evaluation['gt-renorm'] == {}
                        or evaluation['None'] == {}
                        or evaluation['best'] is None)

        if discard_missing and missing_data:
            continue

        if learned_models is not None:
            j = 0
            for t_mult, learned_model in learned_models:
                output_name = basename(output_path)
                folder = abspath(dirname(output_path))
                model_name = output_name + "_{}_learned_{}".format(i, j)
                model_path = join(folder, model_name)
                learned_model.dump(model_path)
                evaluation[t_mult]['model_path'] = model_path
                j += 1

        results.append(evaluation)

    n_dis = len(experiment.problems) - len(results)
    n_tot = len(experiment.problems)
    logger.info("Experiment done. Discarded {}/{}.".format(n_dis, n_tot))
    if len(results) > 0:
        with open(output_path, 'wb') as f:
            pickle.dump(results, f)
    else:
        logger.warning("Nothing to dump!")
def dump_feats(self, feats_path):
    if exists(feats_path):
        logger.warning("File exists: {}".format(feats_path))

    with open(feats_path, 'w') as f:
        for feature, str_type in self.features:
            # TODO?: MSPN datasets have a list of continuous values that is
            # not written here
            if feature.symbol_type() == REAL:
                assert (str_type in ['continuous', 'discrete'])
                f.write("{}:{}.\n".format(feature.symbol_name(), str_type))
            else:
                assert (str_type == 'categorical')
                f.write("{}:categorical:0,1.\n".format(
                    feature.symbol_name()))
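
# A hedged sketch of the lines dump_feats produces, using pysmt Symbols directly.
# The feature names and string types here are made up for illustration.
from pysmt.shortcuts import Symbol
from pysmt.typing import BOOL, REAL

features = [(Symbol("x", REAL), "continuous"), (Symbol("a", BOOL), "categorical")]
for feature, str_type in features:
    if feature.symbol_type() == REAL:
        print("{}:{}.".format(feature.symbol_name(), str_type))      # x:continuous.
    else:
        print("{}:categorical:0,1.".format(feature.symbol_name()))   # a:categorical:0,1.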
def support_generator(how_many, b_count, r_count, bias, k, lits, h,
                      sample_count, ratio_percent, error_percent, seed):
    prefix = "random_support"
    ratio, errors = ratio_percent / 100, error_percent / 100
    producer = Generator(b_count, r_count, bias, k, lits, h, sample_count,
                         ratio, seed, prefix)
    supports = []
    while len(supports) < how_many:
        try:
            chi = producer.generate_formula().support
        except RuntimeError:
            logger.warning("Runtime error while sampling the support")
            continue

        supports.append(chi)

    return supports
def dump(self, model_path):
    if exists(model_path):
        logger.warning("File exists: {}".format(model_path))

    model_name = basename(model_path)
    support_filename = Model.SUPPORT_TEMPL.format(model_name)
    weightf_filename = Model.WEIGHTF_TEMPL.format(model_name)

    folder = abspath(dirname(model_path))
    support_path = join(folder, support_filename)
    weightf_path = join(folder, weightf_filename)

    paths = [support_path, weightf_path]
    if any(exists(f) for f in paths):
        logger.warning("File(s) exist:\n" + "\n".join(paths))

    write_smtlib(self.support, support_path)
    write_smtlib(self.weightfun, weightf_path)

    varlist = [(v.symbol_name(), v.symbol_type()) for v in self.get_vars()]
    index = {
        'support_path': relpath(support_path, folder),
        'weightf_path': relpath(weightf_path, folder),
        'variables': varlist,
        'bounds': self.bounds
    }

    if self.metadata is not None:
        index['metadata'] = self.metadata

    with open(model_path, 'wb') as f:
        pickle.dump(index, f)
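
# A hedged round-trip sketch for Model.dump / Model.read. The constructor
# arguments follow the calls visible elsewhere in this listing
# (Model(support, weightfun, variables, bounds)); the formulas, bounds and the
# file name are made up for illustration.
from pysmt.shortcuts import And, Ite, LE, Real, Symbol
from pysmt.typing import REAL
from wmilearn.model import Model

x = Symbol("x", REAL)
support = And(LE(Real(0), x), LE(x, Real(1)))              # 0 <= x <= 1
weightfun = Ite(LE(x, Real(0.5)), Real(0.75), Real(1.25))  # piecewise-constant weight
model = Model(support, weightfun, [x], {"x": [0, 1]})
model.dump("toy.model")               # writes the pickled index plus two SMT-LIB files
same_model = Model.read("toy.model")  # Model.read is used by Problem.read above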
def run_problem(problem,
                learner,
                seed,
                n_samples,
                timeout,
                global_norm,
                use_lariat=True):
    ground_truth = problem.model
    evaluation = dict()

    train = problem.datasets['train']
    valid = problem.datasets['valid']
    train_valid = Dataset(train.features, train.data + valid.data,
                          train.constraints)

    if problem.learned_supports is not None:
        prior_supports = {
            problem.metadata['supports_metadata'][i]['support_threshold_mult']:
            chi
            for i, chi in enumerate(problem.learned_supports)
        }
    else:
        logger.warning("Couldn't find any learned support.")
        prior_supports = dict()

    prior_supports['None'] = None
    prior_supports['gt-renorm'] = ground_truth.support

    t_0 = time()
    learner.estimate_density(train, validation_data=valid)
    t_f = time() - t_0
    logger.info("training time: {}".format(t_f))
    evaluation['training_time'] = t_f

    learned_models = []
    cached_models = dict()
    max_ll = None
    best = None
    logger.info("Evaluating:\n {}".format("\n".join(
        map(str, prior_supports.keys()))))
    for t_mult, prior_support in prior_supports.items():

        if t_mult != 'None' and not use_lariat:
            continue

        evaluation[t_mult] = dict()
        ps_str = serialize(prior_support) if not isinstance(t_mult, str) else t_mult
        if ps_str in cached_models:
            learned_model, evaluation[t_mult] = cached_models[ps_str]
        else:
            try:
                logger.info(
                    "--------------------------------------------------")
                logger.info("Support: {}".format(t_mult))
                mode = RENORM_FULL if prior_support is not None else RENORM_OFF
                t_0 = time()
                learned_model, renormd = learner.renormalize(
                    train,
                    seed,
                    mode=mode,
                    support=prior_support,
                    timeout=timeout,
                    global_norm=global_norm)
                t_f = time() - t_0
                if not renormd and prior_support is not None:
                    continue

                evaluation[t_mult]['renorm_time'] = t_f

            except CalledProcessError as e:
                logger.warning("XADD error: {}".format(e))
                continue
            except ModelException as e:
                logger.warning("Model error: {}".format(e))
                continue

            logger.debug("Computing approx-IAE")
            iae = approx_IAE(learned_model, ground_truth, seed, n_samples)
            evaluation[t_mult]['approx-iae'] = iae

            logger.debug("Computing train-LL")
            train_ll, train_out = learned_model.log_likelihood(train)
            evaluation[t_mult]['train-ll'] = train_ll
            evaluation[t_mult]['train-out'] = train_out

            logger.debug("Computing valid-LL")
            valid_ll, valid_out = learned_model.log_likelihood(valid)
            evaluation[t_mult]['valid-ll'] = valid_ll
            evaluation[t_mult]['valid-out'] = valid_out

            train_valid_ll, train_valid_out = learned_model.log_likelihood(
                train_valid)
            evaluation[t_mult]['train-valid-ll'] = train_valid_ll
            evaluation[t_mult]['train-valid-out'] = train_valid_out

            if t_mult not in ['None', 'gt-renorm'] \
               and (max_ll is None or valid_ll > max_ll):
                max_ll = valid_ll
                best = t_mult

            logger.debug("Computing volume difference")
            poly1 = Model(learned_model.support, None,
                          ground_truth.get_vars(), ground_truth.bounds)
            poly2 = Model(ground_truth.support, None,
                          ground_truth.get_vars(), ground_truth.bounds)
            vol_diff = ISE(poly1, poly2, seed, n_samples, engine='rej')
            evaluation[t_mult]['vol-diff'] = vol_diff

            cached_models[ps_str] = (learned_model, evaluation[t_mult])

        domain = Domain.make(
            map(lambda v: v.symbol_name(), ground_truth.boolean_vars),
            learned_model.bounds)
        eval_falses = evaluate(domain, learned_model.support,
                               np.asarray(train.data))

        learned_models.append((t_mult, learned_model))

    evaluation['best'] = best
    tmuls = sorted([
        key for key in evaluation
        if key not in ['None', 'gt-renorm', 'training_time', 'best']
    ])
    eval_msg = """RESULTS:
Training time: {}
No renorm: {}
GT renorm: {}
Best chi : {}
All chis: {}
""".format(evaluation['training_time'], evaluation['None'],
           evaluation['gt-renorm'], (best, evaluation.get(best)),
           "\n".join([str((tmul, evaluation[tmul])) for tmul in tmuls]))

    logger.info(eval_msg)
    return learned_models, evaluation
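
# A hedged sketch of how the pair returned by run_problem can be inspected.
# All argument values below are placeholders; keys other than 'training_time',
# 'best', 'None' and 'gt-renorm' are the support threshold multipliers evaluated
# in the loop above.
models, evaluation = run_problem(problem, learner, seed=666, n_samples=10000,
                                 timeout=1200, global_norm=False)
print("best threshold multiplier:", evaluation['best'])
for key, entry in evaluation.items():
    if key in ('training_time', 'best'):
        continue
    print(key, entry.get('valid-ll'), entry.get('approx-iae'))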
mspn_parser.add_argument("--alpha", type=float, help="alpha?")
mspn_parser.add_argument("--prior-weight", type=float, help="prior weight?")
mspn_parser.add_argument("--leaf",
                         choices=['piecewise', 'isotonic'],
                         help="leaf?")
mspn_parser.add_argument("--row-split",
                         choices=['rdc-kmeans', 'gower'],
                         help="row split?")

args = parser.parse_args()

# better check this first
if exists(args.output_path):
    logger.warning("File exists: {}".format(args.output_path))

experiment = Experiment.read(args.experiment_path)

learner_args = {}
if args.seed:
    learner_args['seed'] = args.seed

if args.action == 'det':
    if args.n_min:
        learner_args['n_min'] = args.n_min
    if args.n_max:
        learner_args['n_max'] = args.n_max
    if args.n_bins:
def renormalize(self,
                training_data,
                seed,
                mode=RENORM_OFF,
                support=None,
                timeout=None,
                global_norm=False):
    if timeout is None:
        timeout = DEF_RENORM_TIMEOUT

    detcopy = self.det.copy()
    model_support = detcopy.tree_to_WMI_support()
    model_weight = detcopy.tree_to_WMI_weightfun()
    bounds = {
        v.symbol_name(): b
        for v, b in detcopy.root.bounds.items() if v.symbol_type() == REAL
    }

    renorm_support = None
    if mode == RENORM_BG_ONLY and training_data.constraints is not None:
        renorm_support = training_data.constraints
    elif mode == RENORM_FULL:
        if training_data.constraints is not None and support is not None:
            renorm_support = training_data.constraints & support
        elif training_data.constraints is not None:
            renorm_support = training_data.constraints
        elif support is not None:
            renorm_support = support

    renormalized = False
    if renorm_support is not None:
        if global_norm:
            logger.debug("Global renormalization")
            model_support = model_support & renorm_support
            renormalized = True
        else:
            logger.debug("Local renormalization")

            def renorm_wrap(inst, support, support_path, weight_path):
                try:
                    inst.renormalize(support)
                    support = inst.tree_to_WMI_support()
                    weight = inst.tree_to_WMI_weightfun()
                    msg = "Writing result to files:\n{}\n{}"
                    logger.debug(msg.format(support_path, weight_path))
                    write_smtlib(support, support_path)
                    write_smtlib(weight, weight_path)
                    logger.debug("Done.")
                except ModelException as e:
                    logger.error(
                        "Couldn't renormalize the DET: {}".format(e))

            # communication with wrapper process through file
            # NEVER use multiprocessing.Queue with huge pysmt formulas
            rndstr = ''.join(choice(TMP_CHARS) for _ in range(TMP_LEN))
            support_path = "{}.support".format(rndstr)
            weight_path = "{}.weight".format(rndstr)
            timed_proc = Process(target=renorm_wrap,
                                 args=(detcopy, renorm_support, support_path,
                                       weight_path))

            logger.debug(
                "Starting renormalization with timeout: {}".format(timeout))
            timed_proc.start()
            logger.debug("Timed proc started")
            timed_proc.join(timeout)
            logger.debug("Timed proc joined")
            if timed_proc.is_alive():
                logger.warning("Renormalization timed out")
                pid = timed_proc.pid
                logger.warning(
                    "Killing process {} and its children".format(pid))
                kill_recursive(pid)
            else:
                try:
                    model_support = read_smtlib(support_path)
                    remove(support_path)
                except FileNotFoundError:
                    model_support = None
                try:
                    model_weight = read_smtlib(weight_path)
                    remove(weight_path)
                except FileNotFoundError:
                    model_weight = None

                if model_support is None or model_weight is None:
                    raise ModelException("Couldn't renormalize the DET")

                logger.debug("Renormalization done")
                renormalized = True

    model = Model(model_support,
                  model_weight,
                  list(map(lambda x: x[0], training_data.features)),
                  bounds,
                  metadata=self.learner_args)

    # is Z = 1?
    if renormalized:
        check_Z_normalize(model, seed, TEST_AND_NORM_SAMPLES)
    elif not global_norm:
        # fallback strategy for local: to global
        model, renormalized = self.renormalize(training_data,
                                               seed,
                                               mode=mode,
                                               support=support,
                                               timeout=timeout,
                                               global_norm=True)

    return model, renormalized
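
# A standalone sketch of the same local-renormalization pattern: run the worker in
# a separate process, pass the result back through a temporary file (not a Queue),
# and kill the worker if it exceeds the timeout. Names and the 5-second timeout are
# illustrative only; they are not part of wmilearn.
from multiprocessing import Process
from os import remove


def _worker(out_path):
    with open(out_path, 'w') as f:
        f.write("done")


out_path = "renorm_demo.out"
proc = Process(target=_worker, args=(out_path,))
proc.start()
proc.join(5)                # wait at most 5 seconds
if proc.is_alive():
    proc.terminate()        # the DET learner uses kill_recursive() to also stop children
    result = None
else:
    try:
        with open(out_path) as f:
            result = f.read()
        remove(out_path)
    except FileNotFoundError:
        result = None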
def renormalize(self,
                training_data,
                seed,
                mode=RENORM_OFF,
                support=None,
                timeout=None,
                global_norm=True):
    if timeout is None:
        timeout = DEF_RENORM_TIMEOUT

    feature_dict = {
        var.symbol_name(): var
        for var, _ in training_data.features
    }
    model_weightfun, model_support = SPN_to_WMI(self.spn.root, feature_dict)

    bounds = {}
    for i, feat in enumerate(training_data.features):
        var = feat[0]
        if var.symbol_type() == REAL:
            xi = list(map(lambda row: row[i], training_data.data))
            bounds[var.symbol_name()] = [min(xi), max(xi)]

    renorm_support = None
    if mode == RENORM_BG_ONLY and training_data.constraints is not None:
        renorm_support = training_data.constraints
    elif mode == RENORM_FULL:
        if training_data.constraints is not None and support is not None:
            renorm_support = training_data.constraints & support
        elif training_data.constraints is not None:
            renorm_support = training_data.constraints
        elif support is not None:
            renorm_support = support

    renormalized = False
    if renorm_support is not None:
        if global_norm:
            logger.debug("Global renormalization")
            model_support = model_support & renorm_support
            renormalized = True
        else:
            logger.debug("Local renormalization")
            domain = Domain.make([
                v.symbol_name() for v, _ in training_data.features
                if v.symbol_type() == BOOL
            ], bounds)

            nc_model_support = normalize_formula(model_support)
            nc_model_weightfun = normalize_formula(model_weightfun)
            nc_renorm_support = normalize_formula(renorm_support)

            t_0 = time()
            xaddsolver = XaddEngine(domain,
                                    nc_model_support,
                                    nc_model_weightfun,
                                    mode="original",
                                    timeout=timeout)
            t_init = time() - t_0
            logger.debug("XADDEngine t_init: {}".format(t_init))
            try:
                t_1 = time()
                res = xaddsolver.normalize(nc_renorm_support)
                t_norm = time() - t_1
            except CalledProcessError as e:
                raise ModelException("CalledProcessError: {}".format(e))

            if res is None:
                logger.warning("Timeout.")
            else:
                logger.debug("XADDEngine t_norm: {}".format(t_norm))
                model_weightfun = get_env().formula_manager.normalize(res)
                model_support = get_env().formula_manager.normalize(
                    And(model_support, renorm_support))
                renormalized = True

    model = Model(model_support,
                  model_weightfun,
                  list(map(lambda x: x[0], training_data.features)),
                  bounds,
                  metadata=self.learner_args)

    if renormalized:
        check_Z_normalize(model, seed, TEST_AND_NORM_SAMPLES)
    elif not global_norm:
        # fallback strategy for local: to global
        model, renormalized = self.renormalize(training_data,
                                               seed,
                                               mode=mode,
                                               support=support,
                                               timeout=timeout,
                                               global_norm=True)

    return model, renormalized
from string import ascii_uppercase, digits
from subprocess import CalledProcessError
from time import time

import psutil

from wmilearn import logger
from wmilearn.exceptions import ModelException
from wmilearn.model import Model
from wmilearn.det import DET
from wmilearn.utils import check_Z_normalize

MSPN_import_err = None
try:
    from tfspn.SPN import SPN, Splitting
    from wmilearn.conversions import SPN_to_WMI
except ImportError as e:
    logger.warning("Couldn't import the MSPN library: " + str(e))
    MSPN_import_err = e


def kill_recursive(pid):
    proc = psutil.Process(pid)
    for subproc in proc.children(recursive=True):
        try:
            subproc.kill()
        except psutil.NoSuchProcess:
            continue
    try:
        proc.kill()
    except psutil.NoSuchProcess:
        pass
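
# Minimal usage sketch for kill_recursive (illustrative only; "sleep" assumes a
# Unix-like system): spawn a long-running child and stop it together with any
# processes it may have spawned.
import subprocess

child = subprocess.Popen(["sleep", "60"])
kill_recursive(child.pid)
child.wait()   # reap the killed process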
    chi, k, h, thresholds, threshold_mult = res
    learned_supports.append(chi)
    metadata = dict()
    metadata['support_k'] = k
    metadata['support_h'] = h
    metadata['support_seed'] = args.seed
    metadata['support_thresholds'] = thresholds
    metadata['support_threshold_mult'] = threshold_mult
    supports_metadata.append(metadata)

if len(learned_supports) == 0:
    # try projecting on the continuous subspace
    logger.warning("No support learned on the full space. Projecting..")
    numerical_vars = [
        v for v, s in train_valid.features
        if s in ["continuous", "discrete"]
    ]
    projected_train_valid = train_valid.project(numerical_vars)
    for res in learn_supports_adaptive(
            projected_train_valid,
            args.seed,
            timeout=timeout,
            bg_knowledge=train.constraints,
            negative_bootstrap=args.negative_bootstrap):
        chi, k, h, thresholds, threshold_mult = res
        learned_supports.append(chi)