def learn_supports_adaptive(dataset, seed, bg_knowledge=None, timeout=None, initial=None, mult=None, hops=None, max_mult=None, negative_bootstrap=None):
    """Search the threshold-multiplier space for distinct learned supports.

    Repeatedly calls learn_support with different multipliers: on success it
    records the formula (if unseen) and moves toward smaller multipliers; on
    timeout it moves toward larger ones. Once both a succeeding and a failing
    multiplier are known, it bisects between them, counting each bisection as
    one hop.

    Returns a list of (chi, k, h, thresholds, t_mult) tuples, one per distinct
    learned support.
    """
    timeout = DEF_TIMEOUT if timeout is None else timeout
    initial = DEF_INITIAL if initial is None else initial
    mult = DEF_MULT if mult is None else mult
    hops = DEF_HOPS if hops is None else hops
    max_mult = DEF_MAX_MULT if max_mult is None else max_mult

    supports = []        # accumulated (chi, k, h, thresholds, t_mult) results
    seen_formulas = set()  # serialized supports already recorded
    visited = set()        # multipliers tried so far
    t_mult = initial
    hop_count = 0

    msg = "Adaptive support learning. timeout = {}, init = {}, mult = {}, hops = {}"
    logger.info(msg.format(timeout, initial, mult, hops))

    while hop_count < hops and t_mult < max_mult:
        logger.debug("i: {} last: {}".format(hop_count, t_mult))
        visited.add(t_mult)
        outcome = learn_support(dataset, seed, t_mult,
                                timeout=timeout,
                                bg_knowledge=bg_knowledge,
                                symmetry_breaking="mvn",
                                negative_bootstrap=negative_bootstrap)
        if outcome is None:
            # this multiplier timed out: back off toward larger multipliers
            above = {t for t in visited if t > t_mult}
            if above:
                t_mult = (t_mult + min(above)) / 2
                hop_count += 1
            else:
                t_mult = t_mult * mult
        else:
            chi, k, h, thresholds = outcome
            formula_str = serialize(chi)
            below = {t for t in visited if t < t_mult}
            if formula_str not in seen_formulas:
                seen_formulas.add(formula_str)
                supports.append(outcome + (t_mult,))
            if below:
                # bisect toward the largest smaller multiplier already tried
                t_mult = (t_mult + max(below)) / 2
                hop_count += 1
            else:
                t_mult = t_mult / mult

    return supports
def run_experiment(experiment, learner, output_path, seed, n_samples, global_norm, timeout=None, discard_missing=True):
    """Run `learner` on every problem of `experiment` and pickle the results.

    For each problem, runs run_problem, dumps every learned model next to
    `output_path` (recording the dump path in the problem's evaluation dict),
    and collects the evaluation. Problems with missing data are skipped when
    `discard_missing` is True. The list of evaluations is pickled to
    `output_path` (nothing is written if every problem was discarded).

    Fixes vs. previous revision: removed the dead `n_discarded` counter and
    the manual `j` index (now enumerate); hoisted the loop-invariant
    basename/dirname computation out of the per-model loop.
    """
    if timeout is None:
        timeout = DEF_TIMEOUT
    results = []
    # loop-invariant pieces of the model dump path
    output_name = basename(output_path)
    folder = abspath(dirname(output_path))
    for i, problem in enumerate(experiment.problems):
        learned_models, evaluation = run_problem(problem, learner, seed,
                                                 n_samples, timeout,
                                                 global_norm)
        # "missing data": either baseline evaluation is empty, or no best
        # support was selected
        missing_data = (evaluation['gt-renorm'] == {}
                        or evaluation['None'] == {}
                        or evaluation['best'] is None)
        if discard_missing and missing_data:
            continue
        if learned_models is not None:
            for j, (t_mult, learned_model) in enumerate(learned_models):
                model_name = output_name + "_{}_learned_{}".format(i, j)
                model_path = join(folder, model_name)
                learned_model.dump(model_path)
                evaluation[t_mult]['model_path'] = model_path
        results.append(evaluation)
    n_dis = len(experiment.problems) - len(results)
    n_tot = len(experiment.problems)
    logger.info("Experiment done. Discarded {}/{}.".format(n_dis, n_tot))
    if len(results) > 0:
        with open(output_path, 'wb') as f:
            pickle.dump(results, f)
    else:
        logger.warning("Nothing to dump!")
def estimate_density(self, training_data, validation_data=None):
    """Fit a DET on the training data.

    The tree is grown fully, then pruned: with the validation data when
    given, otherwise by cross-validation (optionally with the configured
    number of bins).
    """
    # only n_min / n_max from learner_args are DET constructor parameters
    tree_params = {key: self.learner_args[key]
                   for key in ('n_min', 'n_max')
                   if key in self.learner_args}
    self.det = DET(**tree_params)

    logger.info("Growing the full tree")
    self.det.grow_full_tree(training_data)
    logger.info(self.det.info())

    logger.info("Pruning the full tree")
    if validation_data is not None:
        self.det.prune_with_validation(validation_data)
    elif 'n_bins' in self.learner_args:
        self.det.prune_with_cv(training_data,
                               n_bins=self.learner_args['n_bins'])
    else:
        self.det.prune_with_cv(training_data)
    logger.info(self.det.info())
def run_problem(problem, learner, seed, n_samples, timeout, global_norm, use_lariat=True):
    """Train a density estimator on one problem and evaluate it under each prior support.

    The candidate supports are the problem's learned supports (keyed by their
    threshold multiplier), plus 'None' (no renormalization) and 'gt-renorm'
    (the ground-truth support). For each support the learned density is
    renormalized and evaluated (approx-IAE, train/valid/train+valid
    log-likelihood, support volume difference). The multiplier with the best
    validation LL (excluding the two baselines) is recorded under
    evaluation['best'].

    Returns:
        (learned_models, evaluation): list of (t_mult, model) pairs and the
        per-support evaluation dict (also holds 'training_time' and 'best').
    """
    ground_truth = problem.model
    evaluation = dict()
    train = problem.datasets['train']
    valid = problem.datasets['valid']
    # concatenation of train + valid rows, used for the train-valid LL below
    train_valid = Dataset(train.features, train.data + valid.data,
                          train.constraints)

    # map each support's threshold multiplier to its formula
    if problem.learned_supports is not None:
        prior_supports = {
            problem.metadata['supports_metadata'][i]['support_threshold_mult']: chi
            for i, chi in enumerate(problem.learned_supports)
        }
    else:
        logger.warning("Couldn't find any learned support.")
        prior_supports = dict()

    # baselines: no renormalization, and renormalization on the GT support
    prior_supports['None'] = None
    prior_supports['gt-renorm'] = ground_truth.support

    t_0 = time()
    learner.estimate_density(train, validation_data=valid)
    t_f = time() - t_0
    logger.info("training time: {}".format(t_f))
    evaluation['training_time'] = t_f

    learned_models = []
    cached_models = dict()  # serialized support -> (model, evaluation entry)
    max_ll = None
    best = None
    logger.info("Evaluating:\n {}".format("\n".join(
        map(str, prior_supports.keys()))))
    for t_mult, prior_support in prior_supports.items():
        # without LARIAT only the unrenormalized baseline is evaluated
        if t_mult != 'None' and not use_lariat:
            continue
        evaluation[t_mult] = dict()
        # cache key: the serialized support (baselines keyed by their label)
        ps_str = serialize(prior_support) if not isinstance(t_mult, str) else t_mult
        if ps_str in cached_models:
            learned_model, evaluation[t_mult] = cached_models[ps_str]
        else:
            try:
                logger.info(
                    "--------------------------------------------------")
                logger.info("Support: {}".format(t_mult))
                mode = RENORM_FULL if prior_support is not None else RENORM_OFF
                t_0 = time()
                learned_model, renormd = learner.renormalize(
                    train, seed, mode=mode, support=prior_support,
                    timeout=timeout, global_norm=global_norm)
                t_f = time() - t_0
                # renormalization was requested but did not happen: skip
                if not renormd and prior_support is not None:
                    continue
                evaluation[t_mult]['renorm_time'] = t_f
            except CalledProcessError as e:
                logger.warning("XADD error: {}".format(e))
                continue
            except ModelException as e:
                logger.warning("Model error: {}".format(e))
                continue

            logger.debug("Computing approx-IAE")
            iae = approx_IAE(learned_model, ground_truth, seed, n_samples)
            evaluation[t_mult]['approx-iae'] = iae
            logger.debug("Computing train-LL")
            train_ll, train_out = learned_model.log_likelihood(train)
            evaluation[t_mult]['train-ll'] = train_ll
            evaluation[t_mult]['train-out'] = train_out
            logger.debug("Computing valid-LL")
            valid_ll, valid_out = learned_model.log_likelihood(valid)
            evaluation[t_mult]['valid-ll'] = valid_ll
            evaluation[t_mult]['valid-out'] = valid_out
            train_valid_ll, train_valid_out = learned_model.log_likelihood(
                train_valid)
            evaluation[t_mult]['train-valid-ll'] = train_valid_ll
            evaluation[t_mult]['train-valid-out'] = train_valid_out

            # model selection: best validation LL among the learned supports
            # (the 'None' / 'gt-renorm' baselines never win)
            if t_mult not in ['None','gt-renorm'] \
               and (max_ll is None or valid_ll > max_ll):
                max_ll = valid_ll
                best = t_mult

            logger.debug("Computing volume difference")
            # weight-free copies of the two supports, compared by sampling
            poly1 = Model(learned_model.support, None,
                          ground_truth.get_vars(), ground_truth.bounds)
            poly2 = Model(ground_truth.support, None,
                          ground_truth.get_vars(), ground_truth.bounds)
            vol_diff = ISE(poly1, poly2, seed, n_samples, engine='rej')
            evaluation[t_mult]['vol-diff'] = vol_diff
            cached_models[ps_str] = (learned_model, evaluation[t_mult])

            domain = Domain.make(
                map(lambda v: v.symbol_name(), ground_truth.boolean_vars),
                learned_model.bounds)
            # NOTE(review): eval_falses is computed but never read afterwards
            # — presumably leftover debugging; confirm before removing.
            eval_falses = evaluate(domain, learned_model.support,
                                   np.asarray(train.data))

        learned_models.append((t_mult, learned_model))

    evaluation['best'] = best
    tmuls = sorted([
        key for key in evaluation
        if key not in ['None', 'gt-renorm', 'training_time', 'best']
    ])
    eval_msg = """RESULTS:
Training time: {}
No renorm: {}
GT renorm: {}
Best chi : {}
All chis: {}
""".format(evaluation['training_time'], evaluation['None'],
           evaluation['gt-renorm'], (best, evaluation.get(best)),
           "\n".join([str((tmul, evaluation[tmul])) for tmul in tmuls]))
    logger.info(eval_msg)
    return learned_models, evaluation
learner_args['n_bins'] = args.n_bins learner = DETLearner(learner_args) elif args.action == 'mspn': if args.min_inst_slice: learner_args['min_instances_slice'] = args.min_inst_slice if args.alpha: learner_args['alpha'] = args.alpha if args.prior_weight: learner_args['prior_weight'] = args.prior_weight if args.leaf: learner_args['leaf'] = args.leaf if args.row_split: learner_args['row_split'] = args.row_split learner = MSPNLearner(learner_args) else: assert (False), "Unknown action" logger.info("Running {} on experiment {}".format(args.action, args.experiment_path)) run_experiment(experiment, learner, args.output_path, args.seed, args.n_samples, args.global_norm, timeout=args.renorm_timeout)
def generate_experiment(seed, n_problems, n_train, n_valid, n_reals, n_bools, depth, bias, k, literals, h, ratio, errors):
    """Generate a synthetic Experiment of `n_problems` problems.

    Each problem pairs a randomly generated, normalized ground-truth model
    (support from support_generator, weights from ModelGenerator) with train
    and validation datasets sampled from it. Generation is retried whenever a
    ModelException is raised anywhere in the attempt.
    """
    # NOTE(review): 'depth' is used below but not echoed in this log message
    # — confirm whether that is intentional.
    logger.info("Generating experiment:\n" +
                "seed: {}\n".format(seed) +
                "n_problems: {}\n".format(n_problems) +
                "n_train: {}\n".format(n_train) +
                "n_valid: {}\n".format(n_valid) +
                "n_reals: {}\n".format(n_reals) +
                "n_bools: {}\n".format(n_bools) +
                "bias: {}\n".format(bias) +
                "k: {}\n".format(k) +
                "literals: {}\n".format(literals) +
                "h: {}\n".format(h) +
                "ratio: {}\n".format(ratio) +
                "errors: {}\n".format(errors))

    model_generator = ModelGenerator(n_reals, n_bools, seed,
                                     templ_bools="b{}",
                                     templ_reals="r{}",
                                     initial_bounds=[0, 1])
    problems = []
    while len(problems) < n_problems:
        try:
            # generating the ground truth model
            # not complex enough
            #chi = model_generator.generate_support_tree(depth)
            sample_count = 1000
            chi = support_generator(1, n_bools, n_reals, bias, k, literals, h,
                                    sample_count, ratio, errors, seed)[0]
            w = model_generator.generate_weights_tree(depth, nonnegative=True,
                                                      splits_only=True)

            # variables of the model: those free in chi plus those the
            # generator created, split by type
            boolean_vars = list(set(v for v in chi.get_free_variables()
                                    if v.symbol_type() == BOOL).union(
                                        set(model_generator.bools)))
            real_vars = list(set(v for v in chi.get_free_variables()
                                 if v.symbol_type() == REAL).union(
                                     set(model_generator.reals)))

            # every real variable gets the generator's initial bounds, and
            # the support is conjoined with the corresponding box constraints
            bounds = {v.symbol_name() : model_generator.initial_bounds
                      for v in real_vars}
            fbounds = And([And(LE(Real(bounds[var.symbol_name()][0]), var),
                               LE(var, Real(bounds[var.symbol_name()][1])))
                           for var in real_vars])
            model = Model(And(fbounds, chi), w, boolean_vars + real_vars,
                          bounds)

            # use exact inference to normalize the ground truth
            sample_count = None
            normalize(model, seed, sample_count, engine='pa')

            logger.debug("model generator reals: {}".format(
                model_generator.reals))
            logger.debug("model generator IDs: {}".format(
                list(map(id, model_generator.reals))))
            logger.debug("model reals: {}".format(model.continuous_vars))
            logger.debug("model IDs: {}".format(
                list(map(id, model.continuous_vars))))

            # sampling the dataset from the ground truth model
            datasets = {}
            datasets['train'] = sample_dataset(model, n_train)
            datasets['valid'] = sample_dataset(model, n_valid)
        except ModelException as e:
            # generation failed somewhere above: retry from scratch
            logger.debug(e.msg)
            continue

        logger.debug("Model {}\n".format(len(problems)+1) +
                     "chi: {}\n".format(serialize(model.support)) +
                     "w: {}\n".format(serialize(model.weightfun)))

        problem = Problem(model, datasets, bounds=bounds)
        problems.append(problem)

    # better safe than sorry?
    # NOTE(review): metadata records only part of the generation parameters
    # (bias, k, literals, h, ratio, errors are omitted) — confirm intended.
    metadata = {'n_reals' : n_reals, 'n_bools' : n_bools, 'depth' : depth,
                'n_train' : n_train, 'n_valid' : n_valid, 'seed' : seed}
    return Experiment(problems, metadata=metadata)
parser.add_argument("-t", "--timeout", type=int, help="Timeout") parser.add_argument( "--negative-bootstrap", type=int, help="How many negative samples use to bootstrap INCAL+ (def: 0)", default=0) args = parser.parse_args() seed = args.seed n_samples = args.n_samples use_boolean_knowledge = True timeout = args.timeout if args.timeout else None logger.info("Running volume computations on {}".format( args.experiment_path)) experiment = Experiment.read(args.experiment_path) results = [] for i, problem in enumerate(experiment.problems): results.append([]) logger.info("====================") logger.info("Problem {}".format(i)) assert (problem.original_path is not None) train = problem.datasets['train'] valid = problem.datasets['valid'] #debug valid.data = valid.data[:2]
def learn_support(dataset, seed, threshold_mult, timeout=None, bg_knowledge=None, symmetry_breaking=None, negative_bootstrap=None):
    """Learn a support formula for `dataset` with INCAL+ in a timed subprocess.

    Args:
        dataset: dataset with .data (rows) and .features ((var, ...) pairs).
        seed: RNG seed for the initial example selection.
        threshold_mult: multiplier applied to the largest nearest-neighbor
            distance to obtain the violation threshold.
        timeout: seconds to wait for the learner process (None = no limit).
        bg_knowledge: optional background knowledge for the strategy.
        symmetry_breaking: symmetry-breaking option ("" when None).
        negative_bootstrap: synthetic negatives to add; an int-like value is
            an absolute count, a float-like value a ratio of |dataset|.

    Returns:
        (chi, k, h, thresholds) on success, None on timeout.

    Fix vs. previous revision: the nearest-neighbor threshold computation no
    longer raises TypeError (`None < 1`) on a single-row dataset.
    """
    logger.info(f"Running INCAL+. Symmetry breaking = {symmetry_breaking} negative_bootstrap = {negative_bootstrap}")
    # default might become symmetry_breaking = "mvn"
    if symmetry_breaking is None:
        symmetry_breaking = ""

    if negative_bootstrap is None:
        negative_bootstrap = 0
    else:
        try:
            # absolute count is specified with an integer
            negative_bootstrap = int(negative_bootstrap)
        except ValueError:
            # relative count (wrt |D|) is specified with a float
            negative_bootstrap = int(len(dataset) * float(negative_bootstrap))

    # compute per-variable bounds over the continuous features
    bounds = {}
    for row in dataset.data:
        for i, feat in enumerate(dataset.features):
            var = feat[0]
            if var.symbol_type() == BOOL:
                continue
            varname = var.symbol_name()
            if varname not in bounds:
                bounds[varname] = [row[i], row[i]]
            elif row[i] < bounds[varname][0]:
                bounds[varname][0] = row[i]
            elif row[i] > bounds[varname][1]:
                bounds[varname][1] = row[i]

    data = np.array(dataset.data)
    # all dataset rows are positive examples
    labels = np.ones(data.shape[0])

    # create a Domain instance
    varnames = []
    vartypes = {}
    for v, _ in dataset.features:
        varnames.append(v.symbol_name())
        vartypes[v.symbol_name()] = v.symbol_type()
    domain = Domain(varnames, vartypes, bounds)

    # largest l_inf distance between nearest neighbors, ignoring points whose
    # nearest neighbor is >= 1 away
    distance = Distance(domain, Distance.l_inf)
    max_closest = None
    for i1 in range(len(data)):
        min_distance = None
        for i2 in range(len(data)):
            if i1 != i2:
                p1, p2 = dataset.data[i1], dataset.data[i2]
                d = distance.between(p1, p2)
                min_distance = d if min_distance is None else min(min_distance, d)
        # FIX: min_distance is None for a single-row dataset; the unguarded
        # `min_distance < 1` previously raised TypeError in that case
        if min_distance is not None and min_distance < 1:
            max_closest = min_distance if max_closest is None else max(max_closest, min_distance)
    logger.debug("Maximum distance between closest neighbors: {}".format(max_closest))

    # NOTE(review): max_closest can still be None (every nearest neighbor
    # >= 1 away); the multiplication below would then raise — confirm that
    # callers never hit this case.
    threshold = threshold_mult * max_closest
    logger.debug("Overall threshold: {}".format(threshold))
    thresholds = {r: threshold * domain.domain_size(r) for r in domain.real_vars}
    logger.debug("Thresholds per dimension: {}".format(thresholds))

    def learn_inc(_data, _labels, _i, _k, _h):
        # one INCAL iteration with a one-class strategy and optional negatives
        strategy = OneClassStrategy(RandomViolationsStrategy(10), thresholds,
                                    background_knowledge=bg_knowledge)
        if negative_bootstrap > 0:
            _data, _labels = OneClassStrategy.add_negatives(
                domain, _data, _labels, thresholds, negative_bootstrap)
        learner = KCnfSmtLearner(_k, _h, strategy, symmetry_breaking)
        random.seed(seed)
        initial_indices = LearnOptions.initial_random(20)(list(range(len(_data))))
        return learner.learn(domain, _data, _labels, initial_indices)

    # wrapping INCAL+ into a timed process
    def learn_wrap(data, labels, learn_inc, queue):
        res = learn_bottom_up(data, labels, learn_inc, 1, 1, 1, 1, None, None)
        (new_data, new_labels, formula), k, h = res
        msg = "Learned CNF(k={}, h={})"
        logger.debug(msg.format(k, h))
        msg = "Data-set grew from {} to {} entries"
        logger.debug(msg.format(len(labels), len(new_labels)))
        queue.put((formula, k, h))

    queue = Queue()
    timed_proc = Process(target=learn_wrap, args=(data, labels, learn_inc, queue))
    timed_proc.start()
    timed_proc.join(timeout)
    if timed_proc.is_alive():
        # timed process didn't complete the job
        timed_proc.terminate()
        timed_proc.join()
        return None
    # get the learned formula, (k,h)
    chi, k, h = queue.get()
    return chi, k, h, thresholds
help="Seed number") parser.add_argument("-t", "--timeout", type=int, help="Timeout") helpneg = """Use negative bootstrap INCAL+ (def: 1.0), - an integer inticates absolute number of samples - a float indicates the ratio wrt the training set size""" parser.add_argument("--negative-bootstrap", type=str, help=helpneg, default="1.0") args = parser.parse_args() timeout = args.timeout if args.timeout else None logger.info("Running support learning on {}".format(args.experiment_path)) logger.info("Timeout: {}".format(timeout)) logger.info("N. negatives: {}".format(args.negative_bootstrap)) experiment = Experiment.read(args.experiment_path) for i, problem in enumerate(experiment.problems): logger.info("====================") logger.info("Problem {}".format(i)) assert (problem.original_path is not None) if len(problem.learned_supports) > 0: msg = "Found {} supports. Skipping." logger.info(msg.format(len(problem.learned_supports))) continue