Example #1
def learn_supports_adaptive(dataset, seed, bg_knowledge=None, timeout=None, initial=None, mult=None,
                            hops=None, max_mult=None, negative_bootstrap=None):

    if timeout is None:
        timeout = DEF_TIMEOUT

    if initial is None:
        initial = DEF_INITIAL

    if mult is None:
        mult = DEF_MULT

    if hops is None:
        hops = DEF_HOPS

    if max_mult is None:
        max_mult = DEF_MAX_MULT

    results = []
    discovered = set()
    t_mults = set()
    
    last = initial
    i = 0

    msg = "Adaptive support learning. timeout = {}, init = {}, mult = {}, hops = {}"
    logger.info(msg.format(timeout, initial, mult, hops))
    while i < hops and last < max_mult:
        logger.debug("i: {} last: {}".format(i, last))
        t_mults.add(last)
        res = learn_support(dataset, seed, last, timeout=timeout, bg_knowledge=bg_knowledge,
                            symmetry_breaking="mvn",
                            negative_bootstrap=negative_bootstrap)
        
        if res is not None:
            chi, k, h, thresholds = res
            chistr = serialize(chi)            
            smaller = {t for t in t_mults if t < last}
            
            if chistr not in discovered:
                discovered.add(chistr)
                results.append(res + (last,))

            if len(smaller) > 0:
                last = (last + max(smaller)) / 2
                i += 1
            else:
                last = last / mult

        else: # last t_mult timed out
            larger = {t for t in t_mults if t > last}
            if len(larger) > 0:
                last = (last + min(larger)) / 2
                i += 1
            else:
                last = last * mult

    return results
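
# Minimal usage sketch (illustrative, not from the original module): `dataset`
# is assumed to be a Dataset instance and `bg` an optional background-knowledge
# formula. Each returned tuple has the shape built above:
# (chi, k, h, thresholds, threshold_mult).
#
#   supports = learn_supports_adaptive(dataset, seed=42, bg_knowledge=bg,
#                                      timeout=1200)
#   for chi, k, h, thresholds, t_mult in supports:
#       logger.info("t_mult = {}: {}".format(t_mult, serialize(chi)))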
Example #2
def run_experiment(experiment,
                   learner,
                   output_path,
                   seed,
                   n_samples,
                   global_norm,
                   timeout=None,
                   discard_missing=True):

    if timeout is None:
        timeout = DEF_TIMEOUT

    results = []
    n_discarded = 0
    for i, problem in enumerate(experiment.problems):

        learned_models, evaluation = run_problem(problem, learner, seed,
                                                 n_samples, timeout,
                                                 global_norm)

        missing_data = (evaluation['gt-renorm'] == {}
                        or evaluation['None'] == {}
                        or evaluation['best'] is None)

        if discard_missing and missing_data:
            n_discarded += 1
            continue

        if learned_models is not None:
            j = 0
            for t_mult, learned_model in learned_models:
                output_name = basename(output_path)
                folder = abspath(dirname(output_path))
                model_name = output_name + "_{}_learned_{}".format(i, j)
                model_path = join(folder, model_name)
                learned_model.dump(model_path)
                evaluation[t_mult]['model_path'] = model_path
                j += 1

        results.append(evaluation)

    n_tot = len(experiment.problems)
    logger.info("Experiment done. Discarded {}/{}.".format(n_discarded, n_tot))

    if len(results) > 0:
        with open(output_path, 'wb') as f:
            pickle.dump(results, f)
    else:
        logger.warning("Nothing to dump!")
Example #3
    def estimate_density(self, training_data, validation_data=None):
        """Fit a DET on the training data. If the optional validation data is
        provided, it is used to prune the tree."""

        self.det = DET(**{
            k: v
            for k, v in self.learner_args.items() if k in ['n_min', 'n_max']
        })
        logger.info("Growing the full tree")
        self.det.grow_full_tree(training_data)
        logger.info(self.det.info())
        logger.info("Pruning the full tree")
        if validation_data is None:
            if 'n_bins' in self.learner_args:
                self.det.prune_with_cv(training_data,
                                       n_bins=self.learner_args['n_bins'])
            else:
                self.det.prune_with_cv(training_data)
        else:
            self.det.prune_with_validation(validation_data)

        logger.info(self.det.info())
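
# Illustrative call (hypothetical hyperparameter values; assumes a
# DETLearner-style wrapper exposing this method, as instantiated in the
# experiment driver further below):
#
#   learner = DETLearner({'n_min': 5, 'n_max': 10, 'n_bins': 4})
#   learner.estimate_density(train, validation_data=valid)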
Example #4
def run_problem(problem,
                learner,
                seed,
                n_samples,
                timeout,
                global_norm,
                use_lariat=True):

    ground_truth = problem.model
    evaluation = dict()

    train = problem.datasets['train']
    valid = problem.datasets['valid']

    train_valid = Dataset(train.features, train.data + valid.data,
                          train.constraints)

    if problem.learned_supports is not None:
        prior_supports = {
            problem.metadata['supports_metadata'][i]['support_threshold_mult']:
            chi
            for i, chi in enumerate(problem.learned_supports)
        }
    else:
        logger.warning("Couldn't find any learned support.")
        prior_supports = dict()

    prior_supports['None'] = None
    prior_supports['gt-renorm'] = ground_truth.support

    t_0 = time()
    learner.estimate_density(train, validation_data=valid)
    t_f = time() - t_0
    logger.info("training time: {}".format(t_f))
    evaluation['training_time'] = t_f

    learned_models = []
    cached_models = dict()
    max_ll = None
    best = None

    logger.info("Evaluating:\n {}".format("\n".join(
        map(str, prior_supports.keys()))))

    for t_mult, prior_support in prior_supports.items():

        if t_mult != 'None' and not use_lariat:
            continue

        evaluation[t_mult] = dict()
        ps_str = t_mult if isinstance(t_mult, str) else serialize(prior_support)

        if ps_str in cached_models:
            learned_model, evaluation[t_mult] = cached_models[ps_str]
        else:
            try:
                logger.info(
                    "--------------------------------------------------")
                logger.info("Support: {}".format(t_mult))

                mode = RENORM_FULL if prior_support is not None else RENORM_OFF
                t_0 = time()
                learned_model, renormd = learner.renormalize(
                    train,
                    seed,
                    mode=mode,
                    support=prior_support,
                    timeout=timeout,
                    global_norm=global_norm)
                t_f = time() - t_0
                if not renormd and prior_support is not None:
                    continue

                evaluation[t_mult]['renorm_time'] = t_f

            except CalledProcessError as e:
                logger.warning("XADD error: {}".format(e))
                continue

            except ModelException as e:
                logger.warning("Model error: {}".format(e))
                continue

            logger.debug("Computing approx-IAE")
            iae = approx_IAE(learned_model, ground_truth, seed, n_samples)
            evaluation[t_mult]['approx-iae'] = iae

            logger.debug("Computing train-LL")
            train_ll, train_out = learned_model.log_likelihood(train)
            evaluation[t_mult]['train-ll'] = train_ll
            evaluation[t_mult]['train-out'] = train_out
            logger.debug("Computing valid-LL")
            valid_ll, valid_out = learned_model.log_likelihood(valid)
            evaluation[t_mult]['valid-ll'] = valid_ll
            evaluation[t_mult]['valid-out'] = valid_out
            train_valid_ll, train_valid_out = learned_model.log_likelihood(
                train_valid)
            evaluation[t_mult]['train-valid-ll'] = train_valid_ll
            evaluation[t_mult]['train-valid-out'] = train_valid_out

            if t_mult not in ['None','gt-renorm'] \
               and (max_ll is None or valid_ll > max_ll):
                max_ll = valid_ll
                best = t_mult

            logger.debug("Computing volume difference")
            poly1 = Model(learned_model.support, None, ground_truth.get_vars(),
                          ground_truth.bounds)
            poly2 = Model(ground_truth.support, None, ground_truth.get_vars(),
                          ground_truth.bounds)
            vol_diff = ISE(poly1, poly2, seed, n_samples, engine='rej')

            evaluation[t_mult]['vol-diff'] = vol_diff

            cached_models[ps_str] = (learned_model, evaluation[t_mult])

            domain = Domain.make(
                map(lambda v: v.symbol_name(), ground_truth.boolean_vars),
                learned_model.bounds)
            eval_falses = evaluate(domain, learned_model.support,
                                   np.asarray(train.data))

        learned_models.append((t_mult, learned_model))

    evaluation['best'] = best

    tmuls = sorted([
        key for key in evaluation
        if key not in ['None', 'gt-renorm', 'training_time', 'best']
    ])

    eval_msg = """RESULTS:
Training time: {}
No renorm: {}
GT renorm: {}
Best chi : {}

All chis:
{}
""".format(evaluation['training_time'], evaluation['None'],
           evaluation['gt-renorm'], (best, evaluation.get(best)),
           "\n".join([str((tmul, evaluation[tmul])) for tmul in tmuls]))

    logger.info(eval_msg)

    return learned_models, evaluation
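
# Sketch of how run_problem is consumed (mirrors the loop in run_experiment;
# problem, learner, and the remaining arguments are assumed to come from an
# Experiment as above):
#
#   learned_models, evaluation = run_problem(problem, learner, seed,
#                                            n_samples, timeout, global_norm)
#   if evaluation['best'] is not None:
#       logger.info("Best support multiplier: {}".format(evaluation['best']))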
Example #5
            learner_args['n_bins'] = args.n_bins

        learner = DETLearner(learner_args)

    elif args.action == 'mspn':
        if args.min_inst_slice:
            learner_args['min_instances_slice'] = args.min_inst_slice
        if args.alpha:
            learner_args['alpha'] = args.alpha
        if args.prior_weight:
            learner_args['prior_weight'] = args.prior_weight
        if args.leaf:
            learner_args['leaf'] = args.leaf
        if args.row_split:
            learner_args['row_split'] = args.row_split

        learner = MSPNLearner(learner_args)

    else:
        assert (False), "Unknown action"

    logger.info("Running {} on  experiment {}".format(args.action,
                                                      args.experiment_path))
    run_experiment(experiment,
                   learner,
                   args.output_path,
                   args.seed,
                   args.n_samples,
                   args.global_norm,
                   timeout=args.renorm_timeout)
Example #6
def generate_experiment(seed, n_problems, n_train, n_valid, n_reals, n_bools,
                        depth, bias, k, literals, h, ratio, errors):

    logger.info("Generating experiment:\n" +
                "seed: {}\n".format(seed) +
                "n_problems: {}\n".format(n_problems) +
                "n_train: {}\n".format(n_train) +
                "n_valid: {}\n".format(n_valid) +
                "n_reals: {}\n".format(n_reals) +
                "n_bools: {}\n".format(n_bools) +
                "bias: {}\n".format(bias) +
                "k: {}\n".format(k) +
                "literals: {}\n".format(literals) +
                "h: {}\n".format(h) +
                "ratio: {}\n".format(ratio) +
                "errors: {}\n".format(errors))
                
    model_generator = ModelGenerator(n_reals, n_bools, seed,
                                     templ_bools="b{}",
                                     templ_reals="r{}",
                                     initial_bounds=[0, 1])

    problems = []
    while len(problems) < n_problems:
        try:
            # generating the ground truth model
            # not complex enough
            #chi = model_generator.generate_support_tree(depth)
            sample_count = 1000
            chi = support_generator(1, n_bools, n_reals, bias, k, literals, h,
                                    sample_count, ratio, errors, seed)[0]

            w = model_generator.generate_weights_tree(depth, nonnegative=True,
                                                      splits_only=True)

            boolean_vars = list(set(v for v in chi.get_free_variables()
                                    if v.symbol_type() == BOOL).union(
                                            set(model_generator.bools)))
            
            real_vars = list(set(v for v in chi.get_free_variables()
                                    if v.symbol_type() == REAL).union(
                                            set(model_generator.reals)))
            
            bounds = {v.symbol_name() : model_generator.initial_bounds
                      for v in real_vars}

            fbounds = And([And(LE(Real(bounds[var.symbol_name()][0]), var),
                               LE(var, Real(bounds[var.symbol_name()][1])))
                           for var in real_vars])
            model = Model(And(fbounds, chi), w, boolean_vars + real_vars, bounds)

            # use exact inference to normalize the ground truth
            sample_count = None
            normalize(model, seed, sample_count, engine='pa')

            logger.debug("model generator reals: {}".format(model_generator.reals))
            logger.debug("model generator IDs: {}".format(list(map(id, model_generator.reals))))

            logger.debug("model reals: {}".format(model.continuous_vars))
            logger.debug("model IDs: {}".format(list(map(id, model.continuous_vars))))

            # sampling the dataset from the ground truth model
            datasets = {}
            datasets['train'] = sample_dataset(model, n_train)
            datasets['valid'] = sample_dataset(model, n_valid)

        except ModelException as e:
            logger.debug(e.msg)
            continue
        
        logger.debug("Model {}\n".format(len(problems)+1) +
                     "chi: {}\n".format(serialize(model.support)) +
                     "w: {}\n".format(serialize(model.weightfun)))

        problem = Problem(model,
                          datasets,
                          bounds=bounds)

        problems.append(problem)

    # better safe than sorry?
    metadata = {'n_reals' : n_reals, 'n_bools' : n_bools, 'depth' : depth,
                'n_train' : n_train, 'n_valid' : n_valid, 'seed' : seed}
        

    return Experiment(problems, metadata=metadata)
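
# Illustrative invocation (the parameter values below are placeholders, not
# defaults from the original code):
#
#   experiment = generate_experiment(seed=42, n_problems=10, n_train=1000,
#                                    n_valid=200, n_reals=3, n_bools=2,
#                                    depth=4, bias=0.5, k=3, literals=3, h=5,
#                                    ratio=0.5, errors=0)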
Example #7
    parser.add_argument("-t", "--timeout", type=int, help="Timeout")

    parser.add_argument(
        "--negative-bootstrap",
        type=int,
        help="How many negative samples use to bootstrap INCAL+ (def: 0)",
        default=0)

    args = parser.parse_args()
    seed = args.seed
    n_samples = args.n_samples

    use_boolean_knowledge = True
    timeout = args.timeout if args.timeout else None

    logger.info("Running volume computations on {}".format(
        args.experiment_path))

    experiment = Experiment.read(args.experiment_path)

    results = []
    for i, problem in enumerate(experiment.problems):
        results.append([])
        logger.info("====================")
        logger.info("Problem {}".format(i))
        assert (problem.original_path is not None)

        train = problem.datasets['train']
        valid = problem.datasets['valid']
        # debug: keep only the first two validation rows
        valid.data = valid.data[:2]
Example #8
def learn_support(dataset, seed, threshold_mult, timeout=None, bg_knowledge=None,
                  symmetry_breaking=None,
                  negative_bootstrap=None):

    logger.info(f"Running INCAL+. Symmetry breaking = {symmetry_breaking} negative_bootstrap = {negative_bootstrap}")    

    # default might become symmetry_breaking = "mvn"
    if symmetry_breaking is None:
        symmetry_breaking = ""

    if negative_bootstrap is None:
        negative_bootstrap = 0
    else:
        try:
            # an absolute count is specified with an integer
            negative_bootstrap = int(negative_bootstrap)
        except ValueError:
            # a relative count (wrt |D|) is specified with a float
            negative_bootstrap = int(len(dataset) * float(negative_bootstrap))

    # compute bounds and add positive labels to the data
    bounds = {}
    for row in dataset.data:
        for i, feat in enumerate(dataset.features):
            var = feat[0]

            if var.symbol_type() == BOOL:
                continue

            varname = var.symbol_name()
            if varname not in bounds:
                bounds[varname] = [row[i], row[i]]
            else:
                if row[i] < bounds[varname][0]:
                    bounds[varname][0] = row[i]
                elif row[i] > bounds[varname][1]:
                    bounds[varname][1] = row[i]

    data = np.array(dataset.data)
    labels = np.ones(data.shape[0])

    # create a Domain instance
    varnames = []
    vartypes = {}
    for v, _ in dataset.features:
        varnames.append(v.symbol_name())
        vartypes[v.symbol_name()] = v.symbol_type()

    domain = Domain(varnames, vartypes, bounds)
    distance = Distance(domain, Distance.l_inf)

    max_closest = None
    for i1 in range(len(data)):
        min_distance = None
        for i2 in range(0, len(data)):
            if i1 != i2:
                p1, p2 = dataset.data[i1], dataset.data[i2]
                d = distance.between(p1, p2)
                min_distance = d if min_distance is None else min(min_distance, d)
        if min_distance is not None and min_distance < 1:
            max_closest = min_distance if max_closest is None else max(max_closest, min_distance)

    logger.debug("Maximum distance between closest neighbors: {}".format(max_closest))

    threshold = threshold_mult * max_closest
    logger.debug("Overall threshold: {}".format(threshold))

    thresholds = {r: threshold * domain.domain_size(r) for r in domain.real_vars}
    logger.debug("Thresholds per dimension: {}".format(thresholds))

    def learn_inc(_data, _labels, _i, _k, _h):
        strategy = OneClassStrategy(RandomViolationsStrategy(10), thresholds,
                                    background_knowledge=bg_knowledge)
        if negative_bootstrap > 0:
            _data, _labels = OneClassStrategy.add_negatives(domain, _data, _labels, thresholds, negative_bootstrap)

        learner = KCnfSmtLearner(_k, _h, strategy, symmetry_breaking)

        random.seed(seed)        
        initial_indices = LearnOptions.initial_random(20)(list(range(len(_data))))
        res = learner.learn(domain, _data, _labels, initial_indices)
        return res


    # wrapping INCAL+ into a timed process
    def learn_wrap(data, labels, learn_inc, queue):
        res = learn_bottom_up(data, labels, learn_inc, 1, 1, 1, 1, None, None)
        (new_data, new_labels, formula), k, h = res
        msg = "Learned CNF(k={}, h={})"
        logger.debug(msg.format(k, h))
        msg = "Data-set grew from {} to {} entries"
        logger.debug(msg.format(len(labels), len(new_labels)))
        
        queue.put((formula, k, h))

    queue = Queue()
    timed_proc = Process(target=learn_wrap, args=(data, labels, learn_inc, queue))
    timed_proc.start()
    timed_proc.join(timeout)
    if timed_proc.is_alive():
        # timed process didn't complete the job
        timed_proc.terminate()
        timed_proc.join()
        return None
    else:
        # get the learned formula, (k,h)
        chi, k, h = queue.get()
        return chi, k, h, thresholds
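
# Minimal usage sketch (illustrative; `dataset` is assumed to expose .data and
# .features as used above):
#
#   res = learn_support(dataset, seed=42, threshold_mult=1.0, timeout=600,
#                       symmetry_breaking="mvn")
#   if res is not None:
#       chi, k, h, thresholds = res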
                        help="Seed number")

    parser.add_argument("-t", "--timeout", type=int, help="Timeout")

    helpneg = """Use negative bootstrap INCAL+ (def: 1.0),
- an integer inticates absolute number of samples
- a float indicates the ratio wrt the training set size"""
    parser.add_argument("--negative-bootstrap",
                        type=str,
                        help=helpneg,
                        default="1.0")

    args = parser.parse_args()
    timeout = args.timeout if args.timeout else None

    logger.info("Running support learning on {}".format(args.experiment_path))
    logger.info("Timeout: {}".format(timeout))
    logger.info("N. negatives: {}".format(args.negative_bootstrap))

    experiment = Experiment.read(args.experiment_path)

    for i, problem in enumerate(experiment.problems):
        logger.info("====================")
        logger.info("Problem {}".format(i))
        assert (problem.original_path is not None)

        if problem.learned_supports:
            msg = "Found {} supports. Skipping."
            logger.info(msg.format(len(problem.learned_supports)))
            continue