Example #1
    def dump(self, experiment_path):

        if exists(experiment_path):
            logger.warning("File exists: {}".format(experiment_path))

        experiment_name = basename(experiment_path)
        folder = abspath(dirname(experiment_path))

        problem_paths = []
        for i, prob in enumerate(self.problems):
            problem_filename = Experiment.PROBLEM_TEMPL.format(
                experiment_name, i)
            problem_path = join(folder, problem_filename)
            prob.dump(problem_path)
            problem_paths.append(relpath(problem_path, folder))

        index = {'problem_paths': problem_paths}

        if self.metadata is not None:
            index['metadata'] = self.metadata

        with open(experiment_path, 'wb') as f:
            pickle.dump(index, f)

        return experiment_path
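The index file stores each problem's location relative to its own folder, so the whole experiment directory can be moved or renamed without breaking links. A minimal self-contained sketch of that pattern (dump_index and payload_paths are hypothetical names):

import pickle
from os.path import abspath, dirname, relpath

def dump_index(index_path, payload_paths):
    # Store paths relative to the index's folder so the directory
    # stays relocatable as a unit.
    folder = abspath(dirname(index_path))
    index = {'problem_paths': [relpath(p, folder) for p in payload_paths]}
    with open(index_path, 'wb') as f:
        pickle.dump(index, f)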
Example #2
    def read(path):
        with open(path, 'rb') as f:
            index = pickle.load(f)

        folder = abspath(dirname(path))
        model = Model.read(join(folder, index['model_path']))

        datasets = {}
        for dataset_name, rel_path in index['dataset_paths'].items():
            datasets[dataset_name] = Dataset.read(join(folder, rel_path))

        bounds = index.get('bounds')
        learned_supports = []
        metadata = None

        if 'support_paths' in index:
            for support_path in index['support_paths']:
                try:
                    learned_supports.append(
                        read_smtlib(join(folder, support_path)))
                except FileNotFoundError:
                    logger.warning("Couldn't read: {}".format(support_path))

        if 'metadata' in index:
            metadata = index['metadata']

        problem = Problem(model,
                          datasets,
                          bounds=bounds,
                          learned_supports=learned_supports,
                          metadata=metadata)

        problem.original_path = path
        return problem
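read is the inverse of dump: every stored relative path is resolved against the folder containing the index. A stripped-down sketch of just that step, assuming only the pickled index layout used above:

import pickle
from os.path import abspath, dirname, join

def resolve_dataset_paths(index_path):
    # Load the pickled index and turn its relative entries back into
    # absolute paths anchored at the index's own folder.
    with open(index_path, 'rb') as f:
        index = pickle.load(f)
    folder = abspath(dirname(index_path))
    return {name: join(folder, rel)
            for name, rel in index['dataset_paths'].items()}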
Example #3
    def dump(self, dataset_path):

        if exists(dataset_path):
            logger.warning("File exists: {}".format(dataset_path))

        dataset_name = basename(dataset_path)

        feats_filename = Dataset.FEATS_TEMPL.format(dataset_name)
        data_filename = Dataset.DATA_TEMPL.format(dataset_name)
        constr_filename = Dataset.CONSTRAINTS_TEMPL.format(dataset_name)

        folder = abspath(dirname(dataset_path))

        feats_path = join(folder, feats_filename)
        data_path = join(folder, data_filename)
        constr_path = join(folder, constr_filename)

        self.dump_feats(feats_path)
        self.dump_data(data_path)

        index = {
            'feats_path': relpath(feats_path, folder),
            'data_path': relpath(data_path, folder)
        }

        if self.constraints is not None:
            write_smtlib(self.constraints, constr_path)
            index['constr_path'] = relpath(constr_path, folder)

        with open(dataset_path, 'wb') as f:
            pickle.dump(index, f)
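The three sibling files are named by formatting class-level templates with the dataset name. The template strings are not shown in this snippet; the values below are hypothetical placeholders that illustrate the mechanism:

# Hypothetical template values; the real constants live on Dataset.
FEATS_TEMPL = "{}.feats"
DATA_TEMPL = "{}.data"
CONSTRAINTS_TEMPL = "{}.constraints"

dataset_name = "toy"
print(FEATS_TEMPL.format(dataset_name))   # toy.feats
print(DATA_TEMPL.format(dataset_name))    # toy.data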
Example #4
    def dump_data(self, data_path):
        if exists(data_path):
            logger.warning("File exists: {}".format(data_path))

        with open(data_path, 'w') as f:
            for row in self.data:
                str_row = ",".join(map(str, row))\
                            .replace("True","1")\
                            .replace("False","0") + "\n"
                f.write(str_row)
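For numeric and boolean rows this produces one comma-separated line per sample, with True/False encoded as 1/0. A self-contained sketch of the same serialization:

rows = [[1.5, True, 3], [0.25, False, 7]]
with open("example.data", "w") as f:
    for row in rows:
        # str() renders booleans as "True"/"False"; the replace calls map
        # them to 1/0 (safe as long as no field contains that text).
        f.write(",".join(map(str, row))
                   .replace("True", "1").replace("False", "0") + "\n")
# example.data:
# 1.5,1,3
# 0.25,0,7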
Example #5
def run_experiment(experiment,
                   learner,
                   output_path,
                   seed,
                   n_samples,
                   global_norm,
                   timeout=None,
                   discard_missing=True):

    if timeout is None:
        timeout = DEF_TIMEOUT

    results = []
    for i, problem in enumerate(experiment.problems):

        learned_models, evaluation = run_problem(problem, learner, seed,
                                                 n_samples, timeout,
                                                 global_norm)

        missing_data = (evaluation['gt-renorm'] == {}
                        or evaluation['None'] == {}
                        or evaluation['best'] is None)

        if discard_missing and missing_data:
            continue

        if learned_models is not None:
            for j, (t_mult, learned_model) in enumerate(learned_models):
                output_name = basename(output_path)
                folder = abspath(dirname(output_path))
                model_name = output_name + "_{}_learned_{}".format(i, j)
                model_path = join(folder, model_name)
                learned_model.dump(model_path)
                evaluation[t_mult]['model_path'] = model_path

        results.append(evaluation)

    n_dis = len(experiment.problems) - len(results)
    n_tot = len(experiment.problems)
    logger.info("Experiment done. Discarded {}/{}.".format(n_dis, n_tot))

    if len(results) > 0:
        with open(output_path, 'wb') as f:
            pickle.dump(results, f)
    else:
        logger.warning("Nothing to dump!")
Example #6
    def dump_feats(self, feats_path):
        if exists(feats_path):
            logger.warning("File exists: {}".format(feats_path))

        with open(feats_path, 'w') as f:
            for feature, str_type in self.features:
                # TODO?: MSPN datasets have a list of continuous values
                # that is not written here
                if feature.symbol_type() == REAL:
                    assert (str_type in ['continuous', 'discrete'])
                    f.write("{}:{}.\n".format(feature.symbol_name(), str_type))
                else:
                    assert (str_type == 'categorical')
                    f.write("{}:categorical:0,1.\n".format(
                        feature.symbol_name()))
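Each feature becomes one line of the features file: name:continuous. or name:discrete. for REAL variables, and name:categorical:0,1. otherwise. A hypothetical three-feature dataset would serialize as:

x:continuous.
n:discrete.
b:categorical:0,1.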
Example #7
def support_generator(how_many, b_count, r_count, bias, k, lits, h, sample_count,
                      ratio_percent, error_percent, seed):
    prefix = "random_support"
    ratio, errors = ratio_percent / 100, error_percent / 100
    producer = Generator(b_count, r_count, bias, k, lits, h, sample_count, ratio,
                         seed, prefix)
    supports = []
    while len(supports) < how_many:
        try:
            chi = producer.generate_formula().support
        except RuntimeError:
            logger.warning("Runtime error while sampling the support")
            continue

        supports.append(chi)

    return supports
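The loop keeps sampling until how_many supports have been collected, silently discarding draws that raise RuntimeError. The retry pattern in isolation (collect and sample are hypothetical stand-ins for the generator machinery above):

def collect(how_many, sample):
    out = []
    while len(out) < how_many:
        try:
            out.append(sample())
        except RuntimeError:
            # discard the failed draw and keep sampling
            continue
    return out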
Example #8
    def dump(self, model_path):

        if exists(model_path):
            logger.warning("File exists: {}".format(model_path))

        model_name = basename(model_path)

        support_filename = Model.SUPPORT_TEMPL.format(model_name)
        weightf_filename = Model.WEIGHTF_TEMPL.format(model_name)

        folder = abspath(dirname(model_path))

        support_path = join(folder, support_filename)
        weightf_path = join(folder, weightf_filename)

        paths = [support_path, weightf_path]
        if any(exists(f) for f in paths):
            logger.warning("File(s) exist:\n" + "\n".join(paths))

        write_smtlib(self.support, support_path)
        write_smtlib(self.weightfun, weightf_path)

        varlist = [(v.symbol_name(), v.symbol_type()) for v in self.get_vars()]

        index = {
            'support_path': relpath(support_path, folder),
            'weightf_path': relpath(weightf_path, folder),
            'variables': varlist,
            'bounds': self.bounds
        }

        if self.metadata is not None:
            index['metadata'] = self.metadata

        with open(model_path, 'wb') as f:
            pickle.dump(index, f)
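A matching loader only needs pickle plus pysmt's read_smtlib. This is a hedged sketch based on the index layout written above, not the library's actual Model.read:

import pickle
from os.path import abspath, dirname, join
from pysmt.shortcuts import read_smtlib

def load_model_parts(model_path):
    # Mirror of dump(): unpickle the index, then resolve and parse the
    # SMT-LIB support and weight function stored next to it.
    with open(model_path, 'rb') as f:
        index = pickle.load(f)
    folder = abspath(dirname(model_path))
    support = read_smtlib(join(folder, index['support_path']))
    weightfun = read_smtlib(join(folder, index['weightf_path']))
    return support, weightfun, index['variables'], index['bounds']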
Example #9
def run_problem(problem,
                learner,
                seed,
                n_samples,
                timeout,
                global_norm,
                use_lariat=True):

    ground_truth = problem.model
    evaluation = dict()

    train = problem.datasets['train']
    valid = problem.datasets['valid']

    train_valid = Dataset(train.features, train.data + valid.data,
                          train.constraints)

    if problem.learned_supports is not None:
        prior_supports = {
            problem.metadata['supports_metadata'][i]['support_threshold_mult']: chi
            for i, chi in enumerate(problem.learned_supports)
        }
    else:
        logger.warning("Couldn't find any learned support.")
        prior_supports = dict()

    prior_supports['None'] = None
    prior_supports['gt-renorm'] = ground_truth.support

    t_0 = time()
    learner.estimate_density(train, validation_data=valid)
    t_f = time() - t_0
    logger.info("training time: {}".format(t_f))
    evaluation['training_time'] = t_f

    learned_models = []
    cached_models = dict()
    max_ll = None
    best = None

    logger.info("Evaluating:\n {}".format("\n".join(
        map(str, prior_supports.keys()))))

    for t_mult, prior_support in prior_supports.items():

        if t_mult != 'None' and not use_lariat:
            continue

        evaluation[t_mult] = dict()
        ps_str = t_mult if isinstance(t_mult, str) else serialize(prior_support)

        if ps_str in cached_models:
            learned_model, evaluation[t_mult] = cached_models[ps_str]
        else:
            try:
                logger.info(
                    "--------------------------------------------------")
                logger.info("Support: {}".format(t_mult))

                mode = RENORM_FULL if prior_support is not None else RENORM_OFF
                t_0 = time()
                learned_model, renormd = learner.renormalize(
                    train,
                    seed,
                    mode=mode,
                    support=prior_support,
                    timeout=timeout,
                    global_norm=global_norm)
                t_f = time() - t_0
                if not renormd and prior_support is not None:
                    continue

                evaluation[t_mult]['renorm_time'] = t_f

            except CalledProcessError as e:
                logger.warning("XADD error: {}".format(e))
                continue

            except ModelException as e:
                logger.warning("Model error: {}".format(e))
                continue

            logger.debug("Computing approx-IAE")
            iae = approx_IAE(learned_model, ground_truth, seed, n_samples)
            evaluation[t_mult]['approx-iae'] = iae

            logger.debug("Computing train-LL")
            train_ll, train_out = learned_model.log_likelihood(train)
            evaluation[t_mult]['train-ll'] = train_ll
            evaluation[t_mult]['train-out'] = train_out
            logger.debug("Computing valid-LL")
            valid_ll, valid_out = learned_model.log_likelihood(valid)
            evaluation[t_mult]['valid-ll'] = valid_ll
            evaluation[t_mult]['valid-out'] = valid_out
            train_valid_ll, train_valid_out = learned_model.log_likelihood(
                train_valid)
            evaluation[t_mult]['train-valid-ll'] = train_valid_ll
            evaluation[t_mult]['train-valid-out'] = train_valid_out

            if t_mult not in ['None','gt-renorm'] \
               and (max_ll is None or valid_ll > max_ll):
                max_ll = valid_ll
                best = t_mult

            logger.debug("Computing volume difference")
            poly1 = Model(learned_model.support, None, ground_truth.get_vars(),
                          ground_truth.bounds)
            poly2 = Model(ground_truth.support, None, ground_truth.get_vars(),
                          ground_truth.bounds)
            vol_diff = ISE(poly1, poly2, seed, n_samples, engine='rej')

            evaluation[t_mult]['vol-diff'] = vol_diff

            cached_models[ps_str] = (learned_model, evaluation[t_mult])

            domain = Domain.make(
                map(lambda v: v.symbol_name(), ground_truth.boolean_vars),
                learned_model.bounds)
            eval_falses = evaluate(domain, learned_model.support,
                                   np.asarray(train.data))

        learned_models.append((t_mult, learned_model))

    evaluation['best'] = best

    tmuls = sorted([
        key for key in evaluation
        if key not in ['None', 'gt-renorm', 'training_time', 'best']
    ])

    eval_msg = """RESULTS:
Training time: {}
No renorm: {}
GT renorm: {}
Best chi : {}

All chis:
{}
""".format(evaluation['training_time'], evaluation['None'],
           evaluation['gt-renorm'], (best, evaluation.get(best)),
           "\n".join([str((tmul, evaluation[tmul])) for tmul in tmuls]))

    logger.info(eval_msg)

    return learned_models, evaluation
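The best support is the learned one (neither 'None' nor 'gt-renorm') with the highest validation log-likelihood. The same selection as a standalone helper over the evaluation dict (pick_best is a hypothetical name):

def pick_best(evaluation):
    # Only learned supports compete; bookkeeping keys are skipped.
    candidates = {k: v for k, v in evaluation.items()
                  if k not in ('None', 'gt-renorm', 'training_time', 'best')
                  and 'valid-ll' in v}
    return max(candidates, key=lambda k: candidates[k]['valid-ll'],
               default=None)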
Example #10
    mspn_parser.add_argument("--alpha", type=float, help="alpha?")
    mspn_parser.add_argument("--prior-weight",
                             type=float,
                             help="prior weight?")
    mspn_parser.add_argument("--leaf",
                             choices=['piecewise', 'isotonic'],
                             help="leaf?")
    mspn_parser.add_argument("--row-split",
                             choices=['rdc-kmeans', 'gower'],
                             help="row split?")

    args = parser.parse_args()

    # better check this first
    if exists(args.output_path):
        logger.warning("File exists: {}".format(args.output_path))

    experiment = Experiment.read(args.experiment_path)

    learner_args = {}

    if args.seed is not None:
        learner_args['seed'] = args.seed

    if args.action == 'det':

        if args.n_min is not None:
            learner_args['n_min'] = args.n_min
        if args.n_max is not None:
            learner_args['n_max'] = args.n_max
        if args.n_bins is not None:
Example #11
    def renormalize(self,
                    training_data,
                    seed,
                    mode=RENORM_OFF,
                    support=None,
                    timeout=None,
                    global_norm=False):

        if timeout is None:
            timeout = DEF_RENORM_TIMEOUT

        detcopy = self.det.copy()

        model_support = detcopy.tree_to_WMI_support()
        model_weight = detcopy.tree_to_WMI_weightfun()

        bounds = {
            v.symbol_name(): b
            for v, b in detcopy.root.bounds.items() if v.symbol_type() == REAL
        }

        renorm_support = None
        if mode == RENORM_BG_ONLY and training_data.constraints is not None:
            renorm_support = training_data.constraints
        elif mode == RENORM_FULL:
            if training_data.constraints is not None and support is not None:
                renorm_support = training_data.constraints & support
            elif training_data.constraints is not None:
                renorm_support = training_data.constraints
            elif support is not None:
                renorm_support = support

        renormalized = False
        if renorm_support is not None:

            if global_norm:
                logger.debug("Global renormalization")
                model_support = model_support & renorm_support
                renormalized = True
            else:
                logger.debug("Local renormalization")

                def renorm_wrap(inst, support, support_path, weight_path):
                    try:
                        inst.renormalize(support)
                        support = inst.tree_to_WMI_support()
                        weight = inst.tree_to_WMI_weightfun()
                        msg = "Writing result to files:\n{}\n{}"
                        logger.debug(msg.format(support_path, weight_path))
                        write_smtlib(support, support_path)
                        write_smtlib(weight, weight_path)
                        logger.debug("Done.")

                    except ModelException as e:
                        logger.error(
                            "Couldn't renormalize the DET: {}".format(e))

                # communication with wrapper process through file
                # NEVER use multiprocessing.Queue with huge pysmt formulas
                rndstr = ''.join(choice(TMP_CHARS) for _ in range(TMP_LEN))
                support_path = "{}.support".format(rndstr)
                weight_path = "{}.weight".format(rndstr)
                timed_proc = Process(target=renorm_wrap,
                                     args=(detcopy, renorm_support,
                                           support_path, weight_path))

                logger.debug(
                    "Starting renormalization with timeout: {}".format(
                        timeout))
                timed_proc.start()
                logger.debug("Timed proc started")
                timed_proc.join(timeout)
                logger.debug("Timed proc joined")

                if timed_proc.is_alive():
                    logger.warning("Renormalization timed out")
                    pid = timed_proc.pid
                    logger.warning(
                        "Killing process {} and its children".format(pid))
                    kill_recursive(pid)

                else:
                    try:
                        model_support = read_smtlib(support_path)
                        remove(support_path)
                    except FileNotFoundError:
                        model_support = None
                    try:
                        model_weight = read_smtlib(weight_path)
                        remove(weight_path)
                    except FileNotFoundError:
                        model_weight = None

                    if model_support is None or model_weight is None:
                        raise ModelException("Couldn't renormalize the DET")

                    logger.debug("Renormalization done")
                    renormalized = True

        model = Model(model_support,
                      model_weight,
                      list(map(lambda x: x[0], training_data.features)),
                      bounds,
                      metadata=self.learner_args)

        # is Z = 1?
        if renormalized:
            check_Z_normalize(model, seed, TEST_AND_NORM_SAMPLES)

        elif not global_norm:
            # fallback strategy for local: to global
            model, renormalized = self.renormalize(training_data,
                                                   seed,
                                                   mode=mode,
                                                   support=support,
                                                   timeout=timeout,
                                                   global_norm=True)

        return model, renormalized
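Local renormalization runs in a child process and exchanges results through temporary files; as the comment above warns, multiprocessing.Queue should be avoided for huge pysmt formulas. The timeout skeleton in isolation (run_with_timeout and work are hypothetical names):

from multiprocessing import Process

def run_with_timeout(work, args, timeout):
    # Join with a deadline; on timeout the child is still alive and must
    # be killed (the code above kills its whole process tree instead).
    proc = Process(target=work, args=args)
    proc.start()
    proc.join(timeout)
    if proc.is_alive():
        proc.terminate()
        proc.join()
        return False
    return True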
Example #12
    def renormalize(self,
                    training_data,
                    seed,
                    mode=RENORM_OFF,
                    support=None,
                    timeout=None,
                    global_norm=True):

        if timeout is None:
            timeout = DEF_RENORM_TIMEOUT

        feature_dict = {
            var.symbol_name(): var
            for var, _ in training_data.features
        }

        model_weightfun, model_support = SPN_to_WMI(self.spn.root,
                                                    feature_dict)

        bounds = {}
        for i, feat in enumerate(training_data.features):
            var = feat[0]
            if var.symbol_type() == REAL:
                xi = list(map(lambda row: row[i], training_data.data))
                bounds[var.symbol_name()] = [min(xi), max(xi)]

        renorm_support = None
        if mode == RENORM_BG_ONLY and training_data.constraints is not None:
            renorm_support = training_data.constraints
        elif mode == RENORM_FULL:
            if training_data.constraints is not None and support is not None:
                renorm_support = training_data.constraints & support
            elif training_data.constraints is not None:
                renorm_support = training_data.constraints
            elif support is not None:
                renorm_support = support

        renormalized = False
        if renorm_support is not None:
            if global_norm:
                logger.debug("Global renormalization")
                model_support = model_support & renorm_support
                renormalized = True

            else:
                logger.debug("Local renormalization")
                domain = Domain.make([
                    v.symbol_name() for v, _ in training_data.features
                    if v.symbol_type() == BOOL
                ], bounds)

                nc_model_support = normalize_formula(model_support)
                nc_model_weightfun = normalize_formula(model_weightfun)
                nc_renorm_support = normalize_formula(renorm_support)

                t_0 = time()
                xaddsolver = XaddEngine(domain,
                                        nc_model_support,
                                        nc_model_weightfun,
                                        mode="original",
                                        timeout=timeout)

                t_init = time() - t_0
                logger.debug("XADDEngine t_init: {}".format(t_init))
                t_1 = time()
                try:
                    res = xaddsolver.normalize(renorm_support)
                    t_norm = time() - t_1
                except CalledProcessError as e:
                    raise ModelException("CalledProcessError: {}".format(e))

                if res is None:
                    logger.warning("Timeout.")
                else:
                    logger.debug("XADDEngine t_norm: {}".format(t_norm))
                    model_weightfun = get_env().formula_manager.normalize(res)
                    model_support = get_env().formula_manager.normalize(
                        And(model_support, renorm_support))
                    renormalized = True

        model = Model(model_support,
                      model_weightfun,
                      list(map(lambda x: x[0], training_data.features)),
                      bounds,
                      metadata=self.learner_args)

        if renormalized:
            check_Z_normalize(model, seed, TEST_AND_NORM_SAMPLES)

        elif not global_norm:
            # fallback strategy for local: to global
            model, renormalized = self.renormalize(training_data,
                                                   seed,
                                                   mode=mode,
                                                   support=support,
                                                   timeout=timeout,
                                                   global_norm=True)

        return model, renormalized
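The bounds here come straight from the training data: one [min, max] interval per REAL feature column. The same computation, self-contained (the variable names are hypothetical):

data = [[0.1, 5.0], [0.7, 2.5], [0.4, 9.0]]
names = ['x', 'y']                      # hypothetical REAL variables
bounds = {name: [min(col), max(col)]
          for name, col in zip(names, zip(*data))}
# {'x': [0.1, 0.7], 'y': [2.5, 9.0]}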
Example #13
from string import ascii_uppercase, digits
from subprocess import CalledProcessError
from time import time

import psutil

from wmilearn import logger
from wmilearn.exceptions import ModelException
from wmilearn.model import Model
from wmilearn.det import DET
from wmilearn.utils import check_Z_normalize

MSPN_import_err = None
try:
    from tfspn.SPN import SPN, Splitting
    from wmilearn.conversions import SPN_to_WMI

except ImportError as e:
    logger.warning("Couldn't import the MSPN library: " + str(e))
    MSPN_import_err = e


def kill_recursive(pid):
    proc = psutil.Process(pid)
    for subproc in proc.children(recursive=True):
        try:
            subproc.kill()
        except psutil.NoSuchProcess:
            continue
    try:
        proc.kill()
    except psutil.NoSuchProcess:
        pass
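kill_recursive pairs naturally with the timed join used in Example #11: if a worker misses its deadline, the whole process tree it spawned is cleaned up together. A hedged usage sketch (long_job is a hypothetical worker function):

from multiprocessing import Process

proc = Process(target=long_job)   # long_job is hypothetical
proc.start()
proc.join(60)                     # wait at most 60 seconds
if proc.is_alive():
    kill_recursive(proc.pid)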
            chi, k, h, thresholds, threshold_mult = res
            learned_supports.append(chi)

            metadata = dict()
            metadata['support_k'] = k
            metadata['support_h'] = h
            metadata['support_seed'] = args.seed
            metadata['support_thresholds'] = thresholds
            metadata['support_threshold_mult'] = threshold_mult

            supports_metadata.append(metadata)

        if len(learned_supports) == 0:
            # try projecting on the continuous subspace
            logger.warning(
                "No support learned on the full space. Projecting..")
            numerical_vars = [
                v for v, s in train_valid.features
                if s in ["continuous", "discrete"]
            ]
            projected_train_valid = train_valid.project(numerical_vars)

            for res in learn_supports_adaptive(
                    projected_train_valid,
                    args.seed,
                    timeout=timeout,
                    bg_knowledge=train.constraints,
                    negative_bootstrap=args.negative_bootstrap):

                chi, k, h, thresholds, threshold_mult = res
                learned_supports.append(chi)