Example #1
# `plt` is matplotlib.pyplot; `ut` is a project-level helper module
# (assumed to provide a permutation-test pvalue function).
import matplotlib.pyplot as plt


def plot_perm_diffs(samps, actual=None, bka=2, bkb=1, subplt=1, xlabel=None):
    t = ('Simulated and actual difference between books {bka} and {bkb}'
         '\nPermutation pvalue: {pv:.3%}; N={N:,.0f}'
         .format(bka=bka, bkb=bkb, pv=ut.pvalue(actual, samps), N=len(samps)))
    plt.subplot(1, 2, subplt, title=t)
    samps.hist(bins=50)  # distribution of simulated differences
    plt.vlines(actual, *plt.ylim())  # mark the observed difference
    plt.legend(['Actual\ndifference'], loc=2)
    if xlabel is not None:
        plt.xlabel(xlabel)
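
Below is a minimal usage sketch, not from the source: it fabricates a permutation distribution for two synthetic "books" and substitutes a stand-in for the external ut.pvalue helper.

import types

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Stand-in for the project's `ut` module (assumption: ut.pvalue returns
# the two-sided permutation p-value of `actual` within `samps`).
ut = types.SimpleNamespace(
    pvalue=lambda actual, samps: (samps.abs() >= abs(actual)).mean())

rng = np.random.default_rng(0)
a = rng.normal(0.1, 1.0, 500)  # synthetic per-passage scores for "book" 1
b = rng.normal(0.0, 1.0, 500)  # synthetic per-passage scores for "book" 2
actual = a.mean() - b.mean()
pooled = np.concatenate([a, b])
diffs = []
for _ in range(5000):  # mean differences under random relabeling
    p = rng.permutation(pooled)
    diffs.append(p[:500].mean() - p[500:].mean())
samps = pd.Series(diffs)

plt.figure(figsize=(10, 4))
plot_perm_diffs(samps, actual=actual, bka=1, bkb=2, subplt=1,
                xlabel='Mean difference')
plt.show()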
Example #2
import math
import sys

import utils  # project-level helpers (create_results, pvalue, error_exit)


def get_trueFIs(exp_res_filename, eval_res_filename, min_freq, delta,
                pvalue_mode, first_epsilon=1.0):
    """ Compute the True Frequent Itemsets using the 'holdout-VC' method with
    the binomial test

    TODO Add more details."""

    stats = dict()

    with open(exp_res_filename) as FILE:
        size_line = FILE.readline()
        try:
            size_str = size_line.split("(")[1].split(")")[0]
        except IndexError:
            utils.error_exit(
                "Cannot compute size of the explore dataset: '{}' is not "
                "in the recognized format\n".format(size_line))
        try:
            stats['exp_size'] = int(size_str)
        except ValueError:
            utils.error_exit(
                "Cannot compute size of the explore dataset: '{}' is not "
                "a number\n".format(size_str))

    with open(eval_res_filename) as FILE:
        size_line = FILE.readline()
        try:
            size_str = size_line.split("(")[1].split(")")[0]
        except IndexError:
            utils.error_exit(
                "Cannot compute size of the eval dataset: '{}' is not "
                "in the recognized format\n".format(size_line))
        try:
            stats['eval_size'] = int(size_str)
        except ValueError:
            utils.error_exit(
                "Cannot compute size of the eval dataset: '{}' is not "
                "a number\n".format(size_str))

    stats['orig_size'] = stats['exp_size'] + stats['eval_size']

    exp_res = utils.create_results(exp_res_filename, min_freq)
    stats['exp_res'] = len(exp_res)
    exp_res_set = set(exp_res.keys())
    eval_res = utils.create_results(eval_res_filename, min_freq)
    stats['eval_res'] = len(eval_res)
    eval_res_set = set(eval_res.keys())
    intersection = exp_res_set & eval_res_set
    stats['holdout_intersection'] = len(intersection)
    stats['holdout_false_negatives'] = len(exp_res_set - eval_res_set)
    stats['holdout_false_positives'] = len(eval_res_set - exp_res_set)
    stats['holdout_jaccard'] = len(intersection) / len(exp_res_set
                                                       | eval_res_set)

    # One could use different values for the two error probabilities, but
    # there is little point in doing so.
    stats['lowered_delta'] = 1.0 - math.sqrt(1 - delta)

    stats['filter_epsilon'] = first_epsilon

    sys.stderr.write("Computing candidates...")
    sys.stderr.flush()
    freq_bound = min_freq + stats['filter_epsilon']
    exp_res_filtered = set()
    exp_res_filtered_items = set()
    trueFIs = dict()
    for itemset in exp_res:
        if exp_res[itemset] < freq_bound:
            exp_res_filtered.add(itemset)
            exp_res_filtered_items |= itemset
        else:
            # Add itemsets with frequency at least freq_bound to the TFIs
            trueFIs[itemset] = exp_res[itemset]
    sys.stderr.write("done: {} exp_res_filtered ({} items)\n".format(len(exp_res_filtered),
        len(exp_res_filtered_items)))
    sys.stderr.flush()
    stats['tfis_from_exp'] = len(trueFIs)
    stats['exp_res_filtered'] = len(exp_res_filtered)

    supposed_freq = (math.ceil(stats['orig_size'] * min_freq) -
                     1) / stats['orig_size']
    if stats['exp_res_filtered'] > 0:
        eval_res = utils.create_results(eval_res_filename, min_freq)
        eval_res_set = set(eval_res.keys())
        stats['eval_res'] = len(eval_res)

        intersection = exp_res_filtered & eval_res_set
        stats['holdout_intersection'] = len(intersection)
        stats['holdout_false_negatives'] = len(exp_res_filtered - eval_res_set)

        # Bonferroni correction (Union bound). We work in the log space.
        stats['critical_value'] = math.log(stats['lowered_delta']) - math.log(
            stats['exp_res_filtered'])

        # Add TFIs from eval
        last_accepted_freq = 1.0
        last_non_accepted_freq = min_freq
        for itemset in sorted(intersection,
                              key=lambda x: eval_res[x],
                              reverse=True):
            p_value = utils.pvalue(pvalue_mode, eval_res[itemset],
                                   stats['eval_size'], supposed_freq)
            if p_value <= stats['critical_value']:
                trueFIs[itemset] = eval_res[itemset]
                last_accepted_freq = eval_res[itemset]
            else:
                last_non_accepted_freq = eval_res[itemset]
                break

        # Compute epsilon for the binomial
        min_diff = 5e-6  # controls when to stop the binary search
        while last_accepted_freq - last_non_accepted_freq > min_diff:
            mid_point = (last_accepted_freq - last_non_accepted_freq) / 2
            test_freq = last_non_accepted_freq + mid_point
            p_value = utils.pvalue(pvalue_mode, test_freq, stats['eval_size'],
                                   supposed_freq)
            if p_value <= stats['critical_value']:
                last_accepted_freq = test_freq
            else:
                last_non_accepted_freq = test_freq

        stats['epsilon'] = last_non_accepted_freq + (
            (last_accepted_freq - last_non_accepted_freq) / 2) - min_freq
        stats['removed'] = len(intersection) - len(trueFIs)
    else:  # stats['exp_res_filtered'] == 0
        stats['eval_res'] = 0
        stats['holdout_false_negatives'] = 0
        stats['holdout_intersection'] = 0
        stats['critical_value'] = 0
        stats['epsilon'] = 0
        stats['removed'] = 0

    return (trueFIs, stats)
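
A hypothetical invocation (the filenames and parameter values are invented; the two result files would come from mining the exploratory and evaluation halves of the dataset at min_freq):

trueFIs, stats = get_trueFIs('explore_res.txt', 'eval_res.txt',
                             min_freq=0.02, delta=0.1, pvalue_mode='e')
print('TFIs: {}, epsilon: {:.5f}, removed: {}'.format(
    len(trueFIs), stats['epsilon'], stats['removed']))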
Example #3
import math

import utils  # project helpers: create_results, pvalue, get_union_bound_factor


def get_trueFIs(ds_stats,
                res_filename,
                min_freq,
                delta,
                pvalue_mode,
                use_additional_knowledge=False):
    """ Compute the True Frequent Itemsets using the Binomial test with a
    Bonferroni correction.

    The p-values for the Binomial tests are computed using the mode specified
    by pvalue_mode: 'c' for Chernoff, 'e' for exact, 'w' for weak Chernoff. The
    parameter 'use_additional_knowledge' can be used to incorporate additional
    knowledge about the data generation process.

    Returns a pair (trueFIs, stats).
    'trueFIs' is a dict whose keys are itemsets (frozensets) and values are
    frequencies. This collection of itemsets contains only TFIs with
    probability at least 1 - delta.
    'stats' is a dict containing various statistics used in computing the
    collection of itemsets."""

    stats = dict()

    sample_res = utils.create_results(res_filename, min_freq)

    # We work in the log-space
    stats['union_bound_factor'] = ds_stats['numitems'] * math.log(2.0)
    if use_additional_knowledge and \
            ds_stats['numitems'] > 2 * ds_stats['maxlen']:
        stats['union_bound_factor'] = \
            utils.get_union_bound_factor(ds_stats['numitems'],
                                         2 * ds_stats['maxlen'])

    # Bonferroni correction (Union bound)
    stats['critical_value'] = math.log(delta) - stats['union_bound_factor']
    supposed_freq = (math.ceil(ds_stats['size'] * min_freq) - 1) / \
        ds_stats['size']
    trueFIs = dict()
    last_accepted_freq = 1.0
    last_non_accepted_freq = min_freq
    for itemset in sorted(sample_res.keys(),
                          key=lambda x: sample_res[x],
                          reverse=True):
        p_value = utils.pvalue(pvalue_mode, sample_res[itemset],
                               ds_stats['size'], supposed_freq)
        if p_value <= stats['critical_value']:
            trueFIs[itemset] = sample_res[itemset]
            last_accepted_freq = sample_res[itemset]
        else:
            # Compute epsilon for the binomial
            last_non_accepted_freq = sample_res[itemset]
            break

    min_diff = 1e-5  # controls when to stop the binary search
    while last_accepted_freq - last_non_accepted_freq > min_diff:
        mid_point = (last_accepted_freq - last_non_accepted_freq) / 2
        test_freq = last_non_accepted_freq + mid_point
        p_value = utils.pvalue(pvalue_mode, test_freq, ds_stats['size'],
                               supposed_freq)
        if p_value <= stats['critical_value']:
            last_accepted_freq = test_freq
        else:
            last_non_accepted_freq = test_freq

    stats['epsilon'] = last_non_accepted_freq + \
        ((last_accepted_freq - last_non_accepted_freq) / 2) - min_freq
    stats['removed'] = len(sample_res) - len(trueFIs)

    return (trueFIs, stats)
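
A hypothetical call, with all values invented for illustration; ds_stats must carry the dataset size and, for the tighter union bound, the number of items and the maximum transaction length:

ds_stats = {'size': 100000, 'numitems': 1000, 'maxlen': 30}
trueFIs, stats = get_trueFIs(ds_stats, 'sample_res.txt', min_freq=0.02,
                             delta=0.1, pvalue_mode='e',
                             use_additional_knowledge=True)
print('accepted: {}, epsilon: {:.5f}'.format(len(trueFIs), stats['epsilon']))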
Example #4
import math
import sys

import utils  # project-level helpers (create_results, pvalue, error_exit)


def get_trueFIs(exp_res_filename, eval_res_filename, min_freq, delta,
                pvalue_mode, do_filter=0):
    """ Compute the True Frequent Itemsets using the holdout method.

    The holdout method is described in Geoffrey I. Webb, "Discovering
    significant patterns", Machine Learning, Vol. 68, Issue 1, pp. 1-33,
    2007.

    The dataset is split in two parts, an exploratory part and an evaluation
    part. Each is mined separately at frequency 'min_freq'. The results are
    contained in 'exp_res_filename' and 'eval_res_filename' respectively.
    When 'do_filter' is positive, a variant of the algorithm is used: the
    exploratory results are first screened with a binomial test, itemsets
    that pass are accepted directly, and only the remaining ones are
    evaluated on the holdout.

    The p-values for the Binomial tests are computed using the mode specified
    by pvalue_mode: 'c' for Chernoff, 'e' for exact, or 'w' for weak Chernoff.

    Returns a pair (trueFIs, stats).
    'trueFIs' is a dict whose keys are itemsets (frozensets) and values are
    frequencies. This collection of itemsets contains only TFIs with
    probability at least 1 - delta.
    'stats' is a dict containing various statistics used in computing the
    collection of itemsets."""

    stats = dict()

    with open(exp_res_filename) as FILE:
        size_line = FILE.readline()
        try:
            size_str = size_line.split("(")[1].split(")")[0]
        except IndexError:
            utils.error_exit(
                " ".join(
                    ("Cannot compute size of the explore dataset:",
                        "'{}' is not in a recognized format\n".format(
                            size_line))))
        try:
            stats['exp_size'] = int(size_str)
        except ValueError:
            utils.error_exit(
                " ".join(
                    ("Cannot compute size of the explore dataset:",
                     "'{}' is not a number\n".format(size_str))))

    with open(eval_res_filename) as FILE:
        size_line = FILE.readline()
        try:
            size_str = size_line.split("(")[1].split(")")[0]
        except IndexError:
            utils.error_exit(
                " ".join(
                    ("Cannot compute size of the eval dataset:",
                     "'{}' is not in a recognized format\n".format(
                         size_line))))
        try:
            stats['eval_size'] = int(size_str)
        except ValueError:
            utils.error_exit(
                " ".join(
                    "Cannot compute size of the eval dataset:",
                    "'{}' is not a number\n".format(size_str)))

    stats['orig_size'] = stats['exp_size'] + stats['eval_size']

    exp_res = utils.create_results(exp_res_filename, min_freq)
    stats['exp_res'] = len(exp_res)

    trueFIs = dict()

    supposed_freq = (math.ceil(stats['orig_size'] * min_freq) -
                     1) / stats['orig_size']
    stats['filter_critical_value'] = 0
    if do_filter > 0:
        stats['lowered_delta'] = 1 - math.sqrt(1 - delta)
        exp_res_filtered = dict()
        stats['filter_critical_value'] = \
            math.log(stats['lowered_delta']) - do_filter
        last_accepted_freq = 1.0
        last_non_accepted_freq = 0.0
        for itemset in exp_res:
            if utils.pvalue(pvalue_mode, exp_res[itemset], stats['exp_size'],
                            supposed_freq) <= stats['filter_critical_value']:
                trueFIs[itemset] = exp_res[itemset]
                if exp_res[itemset] < last_accepted_freq:
                    last_accepted_freq = exp_res[itemset]
            else:
                exp_res_filtered[itemset] = exp_res[itemset]
                if exp_res[itemset] > last_non_accepted_freq:
                    last_non_accepted_freq = exp_res[itemset]
        # Compute epsilon for the binomial
        min_diff = 5e-6  # controls when to stop the binary search
        while last_accepted_freq - last_non_accepted_freq > min_diff:
            mid_point = (last_accepted_freq - last_non_accepted_freq) / 2
            test_freq = last_non_accepted_freq + mid_point
            # The filter's p-values are computed on the exploratory part,
            # so its boundary is refined with exp_size as well.
            p_value = utils.pvalue(pvalue_mode, test_freq,
                                   stats['exp_size'], supposed_freq)
            if p_value <= stats['filter_critical_value']:
                last_accepted_freq = test_freq
            else:
                last_non_accepted_freq = test_freq
        stats['filter_epsilon'] = last_non_accepted_freq + \
            ((last_accepted_freq - last_non_accepted_freq) / 2) - min_freq
    else:
        stats['lowered_delta'] = delta
        exp_res_filtered = exp_res
        stats['filter_epsilon'] = 1.0
    exp_res_filtered_set = set(exp_res_filtered.keys())
    stats['exp_res_filtered'] = len(exp_res_filtered_set)
    stats['tfis_from_exp'] = len(trueFIs)
    sys.stderr.write("do_filter: {}, tfis_from_exp: {}, exp_res_filtered: {}\n".format(do_filter, stats['tfis_from_exp'], stats['exp_res_filtered']))

    if stats['exp_res_filtered'] > 0:
        eval_res = utils.create_results(eval_res_filename, min_freq)
        eval_res_set = set(eval_res.keys())
        stats['eval_res'] = len(eval_res)

        intersection = exp_res_filtered_set & eval_res_set
        stats['holdout_intersection'] = len(intersection)
        stats['holdout_false_negatives'] = \
            len(exp_res_filtered_set - eval_res_set)

        # Bonferroni correction (Union bound). We work in the log space.
        stats['critical_value'] = math.log(stats['lowered_delta']) - math.log(
            stats['exp_res_filtered'])

        # Add TFIs from eval
        last_accepted_freq = 1.0
        last_non_accepted_freq = min_freq
        for itemset in sorted(intersection,
                              key=lambda x: eval_res[x],
                              reverse=True):
            p_value = utils.pvalue(pvalue_mode, eval_res[itemset],
                                   stats['eval_size'], supposed_freq)
            if p_value <= stats['critical_value']:
                trueFIs[itemset] = eval_res[itemset]
                last_accepted_freq = eval_res[itemset]
            else:
                last_non_accepted_freq = eval_res[itemset]
                break

        # Compute epsilon for the binomial
        min_diff = 5e-6  # controls when to stop the binary search
        while last_accepted_freq - last_non_accepted_freq > min_diff:
            mid_point = (last_accepted_freq - last_non_accepted_freq) / 2
            test_freq = last_non_accepted_freq + mid_point
            p_value = utils.pvalue(pvalue_mode, test_freq,
                                   stats['eval_size'], supposed_freq)
            if p_value <= stats['critical_value']:
                last_accepted_freq = test_freq
            else:
                last_non_accepted_freq = test_freq

        stats['epsilon'] = last_non_accepted_freq + (
            (last_accepted_freq - last_non_accepted_freq) / 2) - min_freq
        stats['removed'] = len(intersection) - len(trueFIs)
    else:  # stats['exp_res_filtered'] == 0
        stats['eval_res'] = 0
        stats['holdout_false_negatives'] = 0
        stats['holdout_intersection'] = 0
        stats['critical_value'] = 0
        stats['epsilon'] = 0
        stats['removed'] = 0

    return (trueFIs, stats)
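
All of the examples above lean on the same two building blocks: a binomial p-value computed in log space and a bisection that turns the accept/reject boundary into an epsilon. The sketch below illustrates both; it assumes SciPy and is only a plausible stand-in for utils.pvalue, whose implementation is not shown in the source.

import math

from scipy.stats import binom


def log_binom_pvalue(freq, size, supposed_freq):
    # log P(X >= freq * size) under Binomial(size, supposed_freq); a
    # plausible stand-in for utils.pvalue in 'exact' ('e') mode.
    k = math.ceil(freq * size)
    return binom.logsf(k - 1, size, supposed_freq)


def epsilon_boundary(lo, hi, size, supposed_freq, crit, min_diff=5e-6):
    # Bisect [lo, hi] to find the frequency where the log p-value crosses
    # the (log) critical value, exactly as the while loops above do.
    while hi - lo > min_diff:
        mid = lo + (hi - lo) / 2
        if log_binom_pvalue(mid, size, supposed_freq) <= crit:
            hi = mid  # still significant: the boundary lies below mid
        else:
            lo = mid  # not significant: the boundary lies above mid
    return lo + (hi - lo) / 2

# The 'epsilon' the functions above report corresponds to
# epsilon_boundary(min_freq, 1.0, size, supposed_freq, crit) - min_freq.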