def plot_perm_diffs(samps, actual=None, bka=2, bkb=1, subplt=1, xlabel=None):
    """Plot a histogram of the permutation-sampled differences between two
    books and mark the actual observed difference with a vertical line."""
    t = ('Simulated and actual difference between books {bka} and {bkb}'
         '\nPermutation pvalue: {pv:.3%}; N={N:,.0f}'
         .format(bka=bka, bkb=bkb, pv=ut.pvalue(actual, samps), N=len(samps)))
    plt.subplot(1, 2, subplt, title=t)
    samps.hist(bins=50)
    plt.vlines(actual, *plt.ylim())
    plt.legend(['Actual\ndifference'], loc=2)
    if xlabel is not None:
        plt.xlabel(xlabel)
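# Note: plot_perm_diffs relies on module-level imports of matplotlib.pyplot as
# plt and a helper module ut, neither of which is shown here. The sketch below
# is a hypothetical stand-in for ut.pvalue(actual, samps): one common
# (one-sided) definition of the empirical permutation p-value. The module's
# actual helper may be defined differently.
import numpy as np


def empirical_pvalue(actual, samps):
    """Fraction of permutation-sampled differences at least as extreme as the
    observed difference (assumed one-sided definition)."""
    samps = np.asarray(samps)
    return float((samps >= actual).mean())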
def get_trueFIs(exp_res_filename, eval_res_filename, min_freq, delta,
                pvalue_mode, first_epsilon=1.0):
    """Compute the True Frequent Itemsets using the 'holdout-VC' method with
    the binomial test.

    TODO Add more details."""
    stats = dict()

    with open(exp_res_filename) as FILE:
        size_line = FILE.readline()
        try:
            size_str = size_line.split("(")[1].split(")")[0]
        except IndexError:
            utils.error_exit(
                "Cannot compute size of the explore dataset: '{}' is not in the recognized format\n".format(size_line))
        try:
            stats['exp_size'] = int(size_str)
        except ValueError:
            utils.error_exit(
                "Cannot compute size of the explore dataset: '{}' is not a number\n".format(size_str))

    with open(eval_res_filename) as FILE:
        size_line = FILE.readline()
        try:
            size_str = size_line.split("(")[1].split(")")[0]
        except IndexError:
            utils.error_exit(
                "Cannot compute size of the eval dataset: '{}' is not in the recognized format\n".format(size_line))
        try:
            stats['eval_size'] = int(size_str)
        except ValueError:
            utils.error_exit(
                "Cannot compute size of the eval dataset: '{}' is not a number\n".format(size_str))

    stats['orig_size'] = stats['exp_size'] + stats['eval_size']

    exp_res = utils.create_results(exp_res_filename, min_freq)
    stats['exp_res'] = len(exp_res)
    exp_res_set = set(exp_res.keys())

    eval_res = utils.create_results(eval_res_filename, min_freq)
    stats['eval_res'] = len(eval_res)
    eval_res_set = set(eval_res.keys())

    intersection = exp_res_set & eval_res_set
    stats['holdout_intersection'] = len(intersection)
    stats['holdout_false_negatives'] = len(exp_res_set - eval_res_set)
    stats['holdout_false_positives'] = len(eval_res_set - exp_res_set)
    stats['holdout_jaccard'] = \
        len(intersection) / len(exp_res_set | eval_res_set)

    # One may want to play with giving different values to the different error
    # probabilities, but there isn't really much point in it.
    stats['lowered_delta'] = 1.0 - math.sqrt(1 - delta)

    stats['filter_epsilon'] = first_epsilon

    sys.stderr.write("Computing candidates...")
    sys.stderr.flush()
    freq_bound = min_freq + stats['filter_epsilon']
    exp_res_filtered = set()
    exp_res_filtered_items = set()
    trueFIs = dict()
    for itemset in exp_res:
        if exp_res[itemset] < freq_bound:
            exp_res_filtered.add(itemset)
            exp_res_filtered_items |= itemset
        else:
            # Add itemsets with frequency at least freq_bound to the TFIs
            trueFIs[itemset] = exp_res[itemset]
    sys.stderr.write("done: {} exp_res_filtered ({} items)\n".format(
        len(exp_res_filtered), len(exp_res_filtered_items)))
    sys.stderr.flush()
    stats['tfis_from_exp'] = len(trueFIs)
    stats['exp_res_filtered'] = len(exp_res_filtered)

    supposed_freq = (math.ceil(
        stats['orig_size'] * min_freq) - 1) / stats['orig_size']
    if stats['exp_res_filtered'] > 0:
        eval_res = utils.create_results(eval_res_filename, min_freq)
        eval_res_set = set(eval_res.keys())
        stats['eval_res'] = len(eval_res)
        intersection = exp_res_filtered & eval_res_set
        stats['holdout_intersection'] = len(intersection)
        stats['holdout_false_negatives'] = \
            len(exp_res_filtered - eval_res_set)

        # Bonferroni correction (union bound). We work in the log space.
        stats['critical_value'] = math.log(stats['lowered_delta']) - \
            math.log(stats['exp_res_filtered'])

        # Add TFIs from eval
        last_accepted_freq = 1.0
        last_non_accepted_freq = min_freq
        for itemset in sorted(intersection, key=lambda x: eval_res[x],
                              reverse=True):
            p_value = utils.pvalue(pvalue_mode, eval_res[itemset],
                                   stats['eval_size'], supposed_freq)
            if p_value <= stats['critical_value']:
                trueFIs[itemset] = eval_res[itemset]
                last_accepted_freq = eval_res[itemset]
            else:
                last_non_accepted_freq = eval_res[itemset]
                break

        # Compute epsilon for the binomial
        min_diff = 5e-6  # controls when to stop the binary search
        while last_accepted_freq - last_non_accepted_freq > min_diff:
            mid_point = (last_accepted_freq - last_non_accepted_freq) / 2
            test_freq = last_non_accepted_freq + mid_point
            p_value = utils.pvalue(pvalue_mode, test_freq,
                                   stats['eval_size'], supposed_freq)
            if p_value <= stats['critical_value']:
                last_accepted_freq = test_freq
            else:
                last_non_accepted_freq = test_freq

        stats['epsilon'] = last_non_accepted_freq + \
            ((last_accepted_freq - last_non_accepted_freq) / 2) - min_freq
        stats['removed'] = len(intersection) - len(trueFIs)
    else:  # stats['exp_res_filtered'] == 0
        stats['eval_res'] = 0
        stats['holdout_false_negatives'] = 0
        stats['holdout_intersection'] = 0
        stats['critical_value'] = 0
        stats['epsilon'] = 0
        stats['removed'] = 0

    return (trueFIs, stats)
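# Hypothetical usage sketch for the holdout-VC function above (not part of the
# original module): the result-file names, minimum frequency, delta, and
# p-value mode are illustrative assumptions; 'math', 'sys', and the repo's
# 'utils' module are assumed to be imported at module level, as the code above
# requires.
def _example_holdout_vc_run():
    trueFIs, stats = get_trueFIs('explore.res', 'eval.res', min_freq=0.02,
                                 delta=0.1, pvalue_mode='e')
    sys.stderr.write("epsilon: {}, removed: {}\n".format(stats['epsilon'],
                                                         stats['removed']))
    for itemset, freq in sorted(trueFIs.items(), key=lambda kv: kv[1],
                                reverse=True):
        print("{} ({})".format(" ".join(str(item) for item in itemset), freq))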
def get_trueFIs(ds_stats, res_filename, min_freq, delta, pvalue_mode,
                use_additional_knowledge=False):
    """Compute the True Frequent Itemsets using the Binomial test with a
    Bonferroni correction.

    The p-values for the Binomial tests are computed using the mode specified
    by pvalue_mode: 'c' for Chernoff, 'e' for exact, 'w' for weak Chernoff.
    The parameter 'use_additional_knowledge' can be used to incorporate
    additional knowledge about the data generation process.

    Returns a pair (trueFIs, stats). 'trueFIs' is a dict whose keys are
    itemsets (frozensets) and values are frequencies. This collection of
    itemsets contains only TFIs with probability at least 1 - delta. 'stats'
    is a dict containing various statistics used in computing the collection
    of itemsets."""
    stats = dict()

    sample_res = utils.create_results(res_filename, min_freq)

    # We work in the log-space
    stats['union_bound_factor'] = ds_stats['numitems'] * math.log(2.0)
    if use_additional_knowledge and \
            ds_stats['numitems'] > 2 * ds_stats['maxlen']:
        stats['union_bound_factor'] = \
            utils.get_union_bound_factor(ds_stats['numitems'],
                                         2 * ds_stats['maxlen'])
    # Bonferroni correction (union bound)
    stats['critical_value'] = math.log(delta) - stats['union_bound_factor']
    supposed_freq = (math.ceil(ds_stats['size'] * min_freq) - 1) / \
        ds_stats['size']

    trueFIs = dict()
    last_accepted_freq = 1.0
    last_non_accepted_freq = min_freq
    for itemset in sorted(sample_res.keys(), key=lambda x: sample_res[x],
                          reverse=True):
        p_value = utils.pvalue(pvalue_mode, sample_res[itemset],
                               ds_stats['size'], supposed_freq)
        if p_value <= stats['critical_value']:
            trueFIs[itemset] = sample_res[itemset]
            last_accepted_freq = sample_res[itemset]
        else:
            # Compute epsilon for the binomial
            last_non_accepted_freq = sample_res[itemset]
            break

    min_diff = 1e-5  # controls when to stop the binary search
    while last_accepted_freq - last_non_accepted_freq > min_diff:
        mid_point = (last_accepted_freq - last_non_accepted_freq) / 2
        test_freq = last_non_accepted_freq + mid_point
        p_value = utils.pvalue(pvalue_mode, test_freq, ds_stats['size'],
                               supposed_freq)
        if p_value <= stats['critical_value']:
            last_accepted_freq = test_freq
        else:
            last_non_accepted_freq = test_freq

    stats['epsilon'] = last_non_accepted_freq + \
        ((last_accepted_freq - last_non_accepted_freq) / 2) - min_freq
    stats['removed'] = len(sample_res) - len(trueFIs)

    return (trueFIs, stats)
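# The comparison p_value <= stats['critical_value'] above only makes sense if
# utils.pvalue returns p-values in log space, as the "We work in the
# log-space" comment indicates. The sketch below is an assumed implementation
# of the exact ('e') mode using SciPy; the repository's own utils.pvalue is
# not shown here and may differ.
import math

from scipy.stats import binom


def log_binomial_pvalue_exact(freq, size, supposed_freq):
    """log P[Binomial(size, supposed_freq) >= ceil(freq * size)], i.e. the log
    of the upper-tail p-value for an itemset with sample frequency 'freq'
    under the null hypothesis that its true frequency is at most
    'supposed_freq'."""
    successes = int(math.ceil(freq * size))
    # logsf(k) = log P[X > k], so logsf(successes - 1) = log P[X >= successes]
    return binom.logsf(successes - 1, size, supposed_freq)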
def get_trueFIs(exp_res_filename, eval_res_filename, min_freq, delta,
                pvalue_mode, do_filter=0):
    """Compute the True Frequent Itemsets using the holdout method.

    The holdout method is described in Geoffrey I. Webb, "Discovering
    significant patterns", Machine Learning, Vol. 68, Issue 1, pp. 1-33, 2007.

    The dataset is split in two parts, an exploratory part and an evaluation
    part. Each is mined separately at frequency 'min_freq'. The results are
    contained in 'exp_res_filename' and 'eval_res_filename' respectively.

    The parameter 'do_filter' controls a variant of the algorithm where the
    results from the exploratory part are filtered further.

    The p-values for the Binomial tests are computed using the mode specified
    by pvalue_mode: 'c' for Chernoff, 'e' for exact, or 'w' for weak Chernoff.

    Returns a pair (trueFIs, stats). 'trueFIs' is a dict whose keys are
    itemsets (frozensets) and values are frequencies. This collection of
    itemsets contains only TFIs with probability at least 1 - delta. 'stats'
    is a dict containing various statistics used in computing the collection
    of itemsets."""
    stats = dict()

    with open(exp_res_filename) as FILE:
        size_line = FILE.readline()
        try:
            size_str = size_line.split("(")[1].split(")")[0]
        except IndexError:
            utils.error_exit(
                " ".join(
                    ("Cannot compute size of the explore dataset:",
                     "'{}' is not in a recognized format\n".format(
                         size_line))))
        try:
            stats['exp_size'] = int(size_str)
        except ValueError:
            utils.error_exit(
                " ".join(
                    ("Cannot compute size of the explore dataset:",
                     "'{}' is not a number\n".format(size_str))))

    with open(eval_res_filename) as FILE:
        size_line = FILE.readline()
        try:
            size_str = size_line.split("(")[1].split(")")[0]
        except IndexError:
            utils.error_exit(
                " ".join(
                    ("Cannot compute size of the eval dataset:",
                     "'{}' is not in a recognized format\n".format(
                         size_line))))
        try:
            stats['eval_size'] = int(size_str)
        except ValueError:
            utils.error_exit(
                " ".join(
                    ("Cannot compute size of the eval dataset:",
                     "'{}' is not a number\n".format(size_str))))

    stats['orig_size'] = stats['exp_size'] + stats['eval_size']

    exp_res = utils.create_results(exp_res_filename, min_freq)
    stats['exp_res'] = len(exp_res)

    trueFIs = dict()
    supposed_freq = (math.ceil(
        stats['orig_size'] * min_freq) - 1) / stats['orig_size']
    stats['filter_critical_value'] = 0
    if do_filter > 0:
        stats['lowered_delta'] = 1 - math.sqrt(1 - delta)
        exp_res_filtered = dict()
        stats['filter_critical_value'] = \
            math.log(stats['lowered_delta']) - do_filter
        last_accepted_freq = 1.0
        last_non_accepted_freq = 0.0
        for itemset in exp_res:
            if utils.pvalue(pvalue_mode, exp_res[itemset], stats['exp_size'],
                            supposed_freq) <= stats['filter_critical_value']:
                trueFIs[itemset] = exp_res[itemset]
                if exp_res[itemset] < last_accepted_freq:
                    last_accepted_freq = exp_res[itemset]
            else:
                exp_res_filtered[itemset] = exp_res[itemset]
                if exp_res[itemset] > last_non_accepted_freq:
                    last_non_accepted_freq = exp_res[itemset]
        # Compute epsilon for the binomial
        min_diff = 5e-6  # controls when to stop the binary search
        while last_accepted_freq - last_non_accepted_freq > min_diff:
            mid_point = (last_accepted_freq - last_non_accepted_freq) / 2
            test_freq = last_non_accepted_freq + mid_point
            p_value = utils.pvalue(pvalue_mode, test_freq,
                                   stats['eval_size'], supposed_freq)
            if p_value <= stats['filter_critical_value']:
                last_accepted_freq = test_freq
            else:
                last_non_accepted_freq = test_freq
        stats['filter_epsilon'] = last_non_accepted_freq + \
            ((last_accepted_freq - last_non_accepted_freq) / 2) - min_freq
    else:
        stats['lowered_delta'] = delta
        exp_res_filtered = exp_res
        stats['filter_epsilon'] = 1.0

    exp_res_filtered_set = set(exp_res_filtered.keys())
    stats['exp_res_filtered'] = len(exp_res_filtered_set)
    stats['tfis_from_exp'] = len(trueFIs)
    sys.stderr.write(
        "do_filter: {}, tfis_from_exp: {}, exp_res_filtered: {}\n".format(
            do_filter, stats['tfis_from_exp'], stats['exp_res_filtered']))

    if stats['exp_res_filtered'] > 0:
        eval_res = utils.create_results(eval_res_filename, min_freq)
        eval_res_set = set(eval_res.keys())
        stats['eval_res'] = len(eval_res)
        intersection = exp_res_filtered_set & eval_res_set
        stats['holdout_intersection'] = len(intersection)
        stats['holdout_false_negatives'] = \
            len(exp_res_filtered_set - eval_res_set)

        # Bonferroni correction (union bound). We work in the log space.
        stats['critical_value'] = math.log(stats['lowered_delta']) - \
            math.log(stats['exp_res_filtered'])

        # Add TFIs from eval
        last_accepted_freq = 1.0
        last_non_accepted_freq = min_freq
        for itemset in sorted(intersection, key=lambda x: eval_res[x],
                              reverse=True):
            p_value = utils.pvalue(pvalue_mode, eval_res[itemset],
                                   stats['eval_size'], supposed_freq)
            if p_value <= stats['critical_value']:
                trueFIs[itemset] = eval_res[itemset]
                last_accepted_freq = eval_res[itemset]
            else:
                last_non_accepted_freq = eval_res[itemset]
                break

        # Compute epsilon for the binomial
        min_diff = 5e-6  # controls when to stop the binary search
        while last_accepted_freq - last_non_accepted_freq > min_diff:
            mid_point = (last_accepted_freq - last_non_accepted_freq) / 2
            test_freq = last_non_accepted_freq + mid_point
            p_value = utils.pvalue(pvalue_mode, test_freq,
                                   stats['eval_size'], supposed_freq)
            if p_value <= stats['critical_value']:
                last_accepted_freq = test_freq
            else:
                last_non_accepted_freq = test_freq

        stats['epsilon'] = last_non_accepted_freq + \
            ((last_accepted_freq - last_non_accepted_freq) / 2) - min_freq
        stats['removed'] = len(intersection) - len(trueFIs)
    else:  # stats['exp_res_filtered'] == 0
        stats['eval_res'] = 0
        stats['holdout_false_negatives'] = 0
        stats['holdout_intersection'] = 0
        stats['critical_value'] = 0
        stats['epsilon'] = 0
        stats['removed'] = 0

    return (trueFIs, stats)
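# For the 'c' (Chernoff) p-value mode mentioned in the docstrings, one
# standard choice is the Chernoff/KL-divergence bound on the binomial upper
# tail, log P[sample freq >= freq] <= -size * KL(freq || supposed_freq). The
# sketch below is an assumed reading of that mode, not necessarily what the
# repository's utils.pvalue computes; 'math' is assumed to be imported at
# module level, as in the code above.
def log_chernoff_pvalue(freq, size, supposed_freq):
    """Log of the Chernoff upper bound on the binomial upper-tail p-value."""
    f, p = freq, supposed_freq
    if f <= p:
        return 0.0  # bound is vacuous: log(1)
    if f >= 1.0:
        kl = -math.log(p)  # KL(1 || p) = log(1 / p)
    else:
        kl = (f * math.log(f / p) +
              (1.0 - f) * math.log((1.0 - f) / (1.0 - p)))
    return -size * kl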