def filter_items(self, M):
    singletons = {}
    pairs = {}
    triples = {}
    G = {}  # TODO: look at this graph. Does it have long adjacency lists?

    # build data structures
    for itemset, (support,) in M:
        l = len(itemset)
        if l == 1:
            singletons[itemset[0]] = support
        elif l == 2:
            sorted_pair = itemset if itemset[0] < itemset[1] else (itemset[1], itemset[0])
            pairs[sorted_pair] = support
            self.__add_to_graph(sorted_pair, G)
        elif l == 3:
            triples[utils.triple_sort(itemset)] = support
        else:
            assert False, "frequent itemsets larger than 3 are not used"

    # graph_stats(G)
    # print 'singletons found: {}'.format(len(singletons))
    # print 'pairs found: {}'.format(len(pairs))
    # print 'triples found: {}'.format(len(triples))

    res = []
    # find triangles in the graph
    for n1 in G.keys():
        # assert n1 in singletons, ("Pair cannot have infrequent singleton item", n1, singletons)
        for n2 in G[n1]:
            # assert n2 in singletons, ("Pair cannot have infrequent singleton item", n2, singletons)
            # n2 does not necessarily have an adjacency list
            if n2 not in G:
                continue
            for n3 in G[n2]:
                # assert n3 in singletons, ("Pair cannot have infrequent singleton item", n3, singletons)
                if n3 in G[n1]:
                    # triangle
                    s1 = (n1, singletons[n1])
                    s2 = (n2, singletons[n2])
                    s3 = (n3, singletons[n3])
                    s12 = ((n1, n2), pairs[(n1, n2)])
                    s23 = ((n2, n3), pairs[(n2, n3)])
                    s13 = ((n1, n3), pairs[(n1, n3)])
                    c = triples.get((n1, n2, n3), 0)
                    s123 = ((n1, n2, n3), c)
                    res.append((s1, s2, s3, s12, s23, s13, s123))

    # print 'Triangles found: {}'.format(len(res))
    return res, triples
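
# Illustration (not part of this module): the enumeration above assumes G maps
# the smaller item of each sorted pair to its larger neighbours, so every
# undirected edge is stored exactly once and each triangle is reported once.
# A toy run of the same loop structure:
#
#     G = {'a': ['b', 'c'], 'b': ['c']}   # edges a-b, a-c, b-c
#     triangles = []
#     for n1 in G:
#         for n2 in G[n1]:
#             for n3 in G.get(n2, []):
#                 if n3 in G[n1]:
#                     triangles.append((n1, n2, n3))
#     # triangles == [('a', 'b', 'c')]
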
def build_item_search_tree(tsvfile):
    """ Returns a tree data structure that can be used with the lookup
    function. """
    s1_pos = s2_pos = s3_pos = est_pos = None
    ds = {}
    for index, line in enumerate(open(tsvfile)):
        chunks = line.rstrip('\n').split('\t')
        if index == 0:
            s1_pos = chunks.index('n1')
            s2_pos = chunks.index('n2')
            s3_pos = chunks.index('n3')
            est_pos = chunks.index('est')
        else:
            s1, s2, s3, est = chunks[s1_pos], chunks[s2_pos], chunks[s3_pos], float(chunks[est_pos])
            # We expect items to be sorted, but just in case
            s1, s2, s3 = triple_sort((s1, s2, s3))
            # each pair of the triple points to the remaining item and estimate
            ds.setdefault((s1, s2), []).append((s3, est))
            ds.setdefault((s1, s3), []).append((s2, est))
            ds.setdefault((s2, s3), []).append((s1, est))
    return ds
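
# Usage sketch for the search tree: the two known items of a triple index all
# candidate third items together with their estimates. The file name and items
# below are hypothetical:
#
#     tree = build_item_search_tree('triple_estimates.tsv')
#     for third_item, est in tree.get(('beer', 'chips'), []):
#         print third_item, est
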
def triple_intervals(self, frequent_items_file, intervals):
    """ Runs through a frequent-items file produced by fp-growth, filters out
    the triples, and sorts the triples by frequency. Finally it divides the
    frequent itemsets into intervals of triples within a specific frequency
    range. """
    triples = []
    frequencies = set()
    for line in open(frequent_items_file, 'rb'):
        chunks = line.split()
        if len(chunks) < 4:
            continue
        itemset = tuple(chunks[:-1])
        support = int(chunks[-1].replace('(', '').replace(')', ''))
        frequencies.add(support)
        triples.append((itemset, support))
    triples.sort(key=lambda x: x[1])

    frequencies_sorted = sorted(frequencies)
    # print 'frequencies_sorted: {}'.format(len(frequencies_sorted))
    # print 'triples: ', len(triples)

    result = []
    chunk_intervals = []
    interval = len(frequencies_sorted) / intervals
    triple_index = 0
    for c in xrange(0, len(frequencies_sorted), interval):
        triple_set = {}  # subset of triples with a given frequency
        chunk = frequencies_sorted[c:c + interval]  # chunk of frequencies
        chunk_intervals.append((chunk[0], chunk[-1]))
        for freq in chunk:
            triple, support = triples[triple_index]
            while support == freq:
                triple_set[triple_sort(triple)] = support
                triple_index += 1
                if triple_index < len(triples):
                    triple, support = triples[triple_index]
                else:
                    break
        result.append(triple_set)

    set_lengths = [len(l) for l in result]
    print "Set sizes: {}".format(set_lengths)
    print "Intervals: {}".format(chunk_intervals)
    return result
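
# Worked example of the interval split above (illustrative numbers only): with
# frequencies_sorted = [2, 3, 5, 7, 9, 11] and intervals = 3, interval == 2
# (integer division), so the frequency chunks are [2, 3], [5, 7], [9, 11],
# chunk_intervals becomes [(2, 3), (5, 7), (9, 11)], and every triple lands in
# the set whose chunk contains its support.
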
def est_all_data_disc_version(algorithm, tab_file, min_support=-30, iterations=1,
                              only_interesting_triples=False, restricted_triples=None,
                              extra_id=''):
    from subprocess import call
    from parsers import Borgelt

    cv_start = time()

    # Create work folder
    _id = str(time()).replace('.', '') + '_' + extra_id
    path = '../tmp/cv_' + _id + '/'
    os.mkdir(path)

    print "\n### Running cross validation on ALL DATA cv_{} ###".format(_id)

    total_transactions = 0
    for line in open(tab_file, 'rb'):
        total_transactions += 1
    print 'Total transactions: ', total_transactions

    sample_size = total_transactions

    avg_errors = []
    var_errors = []
    avg_errors_ext = []
    var_errors_ext = []
    avg_errors_heu = []
    var_errors_heu = []

    for index in range(iterations):

        borgelt_start = time()
        sample_freq_name = path + str(index) + '_sample_frequent_items.out'
        args = [algorithm, tab_file, sample_freq_name, '-s' + str(min_support), '-n3']
        call(args)
        print 'fpgrowth on sample data (ALL DATA) done: {} secs'.format(time() - borgelt_start)

        # Check whether any frequent items were found
        if not os.path.exists(sample_freq_name):
            print 'No frequent items found'
            print 'args', args
            continue

        freq = Borgelt.read_frequent_items(sample_freq_name)

        # Create a dict of all observed triples, saved with sorted keys for
        # lookup and their frequency as value.
        observed = {}
        for item in freq:
            if len(item[0]) == 3:
                sorted_trip = triple_sort(item[0])
                # * 2 is a (horrible) hack to make Forward calculate the
                # observed frequency correctly, since the sample here is all
                # the data.
                observed[sorted_trip] = item[1][0] * 2
        print 'Total triplets observed:', len(observed)

        min_support_trips = min_supported_trips(min_support, total_transactions)
        print 'Forward min_support_trips set to: ', min_support_trips

        triangles_start = time()
        triangle_tree, sample_triples = Forward.forward_compact(
            sample_freq_name, min_support_trips, observed,
            only_interesting_triples, restricted_triples)
        print 'Found triangles done: {}'.format(time() - triangles_start)

        estimates = []
        extrapolations = []
        observations = []
        triplets = []
        MAPE_errors = []
        MAPE_errors_ext = []
        triangle_counts = []
        pair_triple_ratios = []

        # Recursion depth for the estimate to converge
        req_depth = int(math.log(total_transactions, 2)) + 1

        # DFS of the tree holding all triangles
        for n1 in triangle_tree.keys():
            s1, s2_dict = triangle_tree[n1]
            for n2 in s2_dict.keys():
                s2, s12, s3_dict = s2_dict[n2]
                for n3 in s3_dict.keys():
                    s3, s13, s23, s123 = s3_dict[n3]
                    triangle = (n1, n2, n3)
                    triplets.append(triangle)
                    triangle_counts.append((s1, s2, s3, s12, s13, s23, s123))
                    pair_triple_ratio = s123 / float(min(s12, s13, s23))
                    pair_triple_ratios.append(pair_triple_ratio)

                    # Observed is the triple support, since the sample is all data
                    obs = s123

                    # maxent estimate
                    est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13,
                                              float(total_transactions), num=req_depth)

                    # extrapolation estimate; a no-op here, since sample == all data
                    est2 = s123 / float(sample_size) * total_transactions

                    # heuristic (max_ent for 0-triple samples) does not make
                    # sense for all data
                    # est3 = est if s123 == 0 else est2

                    estimates.append(est)
                    extrapolations.append(est2)
                    observations.append(obs)

                    # MAPE error, max ent
                    error = abs(obs - est) / math.sqrt(obs)
                    MAPE_errors.append(error)

                    # MAPE error, extrapolation
                    error2 = abs(obs - est2) / math.sqrt(obs)
                    MAPE_errors_ext.append(error2)

                    # MAPE error, heuristic
                    # error3 = abs(obs - est3) / float(obs) * 100
                    # MAPE_errors_heu.append(error3)

        del triangle_tree
        del sample_triples

        if len(MAPE_errors) > 0:  # TODO handle this, probably when nothing has been found
            min_error = min(MAPE_errors)
            max_error = max(MAPE_errors)

            # max ent error
            avg_error = sum(MAPE_errors) / float(len(MAPE_errors))
            avg_errors.append(avg_error)

            # extrapolation error
            avg_error_ext = sum(MAPE_errors_ext) / float(len(MAPE_errors_ext))
            avg_errors_ext.append(avg_error_ext)

            # heuristic error, does not make sense for all data
            # avg_error_heu = sum(MAPE_errors_heu) / float(len(MAPE_errors_heu))
            # avg_errors_heu.append(avg_error_heu)

            # variance
            var_error = var(MAPE_errors)
            var_error_ext = var(MAPE_errors_ext)
            # var_error_heu = tvar(MAPE_errors_heu)

            # max_ent confidence interval
            std_dev = math.sqrt(var_error)
            std_error = std_dev / math.sqrt(sample_size)
            span_99 = norm.interval(0.99, avg_error, std_error)
            span_95 = norm.interval(0.95, avg_error, std_error)

            # ext confidence interval
            std_dev_ext = math.sqrt(var_error_ext)
            std_error_ext = std_dev_ext / math.sqrt(sample_size)
            span_99_ext = norm.interval(0.99, avg_error_ext, std_error_ext)
            span_95_ext = norm.interval(0.95, avg_error_ext, std_error_ext)

            # heuristic confidence interval, does not make sense for all data
            # std_dev_heu = math.sqrt(var_error_heu)
            # std_error_heu = std_dev_heu / math.sqrt(sample_size)
            # span_99_heu = norm.interval(0.99, avg_error_heu, std_error_heu)
            # span_95_heu = norm.interval(0.95, avg_error_heu, std_error_heu)

            var_errors.append(var_error)
            var_errors_ext.append(var_error_ext)
            # var_errors_heu.append(var_error_heu)

            res_string = "\nResult ALL DATA ({}):\nSample size:{} triangles:{} test_data:{}\n".format(
                index, sample_size, len(estimates), sample_size)
            # log max ent result
            res_string += "avg_error:{} var_error:{}\n".format(avg_error, var_error)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95))
            res_string += 'avg_error_ext:{} var_error_ext:{}\n'.format(avg_error_ext, var_error_ext)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ext))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ext))
            # res_string += 'avg_error_heu:{} var_error_heu:{}\n'.format(avg_error_heu, var_error_heu)
            # res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_heu))
            # res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_heu))

            with open(path + 'log.txt', 'a') as log_file:
                log_file.write(res_string)
            print res_string

            # Write result data
            with open(path + str(index) + '_data.json', 'w') as fd:
                fd.write(json.dumps(zip(triplets, zip(estimates, observations))))

            with open(path + str(index) + '_data.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(estimates):
                    row = ([estimates[_index], observations[_index]] +
                           list(triplets[_index]) + [pair_triple_ratios[_index]] +
                           list(triangle_counts[_index]))
                    fd.write('\t'.join(str(v) for v in row) + '\n')

            with open(path + str(index) + '_data_extrapolation.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(estimates):
                    row = ([extrapolations[_index], observations[_index]] +
                           list(triplets[_index]) + [pair_triple_ratios[_index]] +
                           list(triangle_counts[_index]))
                    fd.write('\t'.join(str(v) for v in row) + '\n')

            del estimates
            del observations

            # remove tmp files
            # os.remove(sample_freq_name)
            # os.remove(sample_file_name)
        else:
            print 'No abs errors!'

    print "Cross validation done!"
    print "time: ", (time() - cv_start)

    if len(avg_errors) > 0:
        total_avg_error = sum(avg_errors) / float(len(avg_errors))
        total_res_string = "Avg error:{}".format(total_avg_error)
        print total_res_string
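
# `min_supported_trips` is defined elsewhere in this repo. The calls above
# pass Borgelt-style support values, where (by fpgrowth's -s convention)
# negative values are absolute counts and positive values are percentages.
# A minimal sketch under that assumption, for reference only:
#
#     def min_supported_trips(min_support, transactions):
#         if min_support < 0:
#             return abs(min_support)  # already an absolute count
#         return int(transactions * min_support / 100.0)  # percentage of data
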
def cross_validate_disc_version(algorithm, tab_file, min_support=-30, sample_pct=0.1,
                                iterations=1, only_interesting_triples=False,
                                restricted_triples=None, extra_id=''):
    from subprocess import call
    from parsers import Borgelt

    cv_start = time()

    # Create work folder
    _id = str(time()).replace('.', '') + '_' + extra_id
    path = '../tmp/cv_' + _id + '/'
    os.mkdir(path)

    print "\n### Running cross validation cv_{} ###".format(_id)

    total_transactions = 0
    for line in open(tab_file, 'rb'):
        total_transactions += 1
    print 'Total transactions: ', total_transactions

    # Get the total observed triples
    borgelt_start = time()
    observed_file_name = path + 'observed_frequent_items.out'
    args = [algorithm, tab_file, observed_file_name, '-s' + str(min_support), '-n3']
    # pro = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True, preexec_fn=os.setsid)
    # os.killpg(pro.pid, signal.SIGTERM)
    call(args)
    # sleep(20)
    print 'fpgrowth on all data done: {} secs'.format(time() - borgelt_start)

    freq = Borgelt.read_frequent_items(observed_file_name)

    # Create a dict of all observed triples, saved with sorted keys for
    # lookup and their frequency as value.
    observed = {}
    for item in freq:
        if len(item[0]) == 3:
            sorted_trip = triple_sort(item[0])
            observed[sorted_trip] = item[1][0]
    print 'Total triplets observed:', len(observed)

    average_observed = sum(observed.values()) / float(len(observed))
    print 'Baseline: ', average_observed

    del freq

    avg_errors = []
    var_errors = []
    avg_errors_ext = []
    var_errors_ext = []
    avg_errors_heu = []
    var_errors_heu = []
    avg_errors_ind = []
    var_errors_ind = []
    avg_errors_baseline = []

    occurrences = [0 for i in range(100)]
    max_ent_acc_error = [0 for i in range(100)]
    ext_acc_error = [0 for i in range(100)]
    ind_acc_error = [0 for i in range(100)]
    heu_acc_error = [0 for i in range(100)]
    baseline_acc_error = [0 for i in range(100)]

    # Record triple counts for the best estimates
    max_ent_best = Counter()
    ext_best = Counter()
    ind_best = Counter()

    for index in range(iterations):

        # Create sample file
        sampling_start = time()
        if sample_pct > 0:
            sample_size = int(total_transactions * sample_pct)
        else:
            sample_size = abs(sample_pct)
        test_data_size = total_transactions - sample_size
        sample = random.sample(range(total_transactions), sample_size)
        assert len(sample) == sample_size, 'Sample size not equal to sample'
        sample.sort()

        sample_file_name = path + str(index) + '_sample.tab'
        with open(sample_file_name, 'a') as sample_file:
            sample_line = 0
            for line_num, line in enumerate(open(tab_file, 'rb')):
                if line_num == sample[sample_line]:
                    sample_file.write(line)
                    sample_line += 1
                    if sample_line == sample_size:
                        break
        del sample
        print 'Sample size: {} time: {}'.format(sample_size, time() - sampling_start)

        borgelt_start = time()
        sample_freq_name = path + str(index) + '_sample_frequent_items.out'
        args = [algorithm, sample_file_name, sample_freq_name, '-s-1', '-n3']
        call(args)
        print 'fpgrowth on sample data done: {} secs'.format(time() - borgelt_start)

        # Check whether any frequent items were found
        if not os.path.exists(sample_freq_name):
            print 'No frequent items found'
            print 'args', args
            continue

        min_support_trips = min_supported_trips(min_support, test_data_size)
        print 'Forward min_support_trips set to: ', min_support_trips

        triangles_start = time()
        triangle_tree, sample_triples = Forward.forward_compact(
            sample_freq_name, min_support_trips, observed,
            only_interesting_triples, restricted_triples)
        print 'Found triangles done: {}'.format(time() - triangles_start)

        estimates = []
        extrapolations = []
        independences = []
        heurestics = []
        baselines = []
        observations = []
        triplets = []
        MAPE_errors = []
        MAPE_errors_ext = []
        MAPE_errors_ind = []
        MAPE_errors_heu = []
        MAPE_errors_baseline = []
        true_errors = []
        pair_triple_ratios = []
        triangle_counts = []
        # TODO: why save s1..s23 in separate lists? They already exist in the
        # triangle tree (and take up a huge amount of space).
        # s1_list = []; s2_list = []; s3_list = []
        # s12_list = []; s13_list = []; s23_list = []

        # Recursion depth for the estimate to converge
        req_depth = int(math.log(total_transactions, 2)) + 1

        # DFS of the tree holding all triangles
        for n1 in triangle_tree.keys():
            s1, s2_dict = triangle_tree[n1]
            for n2 in s2_dict.keys():
                s2, s12, s3_dict = s2_dict[n2]
                for n3 in s3_dict.keys():
                    s3, s13, s23, s123 = s3_dict[n3]
                    triangle_counts.append((s1, s2, s3, s12, s13, s23, s123))
                    triangle = (n1, n2, n3)
                    pair_triple_ratio = s123 / float(min(s12, s13, s23))
                    pair_triple_ratios.append(pair_triple_ratio)

                    # Get the obs (test data) frequency minus the triples
                    # found in the sample (training data).
                    obs = 0
                    if triangle in observed:
                        # (triples in data) - (triples in sample), i.e. the
                        # number of triples in the test data.
                        obs = observed[triangle] - s123

                    # maxent estimate
                    est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13,
                                              float(sample_size),
                                              num=req_depth) * (test_data_size / float(sample_size))
                    if est < 0:
                        print 'max ent below 0'
                        print 's1 s2 s3 s12 s13 s23 s123', (s1, s2, s3, s12, s23, s13, s123)

                    # extrapolation estimate
                    est2 = s123 / float(sample_size) * test_data_size

                    # independence estimate
                    est3 = (s1 / float(sample_size)) * (s2 / float(sample_size)) * \
                           (s3 / float(sample_size)) * test_data_size
                    # est3 = (s1*s2*s3)/float(sample_size*sample_size) * test_data_size/float(sample_size)

                    # heuristic: use max_ent for rare triples in the sample
                    est4 = est if s123 < 5 else est2

                    # baseline estimate
                    est5 = average_observed

                    estimates.append(est)
                    extrapolations.append(est2)
                    independences.append(est3)
                    heurestics.append(est4)
                    baselines.append(est5)
                    observations.append(obs)
                    triplets.append(triangle)

                    # MAPE error, max ent
                    error = abs(obs - est) / math.sqrt(obs)  # * 100
                    MAPE_errors.append(error)
                    true_errors.append(obs - est)

                    # MAPE error, extrapolation
                    error2 = 0
                    if est2 > 0:
                        error2 = abs(obs - est2) / math.sqrt(obs)  # * 100
                    MAPE_errors_ext.append(error2)

                    # MAPE error, independence
                    error3 = abs(obs - est3) / math.sqrt(obs)  # * 100
                    MAPE_errors_ind.append(error3)

                    # MAPE error, heuristic
                    error4 = abs(obs - est4) / math.sqrt(obs)  # * 100
                    MAPE_errors_heu.append(error4)

                    # MAPE error, baseline
                    error5 = abs(obs - est5) / math.sqrt(obs)  # * 100
                    MAPE_errors_baseline.append(error5)

                    # Record which estimate performed best
                    if error < error2 and error < error3:
                        max_ent_best[s123] += 1
                    elif error2 < error and error2 < error3:
                        ext_best[s123] += 1
                    else:
                        ind_best[s123] += 1

                    try:
                        occurrences[s123] += 1
                        max_ent_acc_error[s123] += error
                        ext_acc_error[s123] += error2
                        ind_acc_error[s123] += error3
                        heu_acc_error[s123] += error4
                        baseline_acc_error[s123] += error5
                    except IndexError, ie:
                        pass

        # print 'true errors: ', true_errors
        # print 'estimates: ', estimates
        # print 'observed: ', observed
        # print 'mape ', MAPE_errors

        del triangle_tree
        del sample_triples

        if len(MAPE_errors) > 0:  # TODO handle this, probably when nothing has been found
            min_error = min(MAPE_errors)
            max_error = max(MAPE_errors)

            # max ent error
            avg_error = sum(MAPE_errors) / float(len(MAPE_errors))
            avg_errors.append(avg_error)
            # extrapolation error
            avg_error_ext = sum(MAPE_errors_ext) / float(len(MAPE_errors_ext))
            avg_errors_ext.append(avg_error_ext)
            # independence error
            avg_error_ind = sum(MAPE_errors_ind) / float(len(MAPE_errors_ind))
            avg_errors_ind.append(avg_error_ind)
            # heuristic error
            avg_error_heu = sum(MAPE_errors_heu) / float(len(MAPE_errors_heu))
            avg_errors_heu.append(avg_error_heu)
            # baseline error
            avg_error_baseline = sum(MAPE_errors_baseline) / float(len(MAPE_errors_baseline))
            avg_errors_baseline.append(avg_error_baseline)

            var_error = 0
            var_error_ext = 0
            var_error_heu = 0
            var_error_ind = 0
            # variance (tvar is the sample variance)
            if len(MAPE_errors) > 1:
                var_error = tvar(MAPE_errors)
                var_error_ext = tvar(MAPE_errors_ext)
                var_error_heu = tvar(MAPE_errors_heu)
                var_error_ind = tvar(MAPE_errors_ind)

            # max_ent confidence interval
            std_dev = math.sqrt(var_error)
            std_error = std_dev / math.sqrt(sample_size)
            span_99 = norm.interval(0.99, avg_error, std_error)
            span_95 = norm.interval(0.95, avg_error, std_error)

            # ext confidence interval
            std_dev_ext = math.sqrt(var_error_ext)
            std_error_ext = std_dev_ext / math.sqrt(sample_size)
            span_99_ext = norm.interval(0.99, avg_error_ext, std_error_ext)
            span_95_ext = norm.interval(0.95, avg_error_ext, std_error_ext)

            # independence confidence interval
            std_dev_ind = math.sqrt(var_error_ind)
            std_error_ind = std_dev_ind / math.sqrt(sample_size)
            span_99_ind = norm.interval(0.99, avg_error_ind, std_error_ind)
            span_95_ind = norm.interval(0.95, avg_error_ind, std_error_ind)

            # heuristic confidence interval
            std_dev_heu = math.sqrt(var_error_heu)
            std_error_heu = std_dev_heu / math.sqrt(sample_size)
            span_99_heu = norm.interval(0.99, avg_error_heu, std_error_heu)
            span_95_heu = norm.interval(0.95, avg_error_heu, std_error_heu)

            var_errors.append(var_error)
            var_errors_ext.append(var_error_ext)
            var_errors_heu.append(var_error_heu)
            var_errors_ind.append(var_error_ind)

            res_string = "\nResult ({}):\nSample size:{} triangles:{} test_data:{}\n".format(
                index, sample_size, len(estimates), total_transactions - sample_size)
            # log max ent result
            res_string += "avg_error:{} var_error:{}\n".format(avg_error, var_error)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95))
            res_string += 'avg_error_ext:{} var_error_ext:{}\n'.format(avg_error_ext, var_error_ext)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ext))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ext))
            res_string += 'avg_error_ind:{} var_error_ind:{}\n'.format(avg_error_ind, var_error_ind)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ind))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ind))
            res_string += 'avg_error_heu:{} var_error_heu:{}\n'.format(avg_error_heu, var_error_heu)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_heu))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_heu))
            res_string += 'avg_error_baseline:{}\n'.format(avg_error_baseline)

            with open(path + str(index) + '_log.txt', 'a') as log_file:
                log_file.write(res_string)
            print res_string

            # Write result data
            with open(path + str(index) + '_data.json', 'w') as fd:
                fd.write(json.dumps(zip(triplets, zip(estimates, observations))))

            def write_estimate_tsv(file_name, ests):
                # One row per triangle: estimate, observation, the triple, its
                # pair/triple ratio and all subset supports.
                with open(file_name, 'w') as fd:
                    fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\t'
                             's1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                    for _index in xrange(len(ests)):
                        row = ([ests[_index], observations[_index]] +
                               list(triplets[_index]) + [pair_triple_ratios[_index]] +
                               list(triangle_counts[_index]))
                        fd.write('\t'.join(str(v) for v in row) + '\n')

            write_estimate_tsv(path + str(index) + '_data.tsv', estimates)
            write_estimate_tsv(path + str(index) + '_data_extrapolation.tsv', extrapolations)
            write_estimate_tsv(path + str(index) + '_data_heurestic.tsv', heurestics)
            write_estimate_tsv(path + str(index) + '_data_independece.tsv', independences)

            # Save the errors
            with open(path + str(index) + '_MAPE_errors.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors, fd)
            with open(path + str(index) + '_MAPE_errors_ext.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_ext, fd)
            with open(path + str(index) + '_MAPE_errors_heu.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_heu, fd)
            with open(path + str(index) + '_MAPE_errors_ind.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_ind, fd)
            with open(path + str(index) + '_MAPE_errors_baseline.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_baseline, fd)

            # Saves the supports of all subsets of triples.
            # TODO: this code does not run!
            # with open(path + str(index) + '_data_correlations.tsv', 'w') as fd:
            #     fd.write('s1\ts2\ts3\ts12\ts13\ts23\n')
            #     for _index, i in enumerate(s123):
            #         fd.write(str(s1[_index]) + '\t' + str(s2[_index]) + '\t' +
            #                  str(s3[_index]) + '\t' + str(s12[_index]) + '\t' +
            #                  str(s13[_index]) + '\t' + str(s23[_index]) + '\n')

            # Saves the independence estimate for all triples.
            # TODO: Why s123[_index] in the denominator?
            # TODO: What is a 'double independence estimate'?
            # TODO: Why not calculate and save estimates the same way as ext and max_ent?
            # with open(path + str(index) + '_independence_estimate.tsv', 'w') as fd:
            #     fd.write('single independence estimate\tdouble independence estimate\n')
            #     for _index, i in enumerate(s123):
            #         tempVal1 = sample_size / (s1[_index])
            #         tempVal2 = sample_size / (s2[_index])
            #         tempVal3 = sample_size / (s3[_index])
            #         tempVal12 = sample_size / (s12[_index])
            #         tempVal13 = sample_size / (s13[_index])
            #         tempVal23 = sample_size / (s23[_index])
            #         fd.write(str(s123[_index]/tempVal1*tempVal2*tempVal3*(total_transactions-sample_size) + '\t' +
            #                  s123[_index]/tempVal12*tempVal13*tempVal23*(total_transactions-sample_size) + '\n'))

            del estimates
            del observations

            # remove tmp files
            # os.remove(sample_freq_name)
            # os.remove(sample_file_name)
        else:
            print 'No abs errors!'
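
# Invocation sketch (the path is hypothetical; this module is assumed to
# import time, os, math, json, random, pickle, collections.Counter,
# scipy.stats' norm and tvar, plus the local ent, Forward and parsers modules
# at the top level):
#
#     cross_validate_disc_version('fpgrowth', '../data/retail.tab',
#                                 min_support=-30, sample_pct=0.1, iterations=5)
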
def forward(self, frequent_items):
    """ Run the forward algorithm for finding triangles. """
    keys = []
    singletons = {}
    pairs = {}
    triples = {}
    V = {}

    # build data structures
    for itemset, (support,) in frequent_items:
        l = len(itemset)
        if l == 1:
            singletons[itemset[0]] = support
        elif l == 2:
            a, b = itemset[0], itemset[1]
            sorted_pair = itemset if a < b else (b, a)
            pairs[sorted_pair] = support
            self.__add_to_nodes(V, a, b, keys)
        elif l == 3:
            triples[triple_sort(itemset)] = support
        else:
            assert False, "frequent itemsets larger than 3 are not used"

    # print 'Forward space usage in mb:'
    # print 'keys: ', mem_size.bytes_to_mb(sys.getsizeof(keys))
    # print 'singletons: ', mem_size.bytes_to_mb(sys.getsizeof(singletons))
    # print 'pairs: ', mem_size.bytes_to_mb(sys.getsizeof(pairs))
    # print 'triples: ', mem_size.bytes_to_mb(sys.getsizeof(triples))
    # This is wrong, it needs to traverse the graph:
    # print 'V: ', mem_size.bytes_to_mb(sys.getsizeof(V))

    # Keys have to be sorted for running Forward
    keys.sort()

    res = []
    A = {}
    for key in keys:
        A[key] = set()
    for s in keys:
        adj = V[s]
        for t in adj:
            if s < t:
                for v in A[s]:
                    if v in A[t]:
                        n1, n2, n3 = triple_sort((v, s, t))
                        s1 = (n1, singletons[n1])
                        s2 = (n2, singletons[n2])
                        s3 = (n3, singletons[n3])
                        s12 = ((n1, n2), pairs[(n1, n2)])
                        s23 = ((n2, n3), pairs[(n2, n3)])
                        s13 = ((n1, n3), pairs[(n1, n3)])
                        c = triples.get((n1, n2, n3), 0)
                        s123 = ((n1, n2, n3), c)
                        res.append((s1, s2, s3, s12, s23, s13, s123))
                A[t].add(s)

    # print 'A', mem_size.bytes_to_mb(sys.getsizeof(A))
    # print 'res', mem_size.bytes_to_mb(sys.getsizeof(res))
    # print 'triangles found: ', len(res)
    return res, triples
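
# The loop above is the "forward" triangle-listing scheme (cf. Schank &
# Wagner): nodes are visited in sorted order and A[t] accumulates the
# already-visited neighbours of t, so each triangle (v, s, t) with v < s < t
# is found exactly once, when its last edge is reached. Self-contained toy
# version of the same invariant:
#
#     V = {1: [2, 3], 2: [1, 3], 3: [1, 2]}   # one triangle: 1-2-3
#     A = dict((k, set()) for k in V)
#     found = []
#     for s in sorted(V):
#         for t in V[s]:
#             if s < t:
#                 found.extend((v, s, t) for v in A[s] if v in A[t])
#                 A[t].add(s)
#     # found == [(1, 2, 3)]
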
def forward_compact(self, frequent_items_file, min_support, observed,
                    only_interesting_triples, restricted_triples):
    """ Run the forward algorithm for finding triangles. Found triangles are
    stored as a (compact) tree. """
    keys = []
    singletons = {}
    pairs = {}
    triples = {}
    V = {}

    # build data structures
    for index, line in enumerate(open(frequent_items_file, 'rb')):
        # if index % 1000000 == 0:
        #     print 'Building ds. lines read: ', index
        #     print 'singletons size: ', mem_size.bytes_to_mb(sys.getsizeof(singletons))
        #     print 'pairs size: ', mem_size.bytes_to_mb(sys.getsizeof(pairs))
        #     print 'keys size: ', mem_size.bytes_to_mb(sys.getsizeof(keys))
        #     Forward.graph_size(V)

        chunks = line.split()  # ex: a b c (42)
        itemset = tuple(chunks[:-1])
        support = int(chunks[-1].replace('(', '').replace(')', ''))

        # Build the graph to be searched for triangles, and save the itemsets
        # in dicts so their support can easily be looked up when triangles
        # are found.
        l = len(itemset)
        if l == 1:
            singletons[itemset[0]] = support
        elif l == 2:
            a, b = itemset[0], itemset[1]
            sorted_pair = itemset if a < b else (b, a)
            pairs[sorted_pair] = support
            # Store this support in the graph?
            self.__add_to_nodes(V, a, b, keys)
        elif l == 3:
            a, b, c = triple_sort(itemset)
            if a not in triples:
                triples[a] = {b: {c: support}}
            else:
                a_dict = triples[a]
                if b not in a_dict:
                    a_dict[b] = {c: support}
                else:
                    a_dict[b][c] = support
        else:
            assert False, "frequent itemsets larger than 3 are not used"

    # Keys have to be sorted for running Forward
    keys.sort()
    # print 'keys sorted. keys: ', len(keys)

    res = {}
    A = {}
    for key in keys:
        A[key] = set()
    for index, s in enumerate(keys):
        adj = V[s]
        # if index % 10000 == 0:
        #     print 'key index: ', index
        #     print 'pct done: ', (index / float(len(keys)))
        #     print 'cache size: ', Forward.graph_size(A)
        for t in adj:
            if s < t:
                for v in A[s]:
                    if v in A[t]:
                        n1, n2, n3 = triple_sort((v, s, t))

                        # get triple support
                        triple_support = 0
                        try:
                            triple_support = triples[n1][n2][n3]
                        except KeyError as ke:
                            pass

                        # Check whether this triple has sufficient support in
                        # the observed data.
                        observed_triples = 0
                        if (n1, n2, n3) in observed:
                            observed_triples = observed[(n1, n2, n3)] - triple_support
                        if observed_triples < min_support:
                            continue
                        if only_interesting_triples and triple_support != 0:
                            continue
                        if restricted_triples is not None and (n1, n2, n3) not in restricted_triples:
                            continue

                        # At this point this is a triangle/triple to be
                        # estimated. Triangles are held in a tree with no
                        # root node and sorted nodes, to reuse singletons and
                        # pairs that occur more than once.
                        if n1 not in res:
                            res[n1] = (singletons[n1], {})
                        n1_dict = res[n1][1]
                        if n2 not in n1_dict:
                            n1_dict[n2] = (singletons[n2], pairs[(n1, n2)], {})
                        n2_dict = n1_dict[n2][2]
                        if n3 not in n2_dict:
                            n2_dict[n3] = (singletons[n3], pairs[(n1, n3)],
                                           pairs[(n2, n3)], triple_support)
                        else:
                            assert False, 'Triplets can only be found once!'
                A[t].add(s)

    print 'forward done'
    t_count = 0
    for k in res.keys():
        d2 = res[k][1]
        for k2 in d2.keys():
            t_count += len(d2[k2][2].keys())
    print 'triangles found :', t_count
    return res, triples
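
# The compact result maps n1 -> (s1, {n2 -> (s2, s12, {n3 -> (s3, s13, s23,
# s123)})}). A minimal traversal sketch (hypothetical helper, mirroring how
# the cross-validation code unpacks the tree):
#
#     def iter_triangles(triangle_tree):
#         for n1, (s1, s2_dict) in triangle_tree.iteritems():
#             for n2, (s2, s12, s3_dict) in s2_dict.iteritems():
#                 for n3, (s3, s13, s23, s123) in s3_dict.iteritems():
#                     yield (n1, n2, n3), (s1, s2, s3, s12, s13, s23, s123)
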