Example #1
  def filter_items(self, M):
    singletons = {}
    pairs = {}
    triples = {}
    G = {} # TODO: look at this graph. does it have long adjacency lists? 

    # build data structures
    for itemset, (support,) in M:
      l = len(itemset)
      if l == 1:
        singletons[itemset[0]] = support
      elif l == 2:
        sorted_pair = itemset if itemset[0] < itemset[1] else (itemset[1], itemset[0])
        pairs[sorted_pair] = support
        self.__add_to_graph(sorted_pair, G)
      elif l == 3:
        triples[utils.triple_sort(itemset)] = support
      else:
        assert False, "frequent itemsets larger than 3 are not used"
    
    # graph_stats(G)

    # print 'singletons found: {}'.format(len(singletons))
    # print 'pairs found: {}'.format(len(pairs))
    # print 'triples found: {}'.format(len(triples))
    res = []

    # find triangles in graph
    for n1 in G.keys():
      # assert singletons.has_key(n1), ("Pair cannot have infrequent singleton item", n1, singletons)

      for n2 in G[n1]:
        # assert singletons.has_key(n2), ("Pair cannot have infrequent singleton item", n2, singletons)
        
        # n2 does not necessarily have an adjacency list
        if not G.has_key(n2): 
          continue

        for n3 in G[n2]:
          # assert singletons.has_key(n3), ("Pair cannot have infrequent singleton item", n3, singletons)
          
          if n3 in G[n1]: # triangle
            
            s1 = (n1, singletons[n1])
            s2 = (n2, singletons[n2])
            s3 = (n3, singletons[n3])
            s12 = ((n1,n2), pairs[(n1,n2)])
            s23 = ((n2,n3), pairs[(n2,n3)])
            s13 = ((n1,n3), pairs[(n1,n3)])

            c = 0
            if triples.has_key((n1,n2,n3)):
              c = triples[(n1,n2,n3)]

            s123 = ((n1,n2,n3), c)

            res.append((s1,s2,s3,s12,s23,s13,s123))

    # print 'Triangles found: {}'.format(len(res))
    return res, triples
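
For reference, a self-contained sketch of the data shapes filter_items consumes and the triangle scan it performs. The toy itemsets are hypothetical, and plain sorted() merely stands in for utils.triple_sort and __add_to_graph.

# A standalone toy version of the same triangle scan (hypothetical data;
# in practice M comes from a frequent-itemset miner such as fp-growth).
def toy_filter_items(M):
    singletons, pairs, triples, G = {}, {}, {}, {}
    for itemset, (support,) in M:
        if len(itemset) == 1:
            singletons[itemset[0]] = support
        elif len(itemset) == 2:
            p = tuple(sorted(itemset))
            pairs[p] = support
            G.setdefault(p[0], set()).add(p[1])   # edge from smaller to larger item
        elif len(itemset) == 3:
            triples[tuple(sorted(itemset))] = support
    res = []
    for n1 in G:
        for n2 in G[n1]:
            for n3 in G.get(n2, ()):
                if n3 in G[n1]:  # (n1, n2, n3) closes a triangle
                    res.append(((n1, singletons[n1]), (n2, singletons[n2]), (n3, singletons[n3]),
                                ((n1, n2), pairs[(n1, n2)]), ((n2, n3), pairs[(n2, n3)]),
                                ((n1, n3), pairs[(n1, n3)]),
                                ((n1, n2, n3), triples.get((n1, n2, n3), 0))))
    return res, triples

M = [(('a',), (10,)), (('b',), (8,)), (('c',), (6,)),
     (('a', 'b'), (5,)), (('b', 'c'), (4,)), (('a', 'c'), (3,)),
     (('a', 'b', 'c'), (2,))]
print(toy_filter_items(M)[0])
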
Example #2
def build_item_search_tree(tsvfile):
    """
    Returns a tree data structure that
    can be used with the look up function.
    """
    s1_pos = s2_pos = s3_pos = est_pos = None
    ds = {}
    for index, line in enumerate(open(tsvfile)):
        line = line.replace('\n', '')
        chunks = line.split('\t')
        if index == 0:
            s1_pos = chunks.index('n1')
            s2_pos = chunks.index('n2')
            s3_pos = chunks.index('n3')
            est_pos = chunks.index('est')
        else:
            s1, s2, s3, est = chunks[s1_pos], chunks[s2_pos], chunks[s3_pos], float(chunks[est_pos])
            # We expect items to be sorted, but just in case
            s1, s2, s3 = triple_sort((s1, s2, s3))
            ds.setdefault((s1, s2), []).append((s3, est))
            ds.setdefault((s1, s3), []).append((s2, est))
            ds.setdefault((s2, s3), []).append((s1, est))
    return ds
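
A hedged usage sketch of the structure returned above: keyed by an item pair, it lists candidate third items with their estimates. The file name and items are placeholders, and plain sorted() only approximates the ordering triple_sort imposes on the keys.

# Hypothetical usage: look up candidate third items for a pair.
tree = build_item_search_tree('estimates.tsv')   # placeholder path
a, b = sorted(('item_1', 'item_2'))              # keys follow triple_sort's ordering
for third, est in tree.get((a, b), []):
    print('%s completes (%s, %s) with estimate %s' % (third, a, b, est))
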
Example #3
    def triple_intervals(self, frequent_items_file, intervals):
        """
        The funktion runs through a frequent items file produced by fp-growth.
        and filter the triplets out and sorts the triplets by frequncy.
        Finaly it devides the frequent itemsets into intervals 
        of the triplets with a specific frequency range.
        """

        triples = []
        frequencies = set()
        for line in open(frequent_items_file, 'rb'):
            chunks = line.split()
            if len(chunks) < 4:
                continue
            itemset = tuple(chunks[:-1])
            support = int(chunks[-1].replace('(','').replace(')',''))
            frequencies.add(support)
            triples.append((itemset, support))
        triples.sort(key=lambda x: x[1])
        frequencies_sorted = list(frequencies)
        frequencies_sorted.sort()

        # print 'frequencies_sorted: {}'.format(len(frequencies_sorted))
        # print 'triples: ', len(triples)

        result = []
        chunk_intervals = [] 
        interval = len(frequencies_sorted) / intervals
        triple_index = 0
        for c in xrange(0, len(frequencies_sorted), interval):
            triple_set = {} # subset of triples, with given frequency
            chunk = frequencies_sorted[c:c+interval] # Chunk of frequencies
            chunk_intervals.append((chunk[0], chunk[-1]))
            for freq in chunk:
                triple, support = triples[triple_index]
                while support == freq:
                    triple_set[triple_sort(triple)] = support
                    triple_index += 1
                    if triple_index < len(triples):
                        triple, support = triples[triple_index]
                    else:
                        break
            result.append(triple_set)

        set_lengths = [len(l) for l in result]
        print "Set sizes: {}".format(set_lengths)
        print "Intervals: {}".format(chunk_intervals)
        return result
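
A toy illustration of the bucketing step above, on hypothetical supports: the distinct support values are sorted ascending and cut into equally sized chunks of the index range.

# Distinct supports split into a given number of chunks of consecutive values.
frequencies_sorted = [2, 3, 5, 7, 11, 13]
intervals = 3
step = len(frequencies_sorted) // intervals   # the method uses / under Python 2
chunks = [frequencies_sorted[c:c + step] for c in range(0, len(frequencies_sorted), step)]
print(chunks)   # [[2, 3], [5, 7], [11, 13]]
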
Example #4
def est_all_data_disc_version(algorithm, tab_file, min_support=-30, iterations=1, only_interesting_triples=False, restricted_triples=None, extra_id=''):
    from subprocess import call
    from parsers import Borgelt

    cv_start = time()

    # Create work folder
    _id = str(time()).replace('.','') + '_' + extra_id
    path = '../tmp/cv_' + _id + '/'
    os.mkdir(path)
    print "\n### Running cross validation on ALL DATA cv_{}###".format(_id)

    total_transactions = 0
    for line in open(tab_file, 'rb'):
        total_transactions += 1
    print 'Total transactions: ', total_transactions
    sample_size = total_transactions

    avg_errors = []
    var_errors = []
    avg_errors_ext = []
    var_errors_ext = []
    avg_errors_heu = []
    var_errors_heu = []
    for index in range(iterations):

        borgelt_start = time()
        sample_freq_name = path + str(index) + '_sample_frequent_items.out'
        args = [algorithm, tab_file, sample_freq_name, '-s' + str(min_support), '-n3']
        call(args)
        print 'fpgrowth on sample data (ALL DATA) done: {} secs'.format(time()-borgelt_start)


        # Check that any frequent items were found before parsing the output
        if not os.path.exists(sample_freq_name):
            print 'No frequent items found'
            print 'args', args
            continue

        freq = Borgelt.read_frequent_items(sample_freq_name)
        # Create ds of all observed triplets.
        # Saved as sorted keys for lookup,
        # and their frequency as value.
        observed = {}
        count = 0
        for item in freq:
            if len(item[0]) == 3:
                sorted_trip = triple_sort(item[0])
                # * 2, horrible hack to make Forward calculate the
                # observed frequency correctly.
                observed[sorted_trip] = item[1][0] * 2
        print 'Total triplets observed:', len(observed)

        min_support_trips = min_supported_trips(min_support, total_transactions)
        print 'Forward min_support_trips set to: ', min_support_trips
        triangles_start = time()
        triangle_tree, sample_triples = Forward.forward_compact(sample_freq_name, min_support_trips, observed, only_interesting_triples, restricted_triples)
        print 'Found triangles done: {}'.format(time() - triangles_start)

        #del sample_freq

        estimates = []
        extrapolations = []
        heurestics = []
        observations = []
        triplets = []
        MAPE_errors = []
        MAPE_errors_ext = []
        triangle_counts = []
        pair_triple_ratios = []

        # Recursion for estimate to converge
        req_depth = int(math.log(total_transactions, 2))+1

        # DFS of the tree holding all triangles
        for n1 in triangle_tree.keys():
            s1, s2_dict = triangle_tree[n1]
            for n2 in s2_dict.keys():
                s2, s12, s3_dict = s2_dict[n2]
                for n3 in s3_dict.keys():
                    s3, s13, s23, s123 = s3_dict[n3]

                    triangle = (n1, n2, n3)
                    triplets.append(triangle)

                    triangle_counts.append((s1, s2, s3, s12, s13, s23, s123))

                    pair_triple_ratio = s123 / float(min(s12, s13, s23))
                    pair_triple_ratios.append(pair_triple_ratio)

                    # Observed is the triple support, since sample is all data
                    obs = s123

                    # maxent estimate
                    est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13, float(total_transactions), num=req_depth)

                    # extrapolation estimate, does not make sense for all data
                    est2 = s123 / float(sample_size) * (total_transactions)

                    # heuristic: use max_ent for 0 triples in the sample; does not make sense for all data
                    # est3 = s123 == 0 and est or est2

                    estimates.append(est)
                    # extrapolations.append(est2)
                    # heurestics.append(est3)
                    observations.append(obs)

                    # MAPE error max ent
                    error = abs(obs-est) / math.sqrt(obs)
                    MAPE_errors.append(error)
                    # MAPE error extrapolation
                    error2 = abs(obs-est2) / math.sqrt(obs)
                    MAPE_errors_ext.append(error2)
                    # MAPE error heuristic
                    # error3 = abs(obs-est3) / float(obs) * 100
                    # MAPE_errors_heu.append(error3)

        
        del triangle_tree
        del sample_triples
                    
        if len(MAPE_errors) > 0: #TODO handle this, probably when nothing has been found

            min_error = min(MAPE_errors)
            max_error = max(MAPE_errors)

            # max ent error
            avg_error = sum(MAPE_errors) / float(len(MAPE_errors))
            avg_errors.append(avg_error)

            # extrapolation error
            # avg_error_ext = sum(MAPE_errors_ext) / float(len(MAPE_errors_ext))
            # avg_errors_ext.append(avg_error_ext)
            
            # heurestic error
            # avg_error_heu = sum(MAPE_errors_heu) / float(len(MAPE_errors_heu))
            # avg_errors_heu.append(avg_error_heu)
            
            # variance
            var_error = var(MAPE_errors)
            # var_error_ext = tvar(MAPE_errors_ext)
            # var_error_heu = tvar(MAPE_errors_heu)

            # max_ent confidence interval
            std_dev = math.sqrt(var_error)
            std_error = std_dev / math.sqrt(sample_size)
            span_99 = norm.interval(0.99, avg_error, std_error)
            span_95 = norm.interval(0.95, avg_error, std_error)

            # ext confidence interval
            # std_dev_ext = math.sqrt(var_error_ext)
            # std_error_ext = std_dev_ext / math.sqrt(sample_size)
            # span_99_ext = norm.interval(0.99, avg_error_ext, std_error_ext)
            # span_95_ext = norm.interval(0.95, avg_error_ext, std_error_ext)

            # heurestic confidence interval
            # std_dev_heu = math.sqrt(var_error_heu)
            # std_error_heu = std_dev_heu / math.sqrt(sample_size)
            # span_99_heu = norm.interval(0.99, avg_error_heu, std_error_heu)
            # span_95_heu = norm.interval(0.95, avg_error_heu, std_error_heu)

            var_errors.append(var_error)
            # var_errors_ext.append(var_error_ext)
            # var_errors_heu.append(var_error_heu)
            
            res_string = "\nResult ALL DATA({}):\nSample size:{} triangles:{} test_data:{}\n".format(index, sample_size, len(estimates), sample_size)
            # log max ent result
            res_string += "avg_error:{} var_error:{}\n".format(avg_error, var_error)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95))

            # Extrapolation stats are disabled for the ALL DATA run (see above)
            # res_string += 'avg_error_ext:{} var_error_ext:{}\n'.format(avg_error_ext, var_error_ext)
            # res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ext))
            # res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ext))

            # res_string += 'avg_error_heu:{} var_error_heu:{}\n'.format(avg_error_heu, var_error_heu)
            # res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_heu))
            # res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_heu))

            with open(path + 'log.txt', 'a') as log_file:
                log_file.write(res_string)
            print res_string

            # Write result data
            with open(path + str(index) + '_data.json', 'w') as fd:
                # triplet_key = ['triple' for t in estimates]
                # est_key = ['est' for t in estimates]
                # obs_key = ['obs' for t in observations]
                fd.write(json.dumps(zip(triplets, zip(estimates, observations))))
            with open(path + str(index) + '_data.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(estimates):
                    fd.write(str(estimates[_index]) + '\t' + str(observations[_index]) + '\t' + str(triplets[_index][0]) + '\t' + str(triplets[_index][1]) + '\t' + str(triplets[_index][2]) + '\t' + str(pair_triple_ratios[_index]) + '\t' + str(triangle_counts[_index][0]) + '\t' + str(triangle_counts[_index][1]) + '\t' + str(triangle_counts[_index][2]) + '\t' + str(triangle_counts[_index][3]) + '\t' + str(triangle_counts[_index][4]) + '\t' + str(triangle_counts[_index][5]) + '\t' + str(triangle_counts[_index][6]) + '\n')
            # Extrapolation estimates are not collected for the ALL DATA run,
            # so the extrapolation dump is disabled here.
            # with open(path + str(index) + '_data_extrapolation.tsv', 'w') as fd:
            #     fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
            #     for _index, i in enumerate(estimates):
            #         fd.write(str(extrapolations[_index]) + '\t' + str(observations[_index]) + '\t' + str(triplets[_index][0]) + '\t' + str(triplets[_index][1]) + '\t' + str(triplets[_index][2]) + '\t' + str(pair_triple_ratios[_index]) + '\t' + str(triangle_counts[_index][0]) + '\t' + str(triangle_counts[_index][1]) + '\t' + str(triangle_counts[_index][2]) + '\t' + str(triangle_counts[_index][3]) + '\t' + str(triangle_counts[_index][4]) + '\t' + str(triangle_counts[_index][5]) + '\t' + str(triangle_counts[_index][6]) + '\n')
            del estimates
            del observations

            # remove tmp files
            # os.remove(sample_freq_name)
            # os.remove(sample_file_name)

        else:
            print 'No abs errors!'

    print "Cross validation done!"
    print "time: ", (time() - cv_start)
    if len(avg_errors) > 0:
        total_avg_error = sum(avg_errors)/float(len(avg_errors))
        total_res_string = "Avg error:{}".format(total_avg_error)
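
The error aggregation used above, shown in isolation: a mean error, the sample variance, and a normal-approximation confidence interval around the mean. The numbers are hypothetical; tvar and norm are the same scipy.stats routines the surrounding code relies on.

import math
from scipy.stats import norm, tvar

errors = [1.2, 0.8, 1.5, 0.9]        # hypothetical per-triangle errors
sample_size = 1000                   # mirrors the code above, which scales by the sample size
avg_error = sum(errors) / float(len(errors))
var_error = tvar(errors)             # sample variance
std_error = math.sqrt(var_error) / math.sqrt(sample_size)
span_95 = norm.interval(0.95, avg_error, std_error)
print(span_95)                       # (lower, upper) bound around the mean error
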
Example #5
def cross_validate_disc_version(algorithm, tab_file, min_support=-30, sample_pct=0.1, iterations=1, only_interesting_triples=False, restricted_triples=None, extra_id=''):
    from subprocess import call
    from parsers import Borgelt

    cv_start = time()

    # Create work folder
    _id = str(time()).replace('.','') + '_' + extra_id
    path = '../tmp/cv_' + _id + '/'
    os.mkdir(path)
    print "\n### Running cross validation cv_{}###".format(_id)

    total_transactions = 0
    for line in open(tab_file, 'rb'):
        total_transactions += 1
    print 'Total transactions: ', total_transactions

    # Get the total observed triples
    borgelt_start = time()
    observed_file_name = path + 'observed_frequent_items.out'
    args = [algorithm, tab_file, observed_file_name, '-s' + str(min_support), '-n3']
    # pro = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True, preexec_fn=os.setsid)
    # os.killpg(pro.pid, signal.SIGTERM)
    call(args)
    # sleep(20)
    print 'fpgrowth on all data done: {} secs'.format(time()-borgelt_start)

    freq = Borgelt.read_frequent_items(observed_file_name)

    # Create ds of all observed triplets
    # Saved as sorted keys for lookup,
    # and their frequency as value
    observed = {}
    count = 0
    for item in freq:
        if len(item[0]) == 3:
            sorted_trip = triple_sort(item[0])
            observed[sorted_trip] = item[1][0]
    print 'Total triplets observed:', len(observed)
    average_observed = sum(observed.values()) / float(len(observed))
    print 'Baseline: ', average_observed

    del freq

    avg_errors = []
    var_errors = []
    avg_errors_ext = []
    var_errors_ext = []
    avg_errors_heu = []
    var_errors_heu = []
    avg_errors_ind = []
    var_errors_ind = []
    avg_errors_baseline = []

    occurrences = [0 for i in range(100)]
    max_ent_acc_error = [0 for i in range(100)]
    ext_acc_error = [0 for i in range(100)]
    ind_acc_error = [0 for i in range(100)]
    heu_acc_error = [0 for i in range(100)]
    baseline_acc_error = [0 for i in range(100)]

    # Record trip counts for the best estimates
    max_ent_best = Counter()
    ext_best = Counter()
    ind_best = Counter()

    for index in range(iterations):

        # Create sample file
        sampling_start = time()
        if sample_pct > 0:
            sample_size= int(total_transactions*sample_pct)
        else:
            sample_size = abs(sample_pct)
        test_data_size = total_transactions - sample_size
        sample = random.sample(range(total_transactions), sample_size)
        assert len(sample) == sample_size, 'Sample size not equal to sample'
        sample.sort()
        sample_file_name = path + str(index) + '_sample.tab'
        with open(sample_file_name, 'a') as sample_file:
            sample_line = 0
            for line_num, line in enumerate(open(tab_file, 'rb')):
                if line_num == sample[sample_line]:
                    sample_file.write(line)
                    sample_line += 1
                    if sample_line == sample_size:
                        break

        del sample
        print 'Sample size: {} time: {}'.format(sample_size, time() - sampling_start)
        borgelt_start = time()
        sample_freq_name = path + str(index) + '_sample_frequent_items.out'
        args = [algorithm, sample_file_name, sample_freq_name, '-s-1', '-n3']
        call(args)
        print 'fpgrowth on sample data done: {} secs'.format(time()-borgelt_start)

        # Check any frequent items were found
        if not os.path.exists(sample_freq_name):
            print 'No frequent items found'
            print 'args', args
            continue

        min_support_trips = min_supported_trips(min_support, test_data_size)
        print 'Forward min_support_trips set to: ', min_support_trips
        triangles_start = time()
        triangle_tree, sample_triples = Forward.forward_compact(sample_freq_name, min_support_trips, observed, only_interesting_triples, restricted_triples)
        print 'Found triangles done: {}'.format(time() - triangles_start)

        #del sample_freq

        estimates = []
        extrapolations = []
        independences = []
        heurestics = []
        baselines = []
        observations = []

        triplets = []
        MAPE_errors = []
        MAPE_errors_ext = []
        MAPE_errors_ind = []
        MAPE_errors_heu = []
        MAPE_errors_baseline = []
        true_errors = []
        pair_triple_ratios = []

        triangle_counts = []

        # s1_list = []
        # s2_list = []
        # s3_list = []
        # s12_list = []
        # s13_list = []
        # s23_list = []

        # Recursion for estimate to converge
        req_depth = int(math.log(total_transactions, 2)) + 1

        # DFS of the tree holding all triangles
        for n1 in triangle_tree.keys():
            s1, s2_dict = triangle_tree[n1]
            for n2 in s2_dict.keys():
                s2, s12, s3_dict = s2_dict[n2]
                for n3 in s3_dict.keys():
                    s3, s13, s23, s123 = s3_dict[n3]

                    triangle_counts.append((s1, s2, s3, s12, s13, s23, s123))

                    triangle = (n1, n2, n3)

                    pair_triple_ratio = s123 / float(min(s12, s13, s23))
                    pair_triple_ratios.append(pair_triple_ratio)

                    # Get the obs (test data) frequency minus those found in the sample (training data)
                    obs = 0
                    if triangle in observed:
                         # (triples in data) - (triples in sample). Calculating the number of triples in test data.
                        obs = observed[triangle] - s123

                    # maxent estimate
                    est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13, float(sample_size), num=req_depth) * (test_data_size / float(sample_size))

                    if est < 0:
                        print 'max ent below 0'
                        print 's1 s2 s3 s12 s23 s13 s123', (s1, s2, s3, s12, s23, s13, s123)

                    # extrapolation estimate
                    est2 = s123 / float(sample_size) * test_data_size

                    # independence estimate
                    est3 = (s1 / float(sample_size)) * (s2 / float(sample_size)) * (s3 / float(sample_size)) * test_data_size
                    # est3 = (s1*s2*s3)/float(sample_size*sample_size) * test_data_size/float(sample_size)

                    # heuristic: use max_ent when the triple occurs fewer than 5 times in the sample
                    est4 = est if s123 < 5 else est2

                    # baseline estimate
                    est5 = average_observed

                    estimates.append(est)
                    extrapolations.append(est2)
                    independences.append(est3)
                    heurestics.append(est4)
                    baselines.append(est5)
                    observations.append(obs)
                    triplets.append(triangle)
                    # TODO Why do we save these? They already exist in the triangle tree
                    # (and take up a lot of space).
                    # s1_list.append(s1)
                    # s2_list.append(s2)
                    # s3_list.append(s3)
                    # s12_list.append(s12)
                    # s13_list.append(s13)
                    # s23_list.append(s23)
                    #end TODO

                    # MAPE error max ent
                    error = abs(obs-est) / math.sqrt(obs) # * 100
                    MAPE_errors.append(error)
                    true_errors.append(obs-est)

                    # MAPE error extrapolation
                    error2 = 0
                    if est2 > 0:
                        error2 = abs(obs-est2) / math.sqrt(obs) # * 100
                    MAPE_errors_ext.append(error2)

                    # MAPE error independence
                    error3 = abs(obs-est3) / math.sqrt(obs) # * 100
                    MAPE_errors_ind.append(error3)

                    # MAPE error heuristic
                    error4 = abs(obs-est4) / math.sqrt(obs) # * 100
                    MAPE_errors_heu.append(error4)

                    # MAPE baseline error
                    error5 = abs(obs-est5) / math.sqrt(obs) #* 100
                    MAPE_errors_baseline.append(error5)

                    # Record error for the estimate that performed best
                    if error < error2 and error < error3:
                        max_ent_best[s123] += 1
                    elif error2 < error and error2 < error3:
                        ext_best[s123] += 1
                    else:
                        ind_best[s123] += 1

                    try:
                        occurrences[s123] += 1
                        max_ent_acc_error[s123] += error
                        ext_acc_error[s123] += error2
                        ind_acc_error[s123] += error3
                        heu_acc_error[s123] += error4
                        baseline_acc_error[s123] += error5
                    except IndexError:
                        pass


        # print 'true errors: ', true_errors
        # print 'estimates: ', estimates
        # print 'observed: ', observed
        # print 'mape ', MAPE_errors
        del triangle_tree
        del sample_triples

        if len(MAPE_errors) > 0: #TODO handle this, probably when nothing has been found

            min_error = min(MAPE_errors)
            max_error = max(MAPE_errors)

            # max ent error
            avg_error = sum(MAPE_errors) / float(len(MAPE_errors))
            avg_errors.append(avg_error)

            # extrapolation error
            avg_error_ext = sum(MAPE_errors_ext) / float(len(MAPE_errors_ext))
            avg_errors_ext.append(avg_error_ext)

            # independence error
            avg_error_ind = sum(MAPE_errors_ind) / float(len(MAPE_errors_ind))
            avg_errors_ind.append(avg_error_ind)

            # heuristic error
            avg_error_heu = sum(MAPE_errors_heu) / float(len(MAPE_errors_heu))
            avg_errors_heu.append(avg_error_heu)

            # baseline error
            avg_error_baseline = sum(MAPE_errors_baseline) / float(len(MAPE_errors_baseline))
            avg_errors_baseline.append(avg_error_baseline)

            var_error = 0
            var_error_ext = 0
            var_error_heu = 0
            var_error_ind = 0
            # variance
            if len(MAPE_errors) > 1:
                var_error = tvar(MAPE_errors) #tvar is the sample variance
                var_error_ext = tvar(MAPE_errors_ext)
                var_error_heu = tvar(MAPE_errors_heu)
                var_error_ind = tvar(MAPE_errors_ind)


            # max_ent confidence interval
            std_dev = math.sqrt(var_error)
            std_error = std_dev / math.sqrt(sample_size)
            span_99 = norm.interval(0.99, avg_error, std_error)
            span_95 = norm.interval(0.95, avg_error, std_error)

            # ext confidence interval
            std_dev_ext = math.sqrt(var_error_ext)
            std_error_ext = std_dev_ext / math.sqrt(sample_size)
            span_99_ext = norm.interval(0.99, avg_error_ext, std_error_ext)
            span_95_ext = norm.interval(0.95, avg_error_ext, std_error_ext)

            # independence confidence interval
            std_dev_ind = math.sqrt(var_error_ind)
            std_error_ind = std_dev_ind / math.sqrt(sample_size)
            span_99_ind = norm.interval(0.99, avg_error_ind, std_error_ind)
            span_95_ind = norm.interval(0.95, avg_error_ind, std_error_ind)

            # heuristic confidence interval
            std_dev_heu = math.sqrt(var_error_heu)
            std_error_heu = std_dev_heu / math.sqrt(sample_size)
            span_99_heu = norm.interval(0.99, avg_error_heu, std_error_heu)
            span_95_heu = norm.interval(0.95, avg_error_heu, std_error_heu)

            var_errors.append(var_error)
            var_errors_ext.append(var_error_ext)
            var_errors_heu.append(var_error_heu)
            var_errors_ind.append(var_error_ind)

            res_string = "\nResult ({}):\nSample size:{} triangles:{} test_data:{}\n".format(index, sample_size, len(estimates), total_transactions-sample_size)
            # log max ent result
            res_string += "avg_error:{} var_error:{}\n".format(avg_error, var_error)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95))

            res_string += 'avg_error_ext:{} var_error_ext:{}\n'.format(avg_error_ext, var_error_ext)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ext))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ext))

            res_string += 'avg_error_ind:{} var_error_ind:{}\n'.format(avg_error_ind, var_error_ind)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ind))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ind))

            res_string += 'avg_error_heu:{} var_error_heu:{}\n'.format(avg_error_heu, var_error_heu)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_heu))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_heu))

            res_string += 'avg_error_baseline:{}\n'.format(avg_error_baseline)

            with open(path + str(index) + '_log.txt', 'a') as log_file:
                log_file.write(res_string)
            print res_string

            # Write result data
            with open(path + str(index) + '_data.json', 'w') as fd:
                # triplet_key = ['triple' for t in estimates]
                # est_key = ['est' for t in estimates]
                # obs_key = ['obs' for t in observations]
                fd.write(json.dumps(zip(triplets, zip(estimates, observations))))
            with open(path + str(index) + '_data.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(estimates):
                    fd.write(str(estimates[_index]) + '\t' + str(observations[_index]) + '\t' + str(triplets[_index][0]) + '\t' + str(triplets[_index][1]) + '\t' + str(triplets[_index][2]) + '\t' + str(pair_triple_ratios[_index]) + '\t' + str(triangle_counts[_index][0]) + '\t' + str(triangle_counts[_index][1]) + '\t' + str(triangle_counts[_index][2]) + '\t' + str(triangle_counts[_index][3]) + '\t' + str(triangle_counts[_index][4]) + '\t' + str(triangle_counts[_index][5]) + '\t' + str(triangle_counts[_index][6]) + '\n')
            with open(path + str(index) + '_data_extrapolation.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(estimates):
                    fd.write(str(extrapolations[_index]) + '\t' + str(observations[_index]) + '\t' + str(triplets[_index][0]) + '\t' + str(triplets[_index][1]) + '\t' + str(triplets[_index][2]) + '\t' + str(pair_triple_ratios[_index]) + '\t' + str(triangle_counts[_index][0]) + '\t' + str(triangle_counts[_index][1]) + '\t' + str(triangle_counts[_index][2]) + '\t' + str(triangle_counts[_index][3]) + '\t' + str(triangle_counts[_index][4]) + '\t' + str(triangle_counts[_index][5]) + '\t' + str(triangle_counts[_index][6]) + '\n')
            with open(path + str(index) + '_data_heurestic.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(heurestics):
                    fd.write(str(heurestics[_index]) + '\t' + str(observations[_index]) + '\t' + str(triplets[_index][0]) + '\t' + str(triplets[_index][1]) + '\t' + str(triplets[_index][2]) + '\t' + str(pair_triple_ratios[_index]) + '\t' + str(triangle_counts[_index][0]) + '\t' + str(triangle_counts[_index][1]) + '\t' + str(triangle_counts[_index][2]) + '\t' + str(triangle_counts[_index][3]) + '\t' + str(triangle_counts[_index][4]) + '\t' + str(triangle_counts[_index][5]) + '\t' + str(triangle_counts[_index][6]) + '\n')
            with open(path + str(index) + '_data_independece.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(independences):
                    fd.write(str(independences[_index]) + '\t' + str(observations[_index]) + '\t' + str(triplets[_index][0]) + '\t' + str(triplets[_index][1]) + '\t' + str(triplets[_index][2]) + '\t' + str(pair_triple_ratios[_index]) + '\t' + str(triangle_counts[_index][0]) + '\t' + str(triangle_counts[_index][1]) + '\t' + str(triangle_counts[_index][2]) + '\t' + str(triangle_counts[_index][3]) + '\t' + str(triangle_counts[_index][4]) + '\t' + str(triangle_counts[_index][5]) + '\t' + str(triangle_counts[_index][6]) + '\n')

            # Save the errors
            with open(path + str(index) + '_MAPE_errors.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors, fd)
            with open(path + str(index) + '_MAPE_errors_ext.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_ext, fd)
            with open(path + str(index) + '_MAPE_errors_heu.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_heu, fd)
            with open(path + str(index) + '_MAPE_errors_ind.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_ind, fd)
            with open(path + str(index) + '_MAPE_errors_baseline.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_baseline, fd)

            #saves amounts of all subsets of triples.
            # TODO this code does not run!
            # with open(path + str(index) + '_data_correlations.tsv', 'w') as fd:
            #     fd.write('s1\ts2\ts3\ts12\ts13\ts23\n')
            #     for _index, i in enumerate(s123):
            #         fd.write(str(s1[_index]) + '\t' + str(s2[_index]) + '\t' + str(s3[_index]) + '\t' + str(s12[_index]) + '\t' + str(s13[_index]) + '\t'+ str(s23[_index]) + '\n')

            #saves independence estimate for all triples.
            # TODO Why s123[_index] in the denominator?
            # TODO What is a 'double independece estimat'?
            # TODO Why not calculate and save estimates in the same way as ext and max_ent?
            # with open(path + str(index) + '_independence_estimate.tsv', 'w') as fd:
            #     fd.write('single independence estimate\tdouble independence estimate\n')
            #     for _index, i in enumerate(s123):
            #     	tempVal1 = sample_size/(s1[_index])
            #     	tempVal2=sample_size/(s2[_index])
            #     	tempVal3=sample_size/(s3[_index])
            #     	tempVal12=sample_size/(s12[_index])
            #     	tempVal13=sample_size/(s13[_index])
            #     	tempVal23=sample_size/(s23[_index])
            #         fd.write(str(s123[_index]/tempVal1*tempVal2*tempVal3*(total_transactions-sample_size) + '\t' + s123[_index]/tempVal12*tempVal13*tempVal23*(total_transactions-sample_size) + '\n'))


            del estimates
            del observations

            # remove tmp files
            # os.remove(sample_freq_name)
            # os.remove(sample_file_name)

        else:
            print 'No abs errors!'
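
For comparison, the two closed-form estimates computed above, recomputed on hypothetical counts. The max-entropy estimate depends on the project-specific ent.maxent_est_rosa and is not reproduced here.

sample_size = 1000.0       # transactions in the sample (hypothetical)
test_data_size = 9000.0    # transactions in the test data (hypothetical)
s1, s2, s3, s123 = 120, 90, 60, 4

est_extrapolation = s123 / sample_size * test_data_size
est_independence = (s1 / sample_size) * (s2 / sample_size) * (s3 / sample_size) * test_data_size
print(est_extrapolation)   # 36.0
print(est_independence)    # ~5.83
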
Example #6
    def forward(self, frequent_items):
        """
        Run the forward algorithm for finding
        triangles.
        """
        keys = []
        singletons = {}
        pairs = {}
        triples = {}
        V = {}

        # build data structures
        for itemset, (support,) in frequent_items:
            l = len(itemset)
            if l == 1:
                singletons[itemset[0]] = support
            elif l == 2:
                a, b = itemset[0], itemset[1]
                sorted_pair = itemset if a < b else (b, a)
                pairs[sorted_pair] = support
                self.__add_to_nodes(V, a, b, keys)
            elif l == 3:
                triples[triple_sort(itemset)] = support
            else:
                assert False, "frequent itemsets larger than 3 are not used"

        # print 'Forward space usage in mb:'
        # print 'keys: ', mem_size.bytes_to_mb(sys.getsizeof(keys))
        # print 'singletons: ', mem_size.bytes_to_mb(sys.getsizeof(keys))
        # print 'pairs: ', mem_size.bytes_to_mb(sys.getsizeof(keys))
        # print 'triples: ', mem_size.bytes_to_mb(sys.getsizeof(keys))
        # This is wrong, needs to traverse the graph
        # print 'V : ', mem_size.bytes_to_mb(sys.getsizeof(V))


        # Keys have to be sorted for running Forward
        keys.sort()

        res = []
        A = {}
        for key in keys:
            A[key] = set()
        for s in keys:
            adj = V[s]
            for t in adj:
                if s < t:
                    for v in A[s]:
                        if v in A[t]:
                            n1, n2, n3 = triple_sort((v, s, t))
                            s1 = (n1, singletons[n1])
                            s2 = (n2, singletons[n2])
                            s3 = (n3, singletons[n3])
                            s12 = ((n1,n2), pairs[(n1,n2)])
                            s23 = ((n2,n3), pairs[(n2,n3)])
                            s13 = ((n1,n3), pairs[(n1,n3)])

                            c = 0
                            if triples.has_key((n1,n2,n3)):
                                c = triples[(n1,n2,n3)]

                            s123 = ((n1,n2,n3), c)

                            res.append((s1,s2,s3,s12,s23,s13,s123))

                    A[t].add(s)
        # print 'A', mem_size.bytes_to_mb(sys.getsizeof(A))
        # print 'res', mem_size.bytes_to_mb(sys.getsizeof(res))
        # print 'triangles found: ', len(res)

        return res, triples
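
The core of the forward triangle-listing pass in isolation, on a toy undirected adjacency map. The data is hypothetical; the variable names merely mirror the method above.

V = {'a': ['b', 'c'], 'b': ['a', 'c'], 'c': ['a', 'b']}   # toy undirected graph
keys = sorted(V)                                          # nodes must be visited in sorted order
A = dict((k, set()) for k in keys)                        # smaller neighbours seen so far
triangles = []
for s in keys:
    for t in V[s]:
        if s < t:
            for v in A[s] & A[t]:                         # a common earlier neighbour closes a triangle
                triangles.append(tuple(sorted((v, s, t))))
            A[t].add(s)
print(triangles)                                          # [('a', 'b', 'c')]
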
Example #7
    def forward_compact(self, frequent_items_file, min_support, observed, only_interesting_triples, restricted_triples):
        """
        Run the forward algorithm for finding
        triangles.
        Found triangles are stored as a (compact) tree 
        """
        keys = []
        singletons = {}
        pairs = {}
        triples = {}
        V = {}

        # build data structures
        for index, line in enumerate(open(frequent_items_file, 'rb')):
            # if index % 1000000 == 0:
            #     print 'Building ds. lines read: ', index
            #     print 'singletons size: ', mem_size.bytes_to_mb(sys.getsizeof(singletons))
            #     print 'pairs size: ', mem_size.bytes_to_mb(sys.getsizeof(pairs))
            #    # print 'triples size: ', mem_size.bytes_to_mb(Forward.triplets_size(triples))
            #     print 'keys size: ', mem_size.bytes_to_mb(sys.getsizeof(keys))
            #     Forward.graph_size(V)

            chunks = line.split() # ex: a b c (42)
            itemset = tuple(chunks[:-1])
            support = int(chunks[-1].replace('(', '').replace(')', ''))

            # Build the graph to be searched for triangles, and save
            # itemsets to dicts so we can easily look up their support
            # in Forward when triangles are found.
            l = len(itemset)
            if l == 1:
                singletons[itemset[0]] = support
            elif l == 2:
                a, b = itemset[0], itemset[1]
                sorted_pair = itemset if a < b else (b, a)
                pairs[sorted_pair] = support # Store this support in the graph?
                self.__add_to_nodes(V, a, b, keys)
            elif l == 3:
                a, b, c = triple_sort(itemset)
                if not a in triples:
                    triples[a] = {b: {c: support}}
                else:
                    a_dict = triples[a]        
                    if not b in a_dict:
                        a_dict[b] = {c: support}
                    else:
                        b_dict = a_dict[b]
                        b_dict[c] = support
                #triples[] = support
            else:
                assert False, "frequent itemsets larger than 3 are not used"
        # print 'Done building ds'
        # print 'Forward space usage in mb:'
        # print 'keys: ', mem_size.bytes_to_mb(sys.getsizeof(keys))
        # print 'singletons: ', mem_size.bytes_to_mb(sys.getsizeof(keys))
        # print 'pairs: ', mem_size.bytes_to_mb(sys.getsizeof(keys))
        # print 'triples: ', mem_size.bytes_to_mb(sys.getsizeof(keys))
        # This is wrong, needs to traverse the graph
        # print 'V : ', mem_size.bytes_to_mb(sys.getsizeof(V))


        # Keys have to be sorted for running Forward
        keys.sort()
        # print 'keys sorted. keys: ', len(keys)

        res = {}
        A = {}
        for key in keys:
            A[key] = set()
        for index, s in enumerate(keys):
            adj = V[s]
            # if index % 10000 == 0:
            #     print 'key index: ', index
            #     print 'pct done: ', (index / float(len(keys)))
            #     print 'cache size: ', Forward.graph_size(A)
            #     print 'longest cache list'
            #     max_ = -1
            #     sum_ = 0
            #     for l in A: 
            #         length = len(l)
            #         if length > max_:
            #             max_ = length
            #         sum_ += length
            #     print max_
            #     lists = float(len(A))
            #     print 'avg length: ', (sum_ / lists)
            #     print 'lists', lists
            for t in adj:
                # if index > 40000 and index % 500 == 0:
                #     print 'adj len: ', len(adj)
                if s < t:
                    for v in A[s]:
                        # if index > 40000 and index % 500 == 0:
                        #     print 'A[s] len: ', len(A[s])
                        if v in A[t]:
                            n1, n2, n3 = triple_sort((v, s, t))

                            # get triple support
                            triple_support = 0
                            try:
                                triple_support = triples[n1][n2][n3]
                            except KeyError as ke:
                                pass

                            # check if this triple has sufficient support
                            # in the observed data
                            observed_triples = 0
                            if (n1, n2, n3) in observed:
                                observed_triples = observed[(n1, n2, n3)] - triple_support
                            if observed_triples < min_support:
                                continue

                            if only_interesting_triples and triple_support != 0:
                                continue

                            if not restricted_triples is None and not (n1, n2, n3) in restricted_triples:
                                continue

                            # At this point this is a triangle/triple to be estimated.
                            # Triangles are held in a tree ds, with no root node,
                            # and sorted nodes to reuse singletons and pairs
                            # that occur more than once.
                            if not n1 in res:
                                res[n1] = (singletons[n1], {})

                            n1_dict = res[n1][1]
                            
                            if not n2 in n1_dict:
                                n1_dict[n2] = (singletons[n2], pairs[(n1,n2)], {})

                            n2_dict = n1_dict[n2][2]
                            
                            if not n3 in n2_dict:
                                n2_dict[n3] = (singletons[n3], pairs[(n1, n3)], pairs[(n2, n3)], triple_support)
                            else:
                                assert False, 'Triplets can only be found once!'

                    A[t].add(s)
        # print 'A', mem_size.bytes_to_mb(sys.getsizeof(A))
        # This is wrong, needs to traverse the graph
        # print 'res', mem_size.bytes_to_mb(sys.getsizeof(res))
        print 'forward done'
        t_count = 0
        for k in res.keys():
           d2 = res[k][1]
           for k2 in d2.keys():
               t_count += len(d2[k2][2].keys())
        print 'triangles found :', t_count
        return res, triples
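
For reference, the shape of the compact triangle tree returned above and the traversal the cross-validation code applies to it: n1 maps to (s1, {n2: (s2, s12, {n3: (s3, s13, s23, s123)})}). The counts below are hypothetical.

triangle_tree = {
    'a': (120, {'b': (90, 30, {'c': (60, 25, 20, 4)})}),   # hypothetical supports
}
for n1, (s1, s2_dict) in triangle_tree.items():
    for n2, (s2, s12, s3_dict) in s2_dict.items():
        for n3, (s3, s13, s23, s123) in s3_dict.items():
            print((n1, n2, n3, s1, s2, s3, s12, s13, s23, s123))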