示例#1
0
def find_optimum(relation, k):
    """Exhaustively search for the cheapest formula subsets of ``relation``.

    Every candidate produced by ``subsets(formulas, k)`` is scored with
    ``cost(candidate, relation)``, so the run time grows with the number of
    candidates.

    Args:
        relation: The relation to summarise; handed unchanged to
            ``get_all_formulas`` and ``cost``.
        k: Forwarded to ``subsets`` (presumably the maximum number of
            formulas per candidate -- confirm against ``subsets``).

    Returns:
        A ``(best_cost, best)`` tuple. ``best`` is a list of
        ``(cost, subset)`` pairs, one for every candidate whose cost equals
        ``best_cost``, in the order ``subsets`` produced them.

    Raises:
        IndexError: If ``subsets`` yields no candidates at all.
    """
    formulas = get_all_formulas(relation, True)

    logger.info("# formulas: %s", len(formulas))

    logger.debug("possible formulas: %s", relation_rep(formulas))

    all_subsets = list(subsets(formulas, k))

    logger.info("# subsets: %s", len(all_subsets))

    if not all_subsets:
        # Same exception type the unguarded ``ordered[0]`` lookup raised,
        # but with a message that says what actually went wrong.
        raise IndexError("no candidate subsets to evaluate")

    scored = [(cost(candidate, relation), candidate) for candidate in all_subsets]

    # Only the minimum is needed, so a full sort is unnecessary. Filtering in
    # generation order gives the same sequence a stable sort would.
    best_cost = min(entry[0] for entry in scored)

    # A real list (not a lazy ``filter`` object) so callers can iterate it
    # more than once on any Python version.
    best = [entry for entry in scored if entry[0] == best_cost]

    return best_cost, best
示例#2
0
def find_optimum(relation, k):
    """Find the optimum sets of formulas and their cost.

    This method generates all formulas, all subsets of formulas and then
    calculates the cost for every one of them. This can be very slow.

    Args:
        relation: The relation to summarise; handed unchanged to
            ``get_all_formulas`` and ``cost``.
        k: Forwarded to ``subsets`` (presumably the maximum number of
            formulas per subset -- confirm against ``subsets``).

    Returns:
        A ``(best_cost, best)`` tuple, where ``best`` is a list holding a
        ``(cost, subset)`` pair for every subset that reaches ``best_cost``,
        in the order ``subsets`` generated them.

    Raises:
        IndexError: If ``subsets`` generates nothing to evaluate.
    """
    formulas = get_all_formulas(relation, True)

    logger.info('# formulas: %s', len(formulas))

    logger.debug('possible formulas: %s', relation_rep(formulas))

    all_subsets = list(subsets(formulas, k))

    logger.info('# subsets: %s', len(all_subsets))

    if not all_subsets:
        # Keeps the exception type of the former unguarded ``ordered[0]``
        # lookup while explaining the failure.
        raise IndexError('no candidate subsets to evaluate')

    subset_costs = [cost(subset, relation) for subset in all_subsets]

    # The minimum is all that is used, so skip the O(n log n) sort.
    best_cost = min(subset_costs)

    # Built eagerly: ``filter`` would hand back a one-shot iterator on
    # Python 3. Order matches what a stable sort by cost produced.
    best = [
        (subset_cost, subset)
        for subset_cost, subset in zip(subset_costs, all_subsets)
        if subset_cost == best_cost
    ]

    return best_cost, best
示例#3
0
    def test_repr(self):
        """Check the strings produced by ``tuple_rep`` and ``relation_rep``."""
        # Build a real list: on Python 3 ``map`` returns a lazy iterator,
        # which never compares equal to a list.
        rendered = [tuple_rep(row) for row in read_sample(example)]
        self.assertEqual(rendered,
                         ['x[0]:a x[0]:b y[0]:c', 'y[0]:c', 'x[0]:a x[1]:d y[0]:c'])

        # The sample is read again rather than reused, as in the first check.
        self.assertEqual(relation_rep(read_sample(example)),
                         'x[0]:a x[0]:b y[0]:c\ny[0]:c\nx[0]:a x[1]:d y[0]:c')
0
def find_incremental(relation, k):
    """Greedily grow a summary of at most ``k`` formulas, one cell per round.

    Each round tries the remaining cells either as a new single-cell formula
    (while the summary holds fewer than ``k`` formulas) or merged into one of
    the existing formulas, and adopts the cheapest summary seen in that
    round. The search ends when a round finds nothing cheaper than the
    current summary.

    Args:
        relation: The relation to summarise; handed unchanged to
            ``get_cells``, ``potential`` and ``cost``.
        k: Maximum number of formulas allowed in the summary.

    Returns:
        A ``(best_cost, summary)`` tuple. ``summary`` is a set of frozensets
        of cells; ``best_cost`` stays ``float("inf")`` (with an empty
        summary) when no candidate was ever evaluated as an improvement.

    Raises:
        AssertionError: If the cost reduction observed for a cell disagrees
            with ``potential`` for that cell. Unlike a bare ``assert`` this
            is raised even when Python runs with ``-O``.
    """
    all_cells = _rank_cells(
        [(potential({c}, set(), relation), c) for c in get_cells(relation)]
    )

    summary = set()
    best_cost = float("inf")

    while True:
        improved_summary = None
        improved_cost = best_cost

        for i, (p, c) in enumerate(all_cells):
            # Cells start each round ordered by decreasing potential, so once
            # a cell's potential is below the gain already achieved this
            # round the scan stops. While both costs are still infinite the
            # difference is NaN, the comparison is False and scanning goes on.
            if p < best_cost - improved_cost:
                break

            is_better = False
            for candidate in _candidate_summaries(summary, c, k):
                candidate_cost = cost(candidate, relation)
                if candidate_cost < improved_cost:
                    improved_summary = candidate
                    improved_cost = candidate_cost
                    is_better = True

            # Refresh the cell's potential against the current summary.
            n = potential({c}, summary, relation)
            if is_better and best_cost != float("inf"):
                # Sanity check: the gain just measured must equal the
                # potential reported for this cell.
                # NOTE(review): exact comparison; if costs can be
                # non-integral floats this may need a tolerance -- confirm.
                gain = best_cost - improved_cost
                if gain != n:
                    raise AssertionError(
                        "cost reduction %s (from %s to %s) does not match "
                        "potential %s of cell %s for summary:\n%s"
                        % (gain, best_cost, improved_cost, n, c,
                           relation_rep(summary))
                    )
                n = gain
            logger.debug("update from %s to %s", p, n)

            all_cells[i] = (n, c)

        # nothing to improve, stop
        if improved_summary is None:
            break

        summary = improved_summary
        best_cost = improved_cost

        # Potentials changed during the round: drop exhausted cells, re-sort.
        all_cells = _rank_cells(all_cells)

    return best_cost, summary


def _rank_cells(cells):
    """Return the ``(potential, cell)`` pairs with potential > 0, highest first."""
    ranked = [entry for entry in cells if entry[0] > 0]
    ranked.sort(reverse=True)
    return ranked


def _candidate_summaries(summary, cell, k):
    """Yield every summary obtained by adding ``cell`` to ``summary``."""
    # Option 1: open a new single-cell formula, if the budget allows.
    if len(summary) < k:
        yield summary | {frozenset({cell})}

    # Option 2: extend each existing formula in turn with the cell.
    for formula in summary:
        grown = summary - {formula}
        grown.add(formula | {cell})
        yield grown
示例#5
0
        best_cost = improved_cost

        # resort cells
        all_cells = [x for x in all_cells if x[0] > 0]
        all_cells.sort()
        all_cells.reverse()

    return best_cost, summary


if __name__ == '__main__':
    # Demo: summarise the small sample relation and print the result.
    logger.setLevel(logging.INFO)

    relation = read_sample(three_attr_null)

    # ``print(...)`` with a single argument behaves the same whether print
    # is a statement (Python 2) or a function (Python 3).
    print(relation_rep(relation))

    print('')
    best_cost, best = find_incremental(relation, 3)

    # Format explicitly: ``print('Best cost:', best_cost)`` prints a tuple
    # repr when print is a statement.
    print('Best cost: %s' % (best_cost,))
    print(relation_rep(best))

    # speedtest
    logger.setLevel(logging.WARNING)

    def run():
        relation = read_sample(larger_example)
        return find_incremental(relation, 5)

    best_cost, best = run()
示例#6
0
    subset_costs = map(lambda x: cost(x, relation), all_subsets)

    ordered = [x for x in sorted(zip(subset_costs, all_subsets), key=lambda x: x[0])]

    best_cost = ordered[0][0]
    best = filter(lambda x: x[0] == best_cost, ordered)

    return best_cost, best


if __name__ == "__main__":
    # Demo: exhaustively search the small sample relation and print every
    # optimal subset.
    logger.setLevel(logging.INFO)

    relation = read_sample(three_attr_null)

    # ``print(...)`` with a single argument behaves the same whether print
    # is a statement (Python 2) or a function (Python 3).
    print(relation_rep(relation))

    best_cost, best = find_optimum(relation, 3)

    # Format explicitly: ``print ("Best cost:", best_cost)`` prints a tuple
    # repr when print is a statement.
    print("Best cost: %s" % (best_cost,))

    # Each entry of ``best`` is a (cost, subset) pair; show the subsets.
    for _, subset in best:
        print(relation_rep(subset))

    # speedtest
    # logger.setLevel(logging.WARN)

    # def run():
    #     relation = read_sample(larger_example)
    #     return find_optimum(relation, 5)

    # best_cost, best = run()