def attribute_exploration_pps(tuples):
    """Attribute exploration over a relation to mine functional dependencies (FDs).

    Variant using PLIs (position list indexes / partitions) built per attribute.
    Attributes are re-ordered by their PLIs, the rows are re-encoded as cluster
    ids, and a NextClosure-style loop (``fast_next_closure``) enumerates closed
    attribute sets, registering FDs in an ``FDTree`` and counterexample rows in
    a growing sampling context (``g_prime``/``m_prime``).  Progress and final
    statistics are printed to stdout; nothing is returned.

    NOTE(review): this file contains several same-named variants of this
    function; only the last definition is effective at import time.
    """
    U = range(len(tuples[0]))  # Attributes
    n_atts = len(U)
    # m_prime[m]: indexes (into g_prime) of sampled rows having attribute m;
    # g_prime[g]: the attribute set of sampled row g.
    m_prime = [set([]) for i in range(len(U))]
    g_prime = []
    stats = Stats()
    # fctx is not referenced again below; presumably FormalContext keeps
    # g_prime/m_prime by reference — confirm.
    fctx = FormalContext(g_prime, m_prime)
    # Sentinel extent: m_prime[-1] is read while m_i == -1 and seeds the stack.
    m_prime.append(set([]))  # THIS SHOULD BE AFTER DECLARING THE FORMAL CONTEXT
    print("Processing data... ", end='')
    sys.stdout.flush()
    # Column-major view: representations[j] is the column of attribute j.
    representations = [[row[j] for row in tuples] for j in U]
    print("done")
    # ATTRIBUTE ORDERING
    print("Building representations... ", end='')
    sys.stdout.flush()
    # One PLI (partition of row indexes into equal-value clusters) per attribute.
    plis = [(build_pli(r), ri) for ri, r in enumerate(representations)]
    print("done")
    print("Ordering... ", end='')
    sys.stdout.flush()
    # ATTRIBUTE ORDERING
    # ex_order = [290, 17, 7, 7, 489, 14, 10, 31, 509, 6, 341, 151, 16, 28, 49, 4, 1, 19, 571, 810, 6, 8, 17]
    # plis.sort(key=lambda k: ex_order[k[1]], reverse=False) # Lexicographic
    plis.sort(key=lambda k: k[0], reverse=False)  # Lexicographic
    order = {j[1]: i for i, j in enumerate(plis)}  # Original order -> new order
    inv_order = {i: j for j, i in order.items()}  # At position i should be attribute j
    # print(order)
    # print(inv_order)
    # exit()
    # reco_order = { }
    plis = [i[0] for i in plis]
    # build_pli(representations[ inv_order[i] ]) for i in range(n_atts) ]
    print("done")
    print("Reconverting... ", end='')
    # Re-encode rows under the new attribute order: each cell becomes the id of
    # its cluster in that attribute's PLI, or None if no cluster covers the row.
    tuples = [[None] * n_atts for i in range(len(tuples))]
    # print(plis)
    # not_none = [0 for i in range(len(tuples))]
    for att in range(n_atts):
        att = inv_order[att]
        for i, cluster in enumerate(plis[att]):
            for row in cluster:
                tuples[row][att] = i
    print("done")
    # print(records)
    # print(tuples)
    # for ti, t in enumerate(tuples):
    #     tuples[ti] = [t[inv_order[i]] if any(ti in part for part in plis[i]) else None for i in range(len(t))]
    #     tuples[ti] = [t[inv_order[i]] for i in range(len(t))]
    #     print(tuples[ti], ti, )
    # print (tuples)
    # tuples=records
    # records[]
    # print(plis)
    # # END ORDERING
    # VARIABLES FOR FAST STACKED NEXT CLOSURE
    Mjs = [set() for i in range(n_atts)]
    stack = [[None, m_prime[-1]], [None, set([]), Mjs]]
    # INITIALIZATION VARIABLES
    X = set([])
    fdt = FDTree(U)
    m_i = -1  # WE START WITH THE EMPTY INTENT REPRESENTED BY THIS
    # COUNTERS TO KEEP SOME PERFORMANCE STATISTICS
    cycles = 0
    cycles2 = 0
    avoided_closures = 0
    ncls = 0
    # NOTE(review): X is a set and U a range, so ``X != U`` is always True in
    # Python 3; termination presumably comes from fast_next_closure — confirm.
    while X != U:
        cycles += 1
        if cycles % 1000 == 0:
            print("\rFDs:{}/{}/{}/{}/{} - {: <100}".format(
                fdt.n_fds, cycles, cycles2, len(g_prime),
                round((sum([len(mp) for mp in m_prime])) / len(m_prime)),
                ','.join(map(str, sorted(X)))), end='')  #stack
            sys.stdout.flush()
        # XJ: sampled rows containing X (restricted to those containing m_i);
        # XJJ: their common attributes = estimated closure of X.
        XJ = stack[-2][1].intersection(m_prime[m_i])
        if bool(XJ):
            # XJJ = reduce(set.intersection, (g_prime[g] for g in XJ))
            XJJ = set.intersection(*[g_prime[g] for g in XJ])
            # if len(XJ) == 1:
            # XJJ = set(XJJ)
        else:
            XJJ = set(U)
        # AT THIS POINT WE HAVE XJJ WHICH IS OUR ESTIMATION OF THE CLOSURE
        # USING THE REPRESENTATION CONTEXT CALCULATED SO FAR
        # THE ACTUAL CLOSURE SHOULD BE XSS, HOWEVER IF
        # X = XJJ WE KNOW THAT XSS = XJJ AND WE CAN AVOID ITS
        # CALCULATION
        # XSS = None
        n_x = len(X)
        avoided_closures += n_x == len(XJJ)
        if n_x < len(XJJ):  # CHECKS WHETHER X==XJJ
            cycles2 += 1
            cache = []
            # check() validates X -> XJJ against the data; it appears to shrink
            # XJJ in place and collect counterexample rows in cache — confirm.
            check(X, XJJ, tuples, n_atts, cache, plis, stats)
            if n_x < len(XJJ):
                fdt.add_fd(X, XJJ)
                # break
            else:
                # Estimate collapsed to X: record the largest counterexample row
                # in the sampling context so later estimates are tighter.
                cache.sort(key=len)
                gp = cache.pop()
                n_gp = len(g_prime)
                XJ.add(n_gp)
                for i in stack[1:]:
                    i[1].add(n_gp)
                for x in gp:
                    m_prime[x].add(n_gp)
                g_prime.append(gp)
                # XJJ.intersection_update(gp)
        new_atts = XJJ - X
        if not bool(new_atts) or m_i <= min(new_atts):
            # Closure only added attributes above m_i: accept it as the next closed set.
            m_i = U[-1]
            X = XJJ
        else:
            # print(stack)
            stack[-2][2][m_i] = XJJ
            # print('\t',m_i, XJJ)
            X.difference_update([m for m in X if m > m_i])
        stack[-1][1] = XJ
        X, m_i = fast_next_closure(X, U, fdt.l_close, m_i, stats, stack)
        # ncls += c
        stack[-1][0] = m_i
    # print ('--')
    # for g in g_prime:
    #     print (g)
    L = list(fdt.read_fds())
    print("\nNUMBER OF FDS:{}".format(len(L)))
    print("SAMPLING CONTEXT SIZE:{}".format(len(g_prime)))
    print("CYCLES:", cycles)
    print("DB CHECKS:", cycles2)
    print("GOOD CLOSURES:", avoided_closures)
    print("Closures:", stats.closures)
    print("Failures:", stats.failures)
    print("Row check:", stats.row_check)
    print("Conflicting Attributes:", [stats.conflicting_attributes[order[i]] for i in range(n_atts)])
    print("Non Conflicting Attributes:", [stats.non_conflicting[order[i]] for i in range(n_atts)])
    # print("EFF:", [abs(stats.conflicting_attributes[order[i]]-stats.non_conflicting[order[i]]) for i in range(n_atts)])
    print(order)
def attribute_exploration_pps(tuples):
    """Attribute exploration mining FDs, maintaining a concept lattice with
    AddIntent to prune non-join-irreducible sampled rows.

    Attributes are re-ordered by number of distinct values, rows are permuted
    accordingly, and a NextClosure-style loop estimates each closure from the
    sampled counterexample rows (``g_prime``/``m_prime``), falling back to
    ``check()`` on the full data.  Confirmed FDs go into an ``FDTree``.
    Progress and final statistics are printed to stdout; nothing is returned.

    NOTE(review): several same-named variants exist in this file; only the last
    definition is effective at import time.
    """
    alg = AddIntentAlgorithm()
    U = range(len(tuples[0]))  # Attributes
    n_atts = len(U)
    # m_prime[m]: indexes (into g_prime) of sampled rows having attribute m;
    # g_prime[g]: the attribute set of sampled row g.
    m_prime = [set([]) for i in range(len(U))]
    g_prime = []
    rand_tuples = list(range(len(tuples)))
    # random.shuffle(rand_tuples)
    # Rows with fewer distinct values first — the order check() visits rows in,
    # presumably — confirm against check()'s implementation.
    rand_tuples.sort(key=lambda i: len(set(tuples[i])))
    fctx = FormalContext(g_prime, m_prime)
    sampled_tuples = []
    # Column-major view: representations[j] is the column of attribute j.
    representations = [[row[j] for row in tuples] for j in U]
    # ATTRIBUTE ORDERING
    order = [(len(set(r)), ri) for ri, r in enumerate(representations)]
    order.sort(key=lambda k: k[0], reverse=False)
    # print (order)
    order = {j[1]: i for i, j in enumerate(order)}  #Original order -> new order
    inv_order = {i: j for j, i in order.items()}
    # Permute each row's cells into the new attribute order.
    for ti, t in enumerate(tuples):
        tuples[ti] = [t[inv_order[i]] for i in range(len(t))]
    # # END ORDERING
    Mjs = [set() for i in range(n_atts)]
    stack = [[None, None], [None, set([]), Mjs]]
    X = set([])
    fdt = FDTree(U)
    m_i = -1
    cycles = 0
    cycles2 = 0
    XJ = set([])
    # Initial closure estimate, used until m_i becomes >= 0 below.
    XJJ = fctx.closed_set(X)
    avoided_closures = 0
    ncls = 0
    g_sub = []
    # NOTE(review): X is a set and U a range, so ``X != U`` is always True in
    # Python 3; termination presumably comes from fast_next_closure — confirm.
    while X != U:
        cycles += 1
        if cycles % 1000 == 0:
            print("\rFDs:{}/{}/{}/{}/{}/{}/{} - {: <100}".format(
                fdt.n_fds, cycles, cycles2, len(g_prime), len(alg.jip_objects),
                len(alg.elements),
                (sum([len(mp) for mp in m_prime])) / len(m_prime),
                ','.join(map(str, sorted(X)))), end='')  #stack
            sys.stdout.flush()
        if m_i >= 0:
            # XJ: sampled rows containing X (restricted to those containing m_i);
            # XJJ: their common attributes = estimated closure of X.
            XJ = stack[-2][1].intersection(m_prime[m_i])
            if bool(XJ):
                XJJ = reduce(set.intersection, (g_prime[g] for g in XJ))
                if len(XJ) == 1:
                    # Copy: reduce over a single element returns g_prime[g] itself.
                    XJJ = set(XJJ)
            else:
                XJJ = set(range(len(m_prime)))
        # cache = {}
        XSS = None
        n_x = len(X)
        avoided_closures += n_x == len(XJJ)
        # Refine the estimate XJJ until it matches the real closure XSS.
        while n_x < len(XJJ):
            cycles2 += 1
            # sys.stdout.flush()
            if XSS is None:
                cache = []
                # check() computes the real closure of X against the data and
                # appears to collect counterexample rows in cache — confirm.
                XSS = check(X, XJJ, tuples, n_atts, cache, rand_tuples)
                cache.sort(key=len)
                # cache = sorted(cache.items(), key=lambda k: len(k[1]))
                # sys.stdout.flush()
            if len(XJJ) == len(XSS):
                # Estimate confirmed: X -> XJJ holds.
                fdt.add_fd(X, XJJ)
                break
            else:
                # Estimate too large: add the largest counterexample row to the
                # sampling context and tighten XJJ with it.
                gp = cache.pop()
                n_gp = len(g_prime)
                XJ.add(n_gp)
                for i in stack[1:]:
                    i[1].add(n_gp)
                for x in gp:
                    m_prime[x].add(n_gp)
                # print ('\t', gp)
                g_prime.append(gp)
                # Maintain the concept lattice incrementally; then drop objects
                # that appear no longer join-irreducible from all extents — confirm.
                nid = alg.add_intent_iteration(gp)
                alg.add_object(n_gp, nid)
                # alg.objects[nid].append(n_gp)
                # print(len(list(alg.get_jips())), '/', n_gp+1)
                rem = set(alg.non_jip_objects())
                for m in range(n_atts):
                    m_prime[m].difference_update(rem)
                # print()
                # print (alg.inv_lat[nid])
                XJJ.intersection_update(gp)
        if not bool(XJJ - X) or m_i <= min(XJJ - X):
            # Closure only added attributes above m_i: accept it.
            m_i = U[-1]
            X = XJJ
        else:
            X.difference_update([m for m in X if m > m_i])
        stack[-1][1] = XJ
        X, m_i, c = fast_next_closure(X, U, fdt.l_close, m_i, stack)
        ncls += c
        stack[-1][0] = m_i
    # print ('--')
    # for g in g_prime:
    #     print (g)
    L = list(fdt.read_fds())
    print("\nN_FDS:{}".format(len(L)))
    print("SAMPLING CONTEXT SIZE:{}".format(len(g_prime)))
    print("CYCLES:", cycles)
    print("GOOD CLOSURES:", avoided_closures)
    print("Closures:", ncls)
    print(fdt.recursions)
    # print(alg.elements)
    jip = alg.jip_objects
    print(jip)
    # print ([alg.objects[i] for i in jip])
    print("JIP", len(jip))
def attribute_exploration_pps(tuples):
    """Mine functional dependencies by attribute exploration, computing the
    real closure with ``check()`` on every step (no sampling context).

    Attributes are re-ordered by their PLIs (partitions of rows with equal
    values), rows are re-encoded as cluster ids, and a NextClosure-style loop
    (``fast_next_closure``) walks the closed attribute sets, storing each
    discovered FD in an ``FDTree``.  Progress and final statistics are printed
    to stdout; nothing is returned.

    Fix: the loop previously ran ``while X != U`` with ``X`` a set and ``U`` a
    range — that comparison is always True in Python 3, so the loop could never
    exit normally.  The already-computed (and previously unused) ``sU = set(U)``
    is now the comparand, which is the evident intent; this also terminates
    immediately for a zero-attribute input.
    """
    U = range(len(tuples[0]))  # Attributes
    n_atts = len(U)  # Number of attributes
    # rand_tuples = list(range(len(tuples)))
    # rand_tuples.sort(key=lambda i: len(set(tuples[i])))
    print("Processing data... ", end='')
    sys.stdout.flush()
    # Column-major view: representations[j] is the column of attribute j.
    representations = [[row[j] for row in tuples] for j in U]
    print("done")
    # One PLI (partition of row indexes into equal-value clusters) per attribute.
    plis = [(build_pli(r), ri) for ri, r in enumerate(representations)]
    stats = Stats()
    # ORDERING
    print("Ordering... ", end='')
    sys.stdout.flush()
    # ATTRIBUTE ORDERING
    plis.sort(key=lambda k: k[0], reverse=False)  # Lexicographic
    order = {j[1]: i for i, j in enumerate(plis)}  # Original order -> new order
    inv_order = {i: j for j, i in order.items()}  # At position i should be attribute j
    plis = [i[0] for i in plis]
    print("done")
    print("Reconverting... ", end='')
    # Re-encode rows under the new attribute order: each cell becomes the id of
    # its cluster in that attribute's PLI, or None if no cluster covers the row.
    tuples = [[None] * n_atts for i in range(len(tuples))]
    # print(plis)
    # not_none = [0 for i in range(len(tuples))]
    for att in range(n_atts):
        att = inv_order[att]
        for i, cluster in enumerate(plis[att]):
            for row in cluster:
                tuples[row][att] = i
    print("done")
    # # END ORDERING
    Mjs = [set() for i in range(n_atts)]  # Needed by fast version of next_closure
    stack = [[None, None], [None, set([]), Mjs]]  # Stack for next_closure
    X = set([])
    fdt = FDTree(U)
    m_i = -1
    cycles = 0
    cycles2 = 0
    XJ = set([])
    ncls = 0
    sU = set(U)  # set form of the attribute universe, used for the exit test
    while X != sU:
        # Feedback Output
        cycles += 1
        if cycles % 1 == 0:  # prints every cycle (debug cadence)
            print("\rFDs:{}/{}".format(fdt.n_fds, cycles), ','.join(map(str, sorted(X))), end='')  #stack
            sys.stdout.flush()
        # Stack re-use
        cache = []
        # check() shrinks XSS in place from the full attribute set down to the
        # real closure of X — confirm against check()'s implementation.
        XSS = set(U)
        check(X, XSS, tuples, n_atts, cache, plis, stats)
        if len(X) != len(XSS):
            fdt.add_fd(X, XSS)
            stack[-1][-1][m_i] = XSS
        if not bool(XSS - X) or m_i <= min(XSS - X):
            # Closure only added attributes above m_i: accept it.
            m_i = U[-1]
            X = XSS
        else:
            X.difference_update([m for m in X if m > m_i])
        stack[-1][1] = XJ
        X, m_i = fast_next_closure(X, U, fdt.l_close, m_i, stats, stack)
        stack[-1][0] = m_i
    L = list(fdt.read_fds())
    print("\nN_FDS:{}".format(len(L)))
    print("CYCLES:", cycles)
    print("Closures:", ncls)
    print(fdt.recursions)
def attribute_exploration_pps(tuples):
    """Attribute exploration mining FDs, with a validity flag as the trailing
    boolean of each next-closure stack frame.

    Same sampling-context scheme as the other variants in this file: a
    NextClosure-style loop estimates each closure from sampled counterexample
    rows (``g_prime``/``m_prime``) and falls back to ``check()`` against the
    full data; confirmed FDs go into an ``FDTree``.  Prints progress (every
    cycle — the 1000-cycle throttle is commented out) and final statistics to
    stdout; nothing is returned.

    NOTE(review): several same-named variants exist in this file; this last
    definition is the one in effect at import time.
    """
    U = range(len(tuples[0]))  # Attributes
    n_atts = len(U)
    # m_prime[m]: indexes (into g_prime) of sampled rows having attribute m;
    # g_prime[g]: the attribute set of sampled row g.
    m_prime = [set([]) for i in range(len(U))]
    g_prime = []
    rand_tuples = list(range(len(tuples)))
    # random.shuffle(rand_tuples)
    # Rows with fewer distinct values first — presumably the order check()
    # visits rows in; confirm.
    rand_tuples.sort(key=lambda i: len(set(tuples[i])))
    fctx = FormalContext(g_prime, m_prime)
    # Sentinel extent: m_prime[-1] is read while m_i == -1 and seeds the stack.
    m_prime.append(set([]))  # THIS SHOULD BE AFTER DECLARING THE FORMAL CONTEXT
    representations = [[row[j] for row in tuples] for j in U]
    # ATTRIBUTE ORDERING
    order = [(len(set(r)), ri) for ri, r in enumerate(representations)]
    order.sort(key=lambda k: k[0], reverse=False)
    #print (order)
    order = {j[1]:i for i,j in enumerate(order)}  #Original order -> new order
    inv_order = {i:j for j,i in order.items()}
    # Permute each row's cells into the new attribute order.
    for ti, t in enumerate(tuples):
        tuples[ti] = [t[inv_order[i]] for i in range(len(t))]
    # # END ORDERING
    # VARIABLES FOR FAST STACKED NEXT CLOSURE
    Mjs = [set() for i in range(n_atts)]
    # Frames carry a trailing boolean flag; it is cleared for all frames when an
    # FD is found and a fresh True is appended after each fast_next_closure call.
    stack = [[None, m_prime[-1]],[None, set([]), Mjs, True]]
    # INITIALIZATION VARIABLES
    X = set([])
    fdt = FDTree(U)
    m_i = -1  # WE START WITH THE EMPTY INTENT REPRESENTED BY THIS
    # COUNTERS TO KEEP SOME PERFORMANCE STATISTICS
    cycles = 0
    cycles2 = 0
    avoided_closures = 0
    ncls = 0
    # NOTE(review): X is a set and U a range, so ``X != U`` is always True in
    # Python 3; termination presumably comes from fast_next_closure — confirm.
    while X != U:
        cycles += 1
        # if cycles%1000 == 0:
        print ("{}||FDs:{}/{}/{}/{}/{} - {: <100}".format(
            stack[-2][-1], fdt.n_fds, cycles, cycles2, len(g_prime),
            (sum([len(mp) for mp in m_prime]))/len(m_prime),
            ','.join(map(str, sorted(X)))), end='\n')  #stack
        sys.stdout.flush()
        # XJ: sampled rows containing X (restricted to those containing m_i);
        # XJJ: their common attributes = estimated closure of X.
        XJ = stack[-2][1].intersection(m_prime[m_i])
        if bool(XJ):
            # XJJ = reduce(set.intersection, (g_prime[g] for g in XJ))
            XJJ = set.intersection(*[g_prime[g] for g in XJ])
            if len(XJ) == 1:
                # Copy: with a single row, intersection returns g_prime[g] itself.
                XJJ = set(XJJ)
        else:
            XJJ = set(U)
        # AT THIS POINT WE HAVE XJJ WHICH IS OUR ESTIMATION OF THE CLOSURE
        # USING THE REPRESENTATION CONTEXT CALCULATED SO FAR
        # THE ACTUAL CLOSURE SHOULD BE XSS, HOWEVER IF
        # X = XJJ WE KNOW THAT XSS = XJJ AND WE CAN AVOID ITS
        # CALCULATION
        XSS = None
        n_x = len(X)
        avoided_closures += n_x == len(XJJ)
        while n_x < len(XJJ):  # CHECKS WHETHER X==XJJ
            cycles2 += 1
            if XSS is None:
                cache = []
                # check() computes the real closure of X against the data and
                # appears to collect counterexample rows in cache — confirm.
                XSS = check(X, XJJ, tuples, n_atts, cache, rand_tuples)
                cache.sort(key=len)
            if len(XJJ) == len(XSS):
                # Estimate confirmed: X -> XJJ holds; clear every frame's flag.
                fdt.add_fd(X, XJJ)
                print('\t', X, XJJ-X)
                for i in stack[1:]:
                    i[-1] = False
                # print(stack)
                break
            else:
                # Estimate too large: add the largest counterexample row to the
                # sampling context and tighten XJJ with it.
                gp = cache.pop()
                n_gp = len(g_prime)
                XJ.add(n_gp)
                for i in stack[1:]:
                    i[1].add(n_gp)
                for x in gp:
                    m_prime[x].add(n_gp)
                g_prime.append(gp)
                XJJ.intersection_update(gp)
        new_atts = XJJ - X
        if not bool(new_atts) or m_i <= min(new_atts):
            # Closure only added attributes above m_i: accept it.
            m_i = U[-1]
            X = XJJ
        else:
            # print(stack)
            stack[-2][2][m_i] = XJJ
            X.difference_update([m for m in X if m > m_i])
        stack[-1][1] = XJ
        X, m_i,c = fast_next_closure(X, U, fdt.l_close, m_i, stack)
        # Presumably fast_next_closure left a fresh frame on top; tag it with a
        # True flag — confirm against fast_next_closure's frame layout.
        stack[-1].append(True)
        ncls += c
        stack[-1][0] = m_i
    # print ('--')
    # for g in g_prime:
    #     print (g)
    L = list(fdt.read_fds())
    print ("\nNUMBER OF FDS:{}".format(len(L)))
    print ("SAMPLING CONTEXT SIZE:{}".format(len(g_prime)))
    print ("CYCLES:",cycles)
    print ("GOOD CLOSURES:", avoided_closures)
    print ("Closures:", ncls)
    print(fdt.recursions)