def get_counts_augment_R1_all(R1_name, type_of_tuples):
    """Build one augmentation query per existing tuple type and count matches.

    Each element of type_of_tuples is (AGEP interval, RELP, SEX).  The
    returned aug_q holds rows of the form [agep_interval, [relp], [sex], -1, -1]
    (the trailing -1s are the unconstrained TEN/PUMA10 slots), and aug_counts
    holds the corresponding COUNT(*) results from table R1_name.  On a
    connection/query failure the error is printed and whatever was gathered so
    far is returned.
    """
    aug_q = [[t[0], [t[1]], [t[2]], -1, -1] for t in type_of_tuples]
    aug_counts = []
    try:
        conn = psycopg2.connect(conn_str.get_conn_str())
        cursor = conn.cursor()
        for tup in aug_q:
            # tup[0] is an [lo, hi] AGEP interval; tup[1]/tup[2] are
            # single-element lists holding the RELP and SEX values.
            query = ("SELECT COUNT(*) FROM " + R1_name +
                     " WHERE AGEP>=" + str(tup[0][0]) +
                     " AND AGEP<=" + str(tup[0][1]) +
                     " AND RELP=" + str(tup[1][0]) +
                     " AND SEX=" + str(tup[2][0]) + ";")
            cursor.execute(query)
            aug_counts.append(cursor.fetchone()[0])
        conn.commit()
        cursor.close()
        conn.close()
    except Exception as e:
        print("Uh oh, can't connect. Invalid dbname, user or password?\n")
        print(e)
    return aug_q, aug_counts
def update_V1_root(S_CC, T, r):
    """Apply the cardinality constraint (CC) at node ``r`` of forest ``T`` to V1.

    Builds a single ``WITH ... UPDATE`` statement that selects up to
    (target of r) - (sum of children's targets) still-unassigned V1 rows
    matching r's R1 predicates but NOT matching any child CC's predicates,
    then sets the R2 attributes (TEN/PUMA10) specified by r's constraint.
    Side effect only (returns None); errors are printed, not raised.
    """
    try:
        connect_str = conn_str.get_conn_str()
        conn = psycopg2.connect(connect_str)
        cursor = conn.cursor()
        cc = S_CC[r]
        b = T.nodes[r]['target']
        b_sum_children = 0
        # Unassigned rows have the sentinel -1 in both R2 columns.
        query = "WITH num_tuples_per_x AS (SELECT * FROM V1 WHERE PUMA10 = -1 AND TEN = -1"
        # R1 predicates of the root CC; attr index 0 (AGEP) is an interval,
        # the rest are point equalities.
        for i in range(num_R1_non_key_attrs):
            if cc[i] != -1:
                attr = attr_names[i]
                if i == 0:
                    query += " AND " + attr + " >= " + str(
                        cc[i][0]) + " AND " + attr + " <= " + str(cc[i][1])
                else:
                    query += " AND " + attr + " = " + str(cc[i][0])
        # When not at the leaf node, must update the SELECT condition
        for child in T.successors(
                r):  # Path graph == 1 successor; OTHERWISE CC at root in subtree
            cc_child = S_CC[child]
            query += " AND NOT("
            for i in range(num_R1_non_key_attrs):
                if cc_child[i] != -1:
                    attr = attr_names[i]
                    if i == 0:
                        query += attr + " >= " + str(
                            cc_child[i][0]) + " AND " + attr + " <= " + str(
                                cc_child[i][1]) + " AND "
                    else:
                        query += attr + " = " + str(cc_child[i][0]) + " AND "
            # Drop the dangling " AND " (5 chars) before closing the NOT(...).
            query = query[:-5] + ")"
            b_sum_children += T.nodes[child]['target']
        # Must use difference between targets of the current CC and it's successor CC in T
        query += " LIMIT " + str(b - b_sum_children) + ") UPDATE V1 SET "
        # R2 (TEN/PUMA10) assignments requested by this CC.
        for j in range(num_R2_non_key_attrs):
            if cc[num_R1_non_key_attrs + j] != -1:
                query += attr_names[num_R1_non_key_attrs + j] + " = " + str(
                    cc[num_R1_non_key_attrs + j][0]) + ", "
        # Drop trailing ", " before the FROM clause.
        query = query[:-2] + " FROM num_tuples_per_x WHERE V1.p_id=num_tuples_per_x.p_id"
        print("\t\t", query)
        cursor.execute(query)
        conn.commit()
        cursor.close()
        conn.close()
    except Exception as e:
        print("Uh oh, can't connect. Invalid dbname, user or password?\n")
        print(e)
    return
def get_counts_augment_R1_3way(S_CC, agep_intervals_used_in_S_CC,
                               num_R1_non_key_attrs, R1_name):
    """Expand each distinct AGEP-RELP-SEX combo in S_CC into fully specified
    3-way marginal queries, and count matching rows in R1_name.

    A -1 in any of the three R1 positions is expanded over all candidate
    values (AGEP over agep_intervals_used_in_S_CC, RELP/SEX over the module's
    attr_intervalization).  Returns (aug_q, aug_counts), where each aug_q row
    is [agep_interval, [relp], [sex], -1, -1].
    """
    aug_q = []
    aug_counts = []
    # for constraint in S_CC: use constraint[:3] to track combos of AGEP-RELP-SEX values to use when augmenting S_CC
    combos = []
    for constraint in S_CC:
        k = constraint[:num_R1_non_key_attrs]
        if k not in combos:  # Only expand each distinct R1 combo once
            combos.append(k)
            agep_vals = [k[0]]
            if k[0] == -1:
                agep_vals = agep_intervals_used_in_S_CC  # [[l_1, u_1], [l_2, u_2], ...]
            relp_vals = k[1]
            if k[1] == -1:
                relp_vals = attr_intervalization[1]
            sex_vals = k[2]
            if k[2] == -1:
                sex_vals = attr_intervalization[2]
            print("\n%s" % (k))
            for a1 in agep_vals:
                for a2 in relp_vals:
                    for a3 in sex_vals:
                        q = [a1, [a2], [a3], -1, -1]
                        if q not in aug_q:  # De-duplicate across combos
                            aug_q.append(q)
                            print("\t%s" % (aug_q[-1]))
    try:
        connect_str = conn_str.get_conn_str()
        conn = psycopg2.connect(connect_str)
        cursor = conn.cursor()
        for tup in aug_q:
            q = "SELECT COUNT(*) FROM " + R1_name + " WHERE AGEP>=" + str(tup[0][0]) + " AND AGEP<=" + str(tup[0][1]) +\
                " AND RELP=" + str(tup[1][0]) + " AND SEX=" + str(tup[2][0]) + ";"
            cursor.execute(q)
            aug_counts.append(cursor.fetchone()[0])
        conn.commit()
        cursor.close()
        conn.close()
    except Exception as e:
        print("Uh oh, can't connect. Invalid dbname, user or password?\n")
        print(e)
    return aug_q, aug_counts
def get_existing_tuple_types(R1_name):
    """Return the distinct (AGEP, RELP, SEX) combinations present in R1_name.

    Each combination comes back as a 3-element list of ints, ordered by
    SEX, then AGEP, then RELP.  Errors are printed and an empty (or partial)
    list is returned.
    """
    type_of_tuples = []
    try:
        conn = psycopg2.connect(conn_str.get_conn_str())
        cursor = conn.cursor()
        cursor.execute("SELECT AGEP, RELP, SEX FROM " + R1_name +
                       " GROUP BY AGEP, RELP, SEX ORDER BY SEX, AGEP, RELP")
        rows = cursor.fetchall()
        # Normalize every column to int (DB driver may hand back Decimals/strings).
        type_of_tuples = [[int(row[0]), int(row[1]), int(row[2])] for row in rows]
        conn.commit()
        cursor.close()
        conn.close()
    except Exception as e:
        print("Uh oh, can't connect. Invalid dbname, user or password?\n")
        print(e)
    return type_of_tuples
def update_V1_all_disjoint(S_CC, T):
    """Apply every (pairwise-disjoint) CC in forest ``T`` directly to V1.

    For each node vx, selects up to ``target`` still-unassigned V1 rows
    matching the CC's R1 predicates and sets its R2 attributes (TEN/PUMA10)
    via a single ``WITH ... UPDATE`` statement.  Side effect only; errors are
    printed, not raised.
    """
    try:
        connect_str = conn_str.get_conn_str()
        conn = psycopg2.connect(connect_str)
        cursor = conn.cursor()
        for vx in T:
            cc = S_CC[vx]
            b = T.nodes[vx]['target']
            # Unassigned rows carry the sentinel -1 in both R2 columns.
            query = "WITH num_tuples_per_x AS (SELECT * FROM V1 WHERE PUMA10 = -1 AND TEN = -1"
            # R1 predicates: attr 0 (AGEP) is an interval, others are equalities.
            for i in range(num_R1_non_key_attrs):
                if cc[i] != -1:
                    attr = attr_names[i]
                    if i == 0:
                        query += " AND " + attr + " >= " + str(
                            cc[i][0]) + " AND " + attr + " <= " + str(cc[i][1])
                    else:
                        query += " AND " + attr + " = " + str(cc[i][0])
            query += " LIMIT " + str(b) + ") UPDATE V1 SET "
            # R2 assignments.  FIX: index with num_R1_non_key_attrs + j
            # (matching update_V1_root) instead of the previous `i + 1 + j`,
            # which silently relied on the loop variable `i` leaking out of the
            # R1 loop above (and would raise NameError if that loop ran zero
            # times).  Both expressions evaluate identically when the loop
            # runs, so behavior is unchanged.
            for j in range(num_R2_non_key_attrs):
                if cc[num_R1_non_key_attrs + j] != -1:
                    query += attr_names[num_R1_non_key_attrs + j] + " = " + str(
                        cc[num_R1_non_key_attrs + j][0]) + ", "
            # Drop trailing ", " before the FROM clause.
            query = query[:-2] + " FROM num_tuples_per_x WHERE V1.p_id=num_tuples_per_x.p_id"
            print("\t\t", query)
            cursor.execute(query)
        conn.commit()
        cursor.close()
        conn.close()
    except Exception as e:
        print("Uh oh, can't connect. Invalid dbname, user or password?\n")
        print(e)
    return
def main(CC_q_filename, V1_CC_filename, output_filename):
    """Hierarchical view-completion driver.

    Pipeline:
      1. (Re)create working tables V1, V2 and the final p_with_hid table.
      2. Parse the cardinality constraints (S_CC) from V1_CC_filename and
         record which TEN / PUMA10 values and combinations they use.
      3. Pairwise-compare CCs to build a containment forest T plus the
         disjoint / intersecting relations; prune redundant edges and drop
         any tree containing an intersecting CC (those CCs go to lp.py).
      4. Run view_completion on T, then patch rows that got only one of
         TEN / PUMA10, and log CC violations.
      5. Partition people and households by (PUMA10, TEN) and assign the
         remaining unassigned people to unused combinations.

    Returns (l_ppl_in_diff_puma_ten, l_hhs_in_diff_puma_ten): dicts keyed
    by (puma, ten) tuples mapping to lists of p_id / h_id respectively.

    NOTE(review): indentation reconstructed from a collapsed source; the
    nesting chosen below follows the only semantically coherent reading.
    """
    ''' ----------------------------------------------- List vars ------------------------------------------------ '''
    S_CC = []
    b_S_CC = []
    total = 0
    num_CCs = 0
    disjoint_CC = {}
    intersect_with = {}
    T = nx.DiGraph()
    intersecting_S_CC = []
    intersecting_b_S_CC = []
    edges_to_remove = []  # NOTE(review): appears unused in this function
    used_tens = []  # TEN vals in S_CC
    used_tens_without_puma = []
    tens_onlyWithPUMA10_or_unused = []
    used_pumas = []  # PUMA10 vals in S_CC
    used_pumas_without_ten = []
    pumas_onlyWithTEN_or_unused = []
    ten_puma_combos_in_CCs = []  # TEN-PUMA10 combinations in S_CC
    ten_dict = {}  # Combinations "indexed" by TEN vals
    puma_dict = {}  # Combinations "indexed" by PUMA10 vals
    unused_combos_in_CCs_by_puma = [
    ]  # Candidate PUMA10 vals for tuples missing assignment after lp.py
    err = -1
    l_ppl_in_diff_puma_ten = {}
    l_hhs_in_diff_puma_ten = {}
    io_stats = open(output_filename, "a+")
    io_stats.write(
        "-------------------------------------------------------------------------------------------------------------------------------------\n"
    )
    io_stats.write("Create V2 ---- start time %s\n" %
                   (datetime.datetime.now()))
    try:
        connect_str = conn_str.get_conn_str()
        conn = psycopg2.connect(connect_str)
        cursor = conn.cursor()
        cursor.execute("DROP TABLE IF EXISTS V2 CASCADE")
        cursor.execute("CREATE TABLE V2 AS SELECT h_id, PUMA10, TEN FROM " +
                       h_table)
        cursor.execute("DROP TABLE IF EXISTS V1 CASCADE")
        cursor.execute("CREATE TABLE V1 AS SELECT * FROM " + p_table)
        # -1 is the "unassigned" sentinel for both R2 columns.
        cursor.execute("ALTER TABLE V1 ADD COLUMN PUMA10 INTEGER DEFAULT -1")
        cursor.execute("ALTER TABLE V1 ADD COLUMN TEN INTEGER DEFAULT -1")
        # Also, create the final relation, i.e., p_with_hid(p_id, AGEP, RELP, SEX, h_id)
        cursor.execute("DROP TABLE IF EXISTS p_with_hid CASCADE")
        cursor.execute("CREATE TABLE p_with_hid AS SELECT * FROM " + p_table)
        cursor.execute(
            "ALTER TABLE p_with_hid ADD COLUMN h_id INTEGER DEFAULT -1")
        cursor.execute("SELECT COUNT(*) FROM " + h_table)
        n_hh = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM " + p_table)
        n_ppl = cursor.fetchone()[0]
        lp.save_num_HHs_ppl(n_hh, n_ppl)
        '''cursor.execute("SELECT array(SELECT DISTINCT TEN FROM " + h_table + " ORDER BY TEN)")
        active_tens = list(cursor.fetchone()[0])
        cursor.execute("SELECT array(SELECT DISTINCT PUMA10 FROM " + h_table + " ORDER BY PUMA10)")
        active_pumas = list(cursor.fetchone()[0])'''
        conn.commit()
        cursor.close()
        conn.close()
    except Exception as e:
        print("Uh oh, can't connect. Invalid dbname, user or password?\n")
        print(e)
    io_stats.write("Create V2 ---- end time %s\n" % (datetime.datetime.now()))
    io_stats.write(
        "----------------------------------------------------------------------------\n\n"
    )
    ''' --------------------------------------------------------------------------------------------------------------
    Read S_CC
    -------------------------------------------------------------------------------------------------------------- '''
    V1_CCs = open(V1_CC_filename, "r")
    for line in V1_CCs:
        # Line format: "<agep>,<relp>,<sex>,<ten>,<puma>:<target count>"
        # where each field is either -1 or an "[lo-hi]" interval.
        k_v = line.rstrip().split(":")
        total += int(k_v[1])
        l = k_v[0].split(",")
        constraint_l = []
        t_p_combo = []
        for i in range(5):
            if l[i] == '-1':
                constraint_l.append(-1)
                if i == 3 or i == 4:
                    t_p_combo.append(-1)
            else:
                l[i] = l[i][1:-1].split("-")  # Remove "[", "]"
                l[i] = [int(j) for j in l[i]]
                constraint_l.append(l[i])
                # Populate used_tens and used_pumas
                if i == 3:
                    if l[i][0] not in used_tens:
                        used_tens.append(l[i][0])
                    t_p_combo.append(l[i][0])
                elif i == 4:
                    if l[i][0] not in used_pumas:
                        used_pumas.append(l[i][0])
                    t_p_combo.append(l[i][0])
        if -1 not in t_p_combo and t_p_combo not in ten_puma_combos_in_CCs:
            ten_puma_combos_in_CCs.append(t_p_combo)
        else:
            # Only one of TEN/PUMA10 was specified (or the combo repeats).
            ten = t_p_combo[0]
            puma = t_p_combo[1]
            if ten != -1 and ten not in used_tens_without_puma:
                used_tens_without_puma.append(ten)
            if puma != -1 and puma not in used_pumas_without_ten:
                used_pumas_without_ten.append(puma)
        S_CC.append(constraint_l)
        b_S_CC.append(int(k_v[1]))
    V1_CCs.close()
    num_CCs = len(S_CC)
    for i in range(num_CCs):
        print("%5d. %s" % (i, S_CC[i]))
    print("\nTotal CC target = %s\n" % (total))
    unused_tens = sorted(list(set(dom_ten).difference(set(used_tens))))
    unused_pumas = sorted(list(set(dom_puma).difference(set(used_pumas))))
    print("\nTEN values NOT used in S_CC: %s\n" % (unused_tens))
    print("PUMA10 values NOT used in S_CC: %s\n" % (unused_pumas))
    print("TEN values used WITHOUT PUMA10 in some CC: %s\n" %
          (used_tens_without_puma))
    print("PUMA10 values used WITHOUT TEN in some CC: %s\n\n" %
          (used_pumas_without_ten))
    # Index the fully specified TEN-PUMA10 combos both ways.
    for [t, p] in ten_puma_combos_in_CCs:
        #if t not in used_tens_without_puma:
        if t not in ten_dict:
            ten_dict[t] = [p]
        else:
            ten_dict[t].append(p)
        #if p not in used_pumas_without_ten:
        if p not in puma_dict:
            puma_dict[p] = [t]
        else:
            puma_dict[p].append(t)
    pumas_onlyWithTEN_or_unused = list(
        set(dom_puma).difference(set(used_pumas_without_ten)))
    tens_onlyWithPUMA10_or_unused = list(
        set(dom_ten).difference(set(used_tens_without_puma)))
    for p in pumas_onlyWithTEN_or_unused:
        # If p does not appear with every TEN val OR p is not used and there exist TEN vals that appear ONLY WITH PUMA10 in S_CC
        if (p in puma_dict and len(puma_dict[p]) < size_dom_ten) or (
                p in unused_pumas and len(tens_onlyWithPUMA10_or_unused) > 0):
            unused_combos_in_CCs_by_puma.append(p)
    ''' --------------------------------------------------------------------------------------------------------------
    Construct T and \cap_i
    -------------------------------------------------------------------------------------------------------------- '''
    io_stats.write(
        "Pairwise comparisons on S_CC and constructing T ---- start time %s\n"
        % (datetime.datetime.now()))
    lp.save_time(datetime.datetime.now())
    for i in range(num_CCs):
        T.add_node(i, name="CC" + str(i), target=b_S_CC[i])
    disjoint_CC = {i: [] for i in range(num_CCs)}
    # Classify every ordered CC pair as disjoint / contained (i inside j).
    for i in range(num_CCs):
        for j in range(num_CCs):
            if i != j:
                R1_i = S_CC[i][:num_R2_non_key_attrs + 1]
                R1_j = S_CC[j][:num_R2_non_key_attrs + 1]
                R2_i = S_CC[i][-1 * num_R2_non_key_attrs:]
                R2_j = S_CC[j][-1 * num_R2_non_key_attrs:]
                disjoint = False
                contain = True
                # R1
                for k in range(num_R1_non_key_attrs):
                    R1_i_attr = R1_i[k]
                    R1_j_attr = R1_j[k]
                    if R1_i_attr != -1 and R1_j_attr != -1:
                        # Disjoint on R1? Enough to disagree on ONE attr used in the CCs
                        if (k == 0 and (R1_i_attr[1] < R1_j_attr[0] or R1_i_attr[0] > R1_j_attr[1])) or \
                           (k != 0 and R1_i_attr != R1_j_attr):
                            disjoint = True
                        # Contained on R1? Then check if contained on R2!
                        if not((k == 0 and R1_i_attr[0] >= R1_j_attr[0] and R1_i_attr[1] <= R1_j_attr[1]) or \
                               (k != 0 and R1_i_attr == R1_j_attr)):
                            contain = False
                    elif R1_j_attr != -1 and R1_i_attr == -1:
                        contain = False
                # R2
                if contain:
                    for k in range(num_R2_non_key_attrs):
                        R2_i_attr = R2_i[k]
                        R2_j_attr = R2_j[k]
                        # If R1 preds match but disjoint R2 preds ----> Pair of CCs is disjoint, and NOT intersecting
                        if R1_i == R1_j:
                            if R2_i_attr != -1 and R2_j_attr != -1 and R2_i_attr != R2_j_attr:
                                disjoint = True
                                contain = False
                                break
                        # If R1_i is more restrictive than R1_j, BUT R2_i is not more restrictive than R2_j ----> NOT contained
                        else:
                            if (R2_i_attr != -1 and R2_j_attr != -1 and R2_i_attr != R2_j_attr) or \
                               (R2_j_attr != -1 and R2_i_attr == -1):
                                contain = False
                                break
                # (Disjoint) Add pairs to dict
                if disjoint:
                    disjoint_CC[i].append(j)
                # (Containment) Add edge from CC_j to CC_i IF CC_i\subseteq CC_j AND they do NOT contradict on R2
                elif not disjoint and contain:
                    print("Add edge %s -> %s" % (j, i))
                    T.add_edge(j, i)
    print()
    # Pairs that are neither disjoint nor related by containment intersect.
    for i in range(num_CCs):
        for j in range(num_CCs):
            if i != j:
                if j not in disjoint_CC[i] and not (T.has_edge(i, j)
                                                    or T.has_edge(j, i)):
                    if i not in intersect_with:
                        intersect_with[i] = [j]
                    else:
                        intersect_with[i].append(j)
    ''' ----------------------------------- Find and remove redundant edges from T ----------------------------------- '''
    #plot_forest(T)
    list_of_edges = copy.deepcopy(T.edges())
    for (u, v) in list_of_edges:
        T.remove_edge(u, v)
        #plot_forest(T)
        if not nx.has_path(
                T, u,
                v):  # If removing the edge disconnects v from u, add edge back
            T.add_edge(u, v)
            #plot_forest(T)
    io_stats.write(
        "Pairwise comparisons on S_CC and constructing T ---- end time %s\n\n"
        % (datetime.datetime.now()))
    lp.save_time(datetime.datetime.now())
    ''' --------------------------------------------------------------------------------------------------------------
    ViewCompletion (hierarchical)
    -------------------------------------------------------------------------------------------------------------- '''
    # Find "good" trees T_i in the forest T to recurse on
    vxs = copy.deepcopy(T.nodes())
    for vx in vxs:
        if vx in T and T.in_degree(vx) == 0:  # At ROOT!
            bad_T_i = False
            comp_nodes = list(nx.dfs_postorder_nodes(
                T, vx))  # Get nodes in root's tree
            for vx in comp_nodes:
                if vx in intersect_with:
                    bad_T_i = True
                    break
            if bad_T_i:
                # T_i bad ----> Remove T_i from T
                for vx in comp_nodes:
                    T.remove_node(vx)
                    ''' --------------------------- S_CC and b_S_CC for intersecting CCs ------------------------- '''
                    intersecting_S_CC.append(S_CC[vx])
                    intersecting_b_S_CC.append(b_S_CC[vx])
                    print("%5d %s" %
                          (intersecting_b_S_CC[-1], intersecting_S_CC[-1]))
    #plot_forest(T)
    # START V1 COMPLETION ...
    io_stats.write("Algo 1.1 ---- start time %s\n" %
                   (datetime.datetime.now()))
    lp.save_time(datetime.datetime.now())
    view_completion(S_CC, T)
    try:
        connect_str = conn_str.get_conn_str()
        conn = psycopg2.connect(connect_str)
        cursor = conn.cursor()
        #--------------------------------- FIX tuples with TEN but no PUMA10 value -----------------------------------
        cursor.execute(
            "SELECT p_id, TEN FROM V1 WHERE PUMA10 = -1 AND TEN != -1")
        y_ten_n_puma = list(cursor.fetchall())
        for tup in y_ten_n_puma:
            id = tup[0]
            ten = tup[1]
            puma = -1
            if ten in ten_dict:
                # Pick a PUMA10 this TEN was NOT combined with in S_CC.
                puma_candidates = list(
                    set(dom_puma).difference(set(ten_dict[ten])))
                if len(puma_candidates) > 0:
                    puma = random.choice(puma_candidates)
                else:
                    print("\n\nNo TEN options\n\n")
            else:
                puma = random.choice(pumas_onlyWithTEN_or_unused)
            query = "UPDATE V1 SET PUMA10 = " + str(
                puma) + " WHERE p_id = " + str(id)
            cursor.execute(query)
        #---------------------------------- FIX tuples with PUMA10 but no TEN value ----------------------------------
        cursor.execute(
            "SELECT p_id, PUMA10 FROM V1 WHERE PUMA10 != -1 AND TEN = -1")
        n_ten_y_puma = list(cursor.fetchall())
        for tup in n_ten_y_puma:
            id = tup[0]
            puma = tup[1]
            ten = -1
            if puma in puma_dict:
                ten_candidates = list(
                    set(dom_ten).difference(set(puma_dict[puma])))
                if len(ten_candidates) > 0:
                    ten = random.choice(ten_candidates)
                else:
                    print("\n\nNo PUMA10 options\n\n")
            else:
                ten = random.choice(tens_onlyWithPUMA10_or_unused)
            query = "UPDATE V1 SET TEN = " + str(ten) + " WHERE p_id = " + str(
                id)
            cursor.execute(query)
        io_stats.write("Algo 1.1 ---- end time %s\n\n" %
                       (datetime.datetime.now()))
        lp.save_time(datetime.datetime.now())
        # CHECK CC VIOLATIONS
        err = 0
        with open(CC_q_filename, "r") as q:
            lines = q.readlines()
            for i in range(len(lines)):
                l = lines[i]
                k_v = l.rstrip().split(":")
                pred = k_v[0]
                target_count = int(k_v[1])
                cursor.execute("SELECT COUNT(*) FROM V1 WHERE " + pred)
                ans = cursor.fetchone()[0]
                # Only CCs still in T were handled here; others go to lp.py.
                if i in T and target_count != ans:
                    if err == 0:
                        io_stats.write(
                            "\t(Unsatisfied disjoint or contained) CCs with target counts minus counts in V1\n"
                        )
                    io_stats.write(
                        "\t%5d - %5d = %5d \t%s\n" %
                        (target_count, ans, target_count - ans, pred))
                    err += abs(target_count - ans)
        lp.save_tot_L1_err("%s\n" % (err))
        conn.commit()
        cursor.close()
        conn.close()
    except Exception as e:
        print("Uh oh, can't connect. Invalid dbname, user or password?")
        print(e)
    io_stats.write(
        "----------------------------------------------------------------------------\n\n"
    )
    io_stats.close()
    ''' --------------------------------------------------------------------------------------------------------------
    Pass any intersecting CCs to lp.py
    -------------------------------------------------------------------------------------------------------------- '''
    if len(intersecting_S_CC) > 0:
        lp.main(num_R1_non_key_attrs, intersecting_S_CC, intersecting_b_S_CC,
                p_table, h_table, CC_q_filename, output_filename)
    ''' --------------------------------------------------------------------------------------------------------------
    1. Assign unused TEN-PUMA10 combo to tuples in V1 to complete assignment
    ------------------------------------------------------------------------------------------------------------------
    2. Check CC violations
    -------------------------------------------------------------------------------------------------------------- '''
    io_stats = open(output_filename, "a+")
    try:
        connect_str = conn_str.get_conn_str()
        conn = psycopg2.connect(connect_str)
        cursor = conn.cursor()
        io_stats.write(
            "\n\nPartition people and households by PUMA10-TEN ---- start time %s\n"
            % (datetime.datetime.now()))
        lp.save_time("%s" % (datetime.datetime.now()))
        # CREATE A DICTIONARY THAT PARTITIONS PEOPLE INTO (PUMA, TEN)
        cursor.execute(
            "SELECT p_id, TEN, PUMA10 FROM V1 WHERE PUMA10 != -1 AND TEN != -1"
        )
        rows = cursor.fetchall()
        for row in rows:
            p_id = row[0]
            ten = int(row[1])
            puma = row[2]
            if (puma, ten) not in l_ppl_in_diff_puma_ten:
                l_ppl_in_diff_puma_ten[(puma, ten)] = [p_id]
            else:
                l_ppl_in_diff_puma_ten[(puma, ten)].append(p_id)
        # CREATE A DICTIONARY THAT PARTITIONS HHs INTO (PUMA, TEN)
        cursor.execute("SELECT h_id, PUMA10, TEN FROM V2")
        rows = cursor.fetchall()
        for row in rows:
            h_id = row[0]
            puma = row[1]
            ten = int(row[2])
            if (puma, ten) not in l_hhs_in_diff_puma_ten:
                l_hhs_in_diff_puma_ten[(puma, ten)] = [h_id]
            else:
                l_hhs_in_diff_puma_ten[(puma, ten)].append(h_id)
        #----------------------------------- Complete partially filled tuples in V1 ----------------------------------
        cursor.execute("SELECT p_id FROM V1 WHERE PUMA10 = -1 AND TEN = -1")
        miss_assign_V1 = list(cursor.fetchall())
        if len(miss_assign_V1) > 0 and len(unused_combos_in_CCs_by_puma) > 0:
            print("\n\nNumber of tuples without assignment = %s\n\n" %
                  (len(miss_assign_V1)))
            io_stats.write("\tComplete V1 assignment ---- start time %s\n" %
                           (datetime.datetime.now()))
            #------------------------------------ Create list of TEN-PUMA10 combos -----------------------------------
            combos = []
            for p in unused_combos_in_CCs_by_puma:
                if p in puma_dict:
                    ts = list(set(dom_ten).difference(set(puma_dict[p])))
                else:
                    ts = tens_onlyWithPUMA10_or_unused
                for t in ts:
                    combos.append([t, p])
            print(
                "Combinations valid for tuples missing assignment after lp.py: %s\n\n"
                % (combos))
            io_stats.write("\t\tConstructing combos (end) %s\n" %
                           (datetime.datetime.now()))
            # Assignment happens only in the in-memory partition dict here,
            # not back into V1.
            for tup in miss_assign_V1:
                p_id = tup[0]
                [ten, puma] = random.choice(combos)
                if (puma, ten) not in l_ppl_in_diff_puma_ten:
                    l_ppl_in_diff_puma_ten[(puma, ten)] = [p_id]
                else:
                    l_ppl_in_diff_puma_ten[(puma, ten)].append(p_id)
            io_stats.write("\tComplete V1 assignment ---- end time %s\n" %
                           (datetime.datetime.now()))
        else:
            print("\n\nAll TEN-PUMA10 combinations used in S_CC\n\n")
        io_stats.write(
            "Partition people and households by PUMA10-TEN ---- end time %s\n"
            % (datetime.datetime.now()))
        lp.save_time("%s\n" % (datetime.datetime.now()))
        # CHECK CC VIOLATIONS
        io_stats.write("\n\nCCs with target counts minus counts in solution\n")
        with open(CC_q_filename, "r") as q:
            err = 0
            for l in q:
                k_v = l.rstrip().split(":")
                pred = k_v[0]
                target_count = int(k_v[1])
                cursor.execute("SELECT COUNT(*) FROM V1 WHERE " + pred)
                ans = cursor.fetchone()[0]
                io_stats.write("%5d - %5d = %5d \t%s\n" %
                               (target_count, ans, target_count - ans, pred))
                err += abs(target_count - ans)
        lp.save_tot_L1_err("%s (Step 1 final)\n" % (err))
        conn.commit()
        cursor.close()
        conn.close()
    except Exception as e:
        print("Uh oh, can't connect. Invalid dbname, user or password?")
        print(e)
    io_stats.close()
    return l_ppl_in_diff_puma_ten, l_hhs_in_diff_puma_ten
def main(CC_q_filename, V1_CC_filename, output_filename):
    """Integer-program (Ax=b) driver for completing V1.

    Pipeline:
      1. (Re)create working tables V1, V2 and p_with_hid.
      2. Parse cardinality constraints (S_CC) from V1_CC_filename.
      3. Intervalize AGEP from the CCs, bin existing tuple types by those
         intervals, and read the active TEN / PUMA10 domains from the DB.
      4. Encode each CC as a 0/1 row of A over (PUMA10, TEN, tuple-type)
         variables and solve the integer program with PuLP.
      5. Translate the solution x into UPDATE statements on V1, randomly
         complete leftover tuples, and log CC violations.

    NOTE(review): indentation reconstructed from a collapsed source; the
    nesting chosen below follows the only semantically coherent reading.
    """
    ''' ----------------------------------------------- List vars ------------------------------------------------ '''
    type_of_tuples = []
    num_types = 0
    n_men = 0  # Number of male tuple types (after dedup); used to split rows by SEX
    n = 0  # Total number of IP variables = num_types * n_ten * n_puma
    n_age = 0
    n_ten = 0
    n_puma = 0
    aug_q = []
    aug_counts = []
    S_CC = []
    b_S_CC = []
    A = []  # Constraint matrix for Ax = b
    io_stats = open(output_filename, "a")
    io_stats.write(
        "-------------------------------------------------------------------------------------------------------------------------------------\n"
    )
    io_stats.write("\nCreate V2 ---- start time %s\n" %
                   (datetime.datetime.now()))
    try:
        connect_str = conn_str.get_conn_str()
        conn = psycopg2.connect(connect_str)
        cursor = conn.cursor()
        cursor.execute("DROP TABLE IF EXISTS V2 CASCADE")
        cursor.execute("CREATE TABLE V2 AS SELECT h_id, PUMA10, TEN FROM " +
                       h_table)
        cursor.execute("DROP TABLE IF EXISTS V1 CASCADE")
        cursor.execute("CREATE TABLE V1 AS SELECT * FROM " + p_table)
        # -1 is the "unassigned" sentinel for both R2 columns.
        cursor.execute("ALTER TABLE V1 ADD COLUMN PUMA10 INTEGER DEFAULT -1")
        cursor.execute("ALTER TABLE V1 ADD COLUMN TEN INTEGER DEFAULT -1")
        # Also, create the final relation, i.e., p_with_hid(p_id, AGEP, RELP, SEX, h_id)
        cursor.execute("DROP TABLE IF EXISTS p_with_hid CASCADE")
        cursor.execute("CREATE TABLE p_with_hid AS SELECT * FROM " + p_table)
        cursor.execute(
            "ALTER TABLE p_with_hid ADD COLUMN h_id INTEGER DEFAULT -1")
        cursor.execute("SELECT COUNT(*) FROM " + h_table)
        n_hh = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM " + p_table)
        n_ppl = cursor.fetchone()[0]
        save_num_HHs_ppl(n_hh, n_ppl)
        conn.commit()
        cursor.close()
        conn.close()
    except Exception as e:
        print("Uh oh, can't connect. Invalid dbname, user or password?\n")
        print(e)
    io_stats.write("Create V2 ---- end time %s\n" % (datetime.datetime.now()))
    io_stats.write(
        "----------------------------------------------------------------------------\n"
    )
    io_stats.write("CCs to Ax=b ---- start time %s\n" %
                   (datetime.datetime.now()))
    save_time(datetime.datetime.now())
    V1_CCs = open(V1_CC_filename, "r")
    for line in V1_CCs:
        # Line format: "<agep>,<relp>,<sex>,<ten>,<puma>:<target count>",
        # each field either -1 or an "[lo-hi]" interval.
        k_v = line.rstrip().split(":")
        l = k_v[0].split(",")
        constraint_l = []
        for i in range(5):
            if l[i] == '-1':
                constraint_l.append(-1)
            else:
                l[i] = l[i][1:-1].split("-")  # Remove "[", "]"
                l[i] = [int(j) for j in l[i]]
                constraint_l.append(l[i])
        S_CC.append(constraint_l)
        b_S_CC.append(int(k_v[1]))
    V1_CCs.close()
    set_num_S_CCs(len(b_S_CC))
    ''' --------------------------------------------------------------------------------------------------------------
    Get attr_intervalization for attributes in R1
    -------------------------------------------------------------------------------------------------------------- '''
    # 1. Store breaking points from the domain of attributes used in S_CC
    for constraint in S_CC:
        for i in range(num_R1_non_key_attrs):
            if constraint[i] != -1:
                for j in range(len(constraint[i])):
                    val = constraint[i][j]
                    if j == 1:
                        val += 1  # [0, 17] makes list [0, 18] for [0, 18), [18, ...)
                    if val not in attr_intervalization[i]:
                        attr_intervalization[i].append(val)
    # 2. Sort and adjust AGEP's intervals
    for i in range(num_R1_non_key_attrs):
        attr_intervalization[i].sort()
        if i == 0:  # AGEP domain is [0, 114]
            if 0 not in attr_intervalization[i]:
                l = [0]
                l.extend(attr_intervalization[i])
                attr_intervalization[i] = l
            if 115 not in attr_intervalization[i]:
                attr_intervalization[i].append(115)
    n_age = len(attr_intervalization[0])
    ''' --------------------------------------------------------------------------------------------------------------
    Get tuple types and merge using attrs that allow intervals in S_{CC}
    -------------------------------------------------------------------------------------------------------------- '''
    type_of_tuples = get_existing_tuple_types(p_table)
    print("\nType of tuples before binning by intervals = %s" %
          (len(type_of_tuples)))
    io_stats.write("\t\tType of tuples before binning by intervals = %s\n" %
                   (len(type_of_tuples)))
    # Update the agep value to the corresponding interval from agep intervalization
    # Rows arrive ordered by SEX then AGEP, so the interval cursor i only
    # needs resetting once, when the first SEX==2 row is reached.
    first_woman = False
    i = 0
    for tup in type_of_tuples:
        if tup[2] == 2 and not first_woman:
            i = 0
            first_woman = True
        u = attr_intervalization[0][i + 1]
        while tup[0] >= u:
            i += 1
            u = attr_intervalization[0][i + 1]
        l = attr_intervalization[0][i]
        u = attr_intervalization[0][i + 1]
        tup[0] = [l, u - 1]
    # Remove duplicates below
    no_dup = []
    for tup in type_of_tuples:
        if tup not in no_dup:
            no_dup.append(tup)
            if tup[2] == 1:
                n_men += 1
    type_of_tuples = no_dup
    num_types = len(type_of_tuples)
    print("Number of tuple types in R1 (using AGEP intervalization) = %s" %
          (num_types))
    io_stats.write(
        "\t\tNumber of tuple types in R1 (using AGEP intervalization) = %s\n" %
        (num_types))
    n = num_types
    ''' --------------------------------------------------------------------------------------------------------------
    Get active domain for attributes in R2, and total number of vars in IP
    -------------------------------------------------------------------------------------------------------------- '''
    try:
        connect_str = conn_str.get_conn_str()
        conn = psycopg2.connect(connect_str)
        cursor = conn.cursor()
        cursor.execute("SELECT array(SELECT DISTINCT TEN FROM " + h_table +
                       " ORDER BY TEN)")
        attr_intervalization[3] = list(cursor.fetchone()[0])
        n_ten = len(attr_intervalization[3])
        if n_ten > 0:
            n = n * n_ten
        cursor.execute("SELECT array(SELECT DISTINCT PUMA10 FROM " + h_table +
                       " ORDER BY PUMA10)")
        attr_intervalization[4] = list(cursor.fetchone()[0])
        n_puma = len(attr_intervalization[4])
        if n_puma > 0:
            n = n * n_puma
        conn.commit()
        cursor.close()
        conn.close()
    except Exception as e:
        print("Uh oh, can't connect. Invalid dbname, user or password?\n")
        print(e)
    print("\t\tDistinct values (TEN, PUMA10):\t%s\n\t\t%s\n" %
          (attr_intervalization[3], attr_intervalization[4]))
    print("Number of variables = %s" % (n))
    io_stats.write("\n\t\tNumber of variables = %2d\n" % (n))
    ''' --------------------------------------------------------------------------------------------------------------
    Use S_CC and add the corresponding x_i's to populate rows in A and b
    ------------------------------------------------------------------------------------------------------------------
    DO NOT augment input set of CCs with AGEP-RELP-SEX marginals from R_1
    -------------------------------------------------------------------------------------------------------------- '''
    aug_q = S_CC
    aug_counts = b_S_CC
    # Variable layout: x is indexed as [puma][ten][tuple-type], so each row
    # places row_sub in the (puma, ten) slice(s) the CC touches and pads the
    # rest with zeros.  NOTE(review): the explicit TEN branches below assume
    # TEN values 1..4 — confirm against the actual TEN domain.
    for i in aug_q:
        row = []
        if i[4] != -1:
            idx_puma = attr_intervalization[4].index(i[4][0])
            row = [0] * num_types * n_ten * idx_puma
        if i[0] == -1:
            row_sub = no_agep(i[1], i[2], type_of_tuples, n_men)
        else:
            row_sub = given_agep(i[0], i[1], i[2], type_of_tuples, n_men)
        if i[3] == -1:
            row.extend(row_sub * n_ten)
        elif i[3][0] == 1:
            row.extend(row_sub)
            if n_ten >= 2:
                row.extend([0] * int(n / (n_puma * n_ten) *
                                     (n_ten - 1)))  # Ignore other TENs
        elif i[3][0] == 2:
            row.extend([0] * int(n / (n_puma * n_ten)))
            row.extend(row_sub)
            if n_ten >= 3:
                row.extend([0] * int(n / (n_puma * n_ten) *
                                     (n_ten - 2)))  # Ignore other TENs
        elif i[3][0] == 3:
            row.extend([0] * int(2 * n / (n_puma * n_ten)))
            row.extend(row_sub)
            if n_ten == 4:
                row.extend([0] * int(n / (n_puma * n_ten))
                           )  # Ignore other TENs ... n_ten - 3 is 1 here
        elif i[3][0] == 4:
            row.extend([0] * int(3 * n / (n_puma * n_ten)))
            row.extend(row_sub)
        if i[4] != -1:
            num_pumas_rem = n_puma - attr_intervalization[4].index(
                i[4][0]) - 1
            row.extend([0] * int(n / n_puma) * num_pumas_rem)
        else:
            # No PUMA10 constraint: replicate the per-PUMA pattern across all PUMAs.
            row.extend(row * (n_puma - 1))
        A.append(row)
    print("\nSolving IP ....")
    prob = LpProblem("myProblem", LpMinimize)
    ppl_types = [i for i in range(n)]
    ppl_vars = LpVariable.dicts("person", ppl_types, lowBound=0,
                                cat='Integer')
    #print(ppl_vars)
    prob += 0  # Feasibility problem: constant objective
    for i in range(len(A)):
        prob += lpSum([A[i][j] * ppl_vars[j]
                       for j in ppl_types]) == aug_counts[i], "CC" + str(i)
    status = prob.solve()
    io_stats.write("\t\t******** STATUS = %s\n" % (LpStatus[prob.status]))
    print("\t\t******** STATUS = %s\n" % (LpStatus[prob.status]))
    x = [0 for i in range(n)]
    for v in prob.variables():
        #print(v.name, "=", v.varValue)
        if v.name != '__dummy':
            # Variable names look like "person_<idx>"
            x[int(v.name.split("_")[1])] = int(v.varValue)
    io_stats.write("\tCCs to Ax=b ---- end time %s\n\n" %
                   (datetime.datetime.now()))
    save_time("%s\n" % (datetime.datetime.now()))
    io_stats.write("\tAx=b to V1 ---- start time %s\n" %
                   (datetime.datetime.now()))
    save_time(datetime.datetime.now())
    # Create UPDATE queries to run on V_1 in PostgreSQL on the basis of x. We know the values x[i]'s encode.
    try:
        connect_str = conn_str.get_conn_str()
        conn = psycopg2.connect(connect_str)
        cursor = conn.cursor()
        # AGEP, RELP, SEX, TEN, PUMA10
        for p in range(len(attr_intervalization[4])):
            for t in range(len(attr_intervalization[3])):
                for i in range(num_types):
                    x_i_val = x[p * n_ten * num_types + t * num_types + i]
                    if x_i_val > 0:
                        agep_val = type_of_tuples[i][0]
                        relp_val = str(type_of_tuples[i][1])
                        sex_val = str(type_of_tuples[i][2])
                        ten_val = str(attr_intervalization[3][t])
                        puma_val = str(attr_intervalization[4][p])
                        query = "WITH num_tuples_per_x AS (SELECT * FROM V1 WHERE PUMA10=-1 AND AGEP>=" + str(agep_val[0]) + \
                                " AND AGEP<=" + str(agep_val[1]) + " AND RELP=" + relp_val + " AND SEX=" + sex_val + " LIMIT " + \
                                str(x_i_val) + ") UPDATE V1 SET PUMA10=" + puma_val + ", TEN=" + ten_val + \
                                " FROM num_tuples_per_x WHERE V1.p_id=num_tuples_per_x.p_id"
                        cursor.execute(query)
                        print(
                            query.split("AS ")[1].split(
                                " FROM num_tuples_per_x")[0])
        save_time(datetime.datetime.now())
        # COMPLETE TUPLES WITHOUT ANY ASSIGNMENT IN V1
        cursor.execute(
            "SELECT array(SELECT p_id FROM V1 WHERE PUMA10 = -1 AND TEN = -1)")
        tups_no_assign = list(cursor.fetchone()[0])
        print("\nNumber of tuples without assignment = %s\n" %
              (len(tups_no_assign)))
        for p_id in tups_no_assign:
            p = random.choice(dom_puma)
            t = random.choice(dom_ten)
            cursor.execute("UPDATE V1 SET PUMA10 = " + str(p) + ", TEN = " +
                           str(t) + " WHERE p_id = " + str(p_id))
        io_stats.write("\tAx=b to V1 ---- end time %s\n\n" %
                       (datetime.datetime.now()))
        save_time(datetime.datetime.now())
        # CHECK CC VIOLATIONS FOR CCs
        err = 0
        with open(CC_q_filename, "r") as q:
            for l in q:
                k_v = l.rstrip().split(":")
                pred = k_v[0]
                target_count = int(k_v[1])
                cursor.execute("SELECT COUNT(*) FROM V1 WHERE " + pred)
                ans = cursor.fetchone()[0]
                if target_count != ans:
                    if err == 0:
                        io_stats.write(
                            "\tCCs with target counts minus counts in V1\n")
                    io_stats.write(
                        "\t%5d - %5d = %5d \t%s\n" %
                        (target_count, ans, target_count - ans, pred))
                    err += abs(target_count - ans)
        save_tot_L1_err("%s (CC)\n" % (err))
        conn.commit()
        cursor.close()
        conn.close()
    except Exception as e:
        print("Uh oh, can't connect. Invalid dbname, user or password?\n")
        print(e)
    io_stats.write(
        "----------------------------------------------------------------------------\n\n\n\n\n"
    )
    io_stats.close()
def main(num_R1_non_key_attrs, S_CC, b_S_CC, p_table, h_table, CC_q_filename,
         output_filename):
    """Algo 1.2: turn the cardinality constraints S_CC into an integer program
    Ax = b, solve it with PuLP, and apply the integer solution x to table V1
    in PostgreSQL via UPDATE queries; finally report residual CC violations.

    Parameters
    ----------
    num_R1_non_key_attrs : int
        Number of non-key attributes of R1 (indexes the leading entries of
        each constraint in S_CC; position 0 is AGEP, which is an interval).
    S_CC : list
        Cardinality constraints. Each constraint is a list with one entry per
        attribute (AGEP, RELP, SEX, TEN, PUMA10); -1 means "unconstrained",
        otherwise the entry is a list of value(s) (AGEP is [lo, hi] inclusive).
    b_S_CC : list
        Target count for each constraint in S_CC (same order).
    p_table : str
        Name of the people table (R1) in PostgreSQL.
    h_table : str
        Name of the households table (R2) in PostgreSQL.
    CC_q_filename : str
        File of "predicate:target_count" lines used to measure CC violations
        in the final V1.
    output_filename : str
        Stats/log file; opened in append mode.

    Side effects: mutates the module-level ``attr_intervalization`` in place,
    updates V1 in the database, and records timings/errors via ``save_time``,
    ``save_tot_L1_err`` and the stats file.  Returns None.
    """
    io_stats = open(output_filename, "a")
    io_stats.write("Algo 1.2 ---- start time %s\n" % (datetime.datetime.now()))
    for i in range(len(S_CC)):
        io_stats.write("\t%5d %s\n" % (b_S_CC[i], S_CC[i]))
    io_stats.write("\n\tCCs to Ax=b ---- start time %s\n" %
                   (datetime.datetime.now()))
    save_time(datetime.datetime.now())
    set_num_S_CCs(len(S_CC))
    ''' ----------------------------------------------- List vars ------------------------------------------------ '''
    type_of_tuples = []  # distinct (AGEP, RELP, SEX) tuple types found in R1
    num_types = 0        # len(type_of_tuples) after interval binning
    n_men = 0            # number of tuple types with SEX == 1
    n = 0                # total number of IP variables (types * n_ten * n_puma)
    n_age = 0            # number of AGEP breaking points after intervalization
    n_ten = 0            # number of distinct TEN values
    n_puma = 0           # number of distinct PUMA10 values
    aug_q = []           # augmented constraint list (3-way marginal + S_CC)
    aug_counts = []      # target counts aligned with aug_q
    A = []               # IP constraint matrix, one row per entry of aug_q
    '''
    --------------------------------------------------------------------------------------------------------------
    Get attr_intervalization for attributes in R1
    --------------------------------------------------------------------------------------------------------------
    '''
    # 1. Store breaking points from the domain of attributes used in S_CC.
    # NOTE: attr_intervalization is a module-level global mutated in place.
    for constraint in S_CC:
        for i in range(num_R1_non_key_attrs):
            if constraint[i] != -1:
                for j in range(len(constraint[i])):
                    val = constraint[i][j]
                    if j == 1:
                        # Upper bound is inclusive in S_CC; +1 turns [0, 17]
                        # into breaking points [0, 18) / [18, ...).
                        val += 1  # [0, 17] makes list [0, 18] for [0, 18), [18, ...)
                    if val not in attr_intervalization[i]:
                        attr_intervalization[i].append(val)
    # 2. Sort and adjust AGEP's intervals
    for i in range(num_R1_non_key_attrs):
        attr_intervalization[i].sort()
        if i == 0:  # AGEP domain is [0, 114]
            # Ensure the intervalization covers the whole AGEP domain by
            # padding with the domain endpoints 0 and 115 (exclusive upper).
            if 0 not in attr_intervalization[i]:
                l = [0]
                l.extend(attr_intervalization[i])
                attr_intervalization[i] = l
            if 115 not in attr_intervalization[i]:
                attr_intervalization[i].append(115)
    n_age = len(attr_intervalization[0])
    # Mark which intervals given by AGEP's intervalization are used in S_CC
    agep_intervals_used_in_S_CC = []
    if n_age == 2:  # No AGEP value or [0, 114] specified in S_CC
        agep_intervals_used_in_S_CC.append([0, 114])
    else:
        for i in range(n_age - 1):
            age = attr_intervalization[0][i]
            next_age = attr_intervalization[0][i + 1]
            # An interval is "used" if any constraint's AGEP range contains
            # its lower endpoint; record each used interval once.
            for constraint in S_CC:
                if (constraint[0] != -1 and age >= constraint[0][0]
                        and age <= constraint[0][1]) and ([
                            age, next_age - 1
                        ] not in agep_intervals_used_in_S_CC):
                    agep_intervals_used_in_S_CC.append([age, next_age - 1])
                    break
    print(
        "\n\nAGEP intervals (post intervalization) that are used in the input set of CCs: %s\n\n"
        % (agep_intervals_used_in_S_CC))
    '''
    --------------------------------------------------------------------------------------------------------------
    Get tuple types and merge using attrs that allow intervals in S_{CC}
    --------------------------------------------------------------------------------------------------------------
    '''
    type_of_tuples = get_existing_tuple_types(p_table)
    print("\nType of tuples before binning by intervals = %s" %
          (len(type_of_tuples)))
    io_stats.write("\t\tType of tuples before binning by intervals = %s\n" %
                   (len(type_of_tuples)))
    # Update the agep value to the corresponding interval from agep intervalization.
    # NOTE(review): the interval cursor i only ever advances, and is reset
    # once when the first SEX==2 tuple appears — this assumes type_of_tuples
    # is ordered by SEX then ascending AGEP; confirm against
    # get_existing_tuple_types.
    first_woman = False
    i = 0
    for tup in type_of_tuples:
        if tup[2] == 2 and not first_woman:
            i = 0
            first_woman = True
        u = attr_intervalization[0][i + 1]
        while tup[0] >= u:
            i += 1
            u = attr_intervalization[0][i + 1]
        l = attr_intervalization[0][i]
        u = attr_intervalization[0][i + 1]
        # Replace the scalar AGEP with its enclosing inclusive interval.
        tup[0] = [l, u - 1]
    # Remove duplicates below (binning can merge several raw types into one)
    no_dup = []
    for tup in type_of_tuples:
        if tup not in no_dup:
            no_dup.append(tup)
            if tup[2] == 1:
                n_men += 1
    type_of_tuples = no_dup
    num_types = len(type_of_tuples)
    print("Number of tuple types in R1 (using AGEP intervalization) = %s" %
          (num_types))
    io_stats.write(
        "\t\tNumber of tuple types in R1 (using AGEP intervalization) = %s\n" %
        (num_types))
    n = num_types
    '''
    --------------------------------------------------------------------------------------------------------------
    Get active domain for attributes in R2, and total number of vars in IP
    --------------------------------------------------------------------------------------------------------------
    '''
    # Once we know the distinct values in TEN and PUMA10 (from S_{CC}), calculate number of variables
    try:
        connect_str = conn_str.get_conn_str()
        conn = psycopg2.connect(connect_str)
        cursor = conn.cursor()
        if len(attr_intervalization[1]) == 0:  # No RELP value used in S_CC
            cursor.execute("SELECT array(SELECT DISTINCT RELP FROM " +
                           p_table + " ORDER BY RELP)")
            attr_intervalization[1] = list(cursor.fetchone()[0])
        if len(attr_intervalization[2]) == 0:  # No SEX value used in S_CC
            cursor.execute("SELECT array(SELECT DISTINCT SEX FROM " + p_table +
                           " ORDER BY SEX)")
            attr_intervalization[2] = list(cursor.fetchone()[0])
        cursor.execute("SELECT array(SELECT DISTINCT TEN FROM " + h_table +
                       " ORDER BY TEN)")
        attr_intervalization[3] = list(cursor.fetchone()[0])
        n_ten = len(attr_intervalization[3])
        if n_ten > 0:
            n = n * n_ten
        cursor.execute("SELECT array(SELECT DISTINCT PUMA10 FROM " + h_table +
                       " ORDER BY PUMA10)")
        attr_intervalization[4] = list(cursor.fetchone()[0])
        n_puma = len(attr_intervalization[4])
        if n_puma > 0:
            n = n * n_puma
        conn.commit()
        cursor.close()
        conn.close()
    except Exception as e:
        print("Uh oh, can't connect. Invalid dbname, user or password?\n")
        print(e)
    print("\t\tDistinct values (TEN, PUMA10):\t%s\n\t\t%s\n" %
          (attr_intervalization[3], attr_intervalization[4]))
    print("Number of variables = %s" % (n))
    io_stats.write("\n\t\tNumber of variables = %2d\n" % (n))
    '''
    --------------------------------------------------------------------------------------------------------------
    Augment S_{CC} using 3-way (AGEP-RELP-SEX) marginal from R1
    --------------------------------------------------------------------------------------------------------------
    '''
    aug_q, aug_counts = get_counts_augment_R1_3way(
        S_CC, agep_intervals_used_in_S_CC, num_R1_non_key_attrs, p_table)
    aug_q.extend(S_CC)
    aug_counts.extend(b_S_CC)
    '''
    --------------------------------------------------------------------------------------------------------------
    Use S_CC and add the corresponding x_i's to populate rows in A and b
    --------------------------------------------------------------------------------------------------------------
    '''
    # Variables are laid out PUMA-major, then TEN, then tuple type:
    # x index = p * n_ten * num_types + t * num_types + i (see the UPDATE loop
    # below).  Each constraint row places the type-indicator sub-row (row_sub)
    # in the (PUMA, TEN) slots it constrains and zero-pads everything else.
    for i in aug_q:
        row = []
        if i[4] != -1:
            # Skip (zero out) all variable slots for PUMAs before this one.
            idx_puma = attr_intervalization[4].index(i[4][0])
            row = [0] * num_types * n_ten * idx_puma
        if i[0] == -1:
            row_sub = no_agep(i[1], i[2], type_of_tuples, n_men)
        else:
            row_sub = given_agep(i[0], i[1], i[2], type_of_tuples, n_men)
        if i[3] == -1:
            # No TEN constraint: the sub-row applies to every TEN value.
            row.extend(row_sub * n_ten)
        elif i[3][0] == 1:
            row.extend(row_sub)
            if n_ten >= 2:
                row.extend([0] * int(n / (n_puma * n_ten) *
                                     (n_ten - 1)))  # Ignore other TENs
        elif i[3][0] == 2:
            row.extend([0] * int(n / (n_puma * n_ten)))
            row.extend(row_sub)
            if n_ten >= 3:
                row.extend([0] * int(n / (n_puma * n_ten) *
                                     (n_ten - 2)))  # Ignore other TENs
        elif i[3][0] == 3:
            row.extend([0] * int(2 * n / (n_puma * n_ten)))
            row.extend(row_sub)
            if n_ten == 4:
                row.extend([0] * int(n / (n_puma * n_ten))
                           )  # Ignore other TENs ... n_ten - 3 is 1 here
        elif i[3][0] == 4:
            row.extend([0] * int(3 * n / (n_puma * n_ten)))
            row.extend(row_sub)
        if i[4] != -1:
            # Zero-pad the slots for all PUMAs after the constrained one.
            num_pumas_rem = n_puma - attr_intervalization[4].index(
                i[4][0]) - 1
            row.extend([0] * int(n / n_puma) * num_pumas_rem)
        else:
            # No PUMA constraint: replicate the one-PUMA pattern to all PUMAs.
            row.extend(row * (n_puma - 1))
        A.append(row)
    print("\nSolving IP ....")
    # Feasibility IP: zero objective, one equality constraint per row of A.
    prob = LpProblem("myProblem", LpMinimize)
    ppl_types = [i for i in range(n)]
    ppl_vars = LpVariable.dicts("person", ppl_types, lowBound=0, cat='Integer')
    #print(ppl_vars)
    prob += 0
    for i in range(len(A)):
        prob += lpSum([A[i][j] * ppl_vars[j]
                       for j in ppl_types]) == aug_counts[i], "CC" + str(i)
    status = prob.solve()
    io_stats.write("\t\t******** STATUS = %s\n" % (LpStatus[prob.status]))
    print("\t\t******** STATUS = %s\n" % (LpStatus[prob.status]))
    # Recover x from the solver: variable "person_<k>" maps to x[k].
    x = [0 for i in range(n)]
    for v in prob.variables():
        #print(v.name, "=", v.varValue)
        if v.name != '__dummy':
            x[int(v.name.split("_")[1])] = int(v.varValue)
    io_stats.write("\tCCs to Ax=b ---- end time %s\n\n" %
                   (datetime.datetime.now()))
    save_time(datetime.datetime.now())
    io_stats.write("\tAx=b to V1 ---- start time %s\n" %
                   (datetime.datetime.now()))
    save_time(datetime.datetime.now())
    # Create UPDATE queries to run on V_1 in PostgreSQL on the basis of x. We know the values x[i]'s encode.
    try:
        connect_str = conn_str.get_conn_str()
        conn = psycopg2.connect(connect_str)
        cursor = conn.cursor()
        # AGEP, RELP, SEX, TEN, PUMA10
        for p in range(len(attr_intervalization[4])):
            for t in range(len(attr_intervalization[3])):
                for i in range(num_types):
                    x_i_val = x[p * n_ten * num_types + t * num_types + i]
                    if x_i_val > 0:
                        agep_val = type_of_tuples[i][0]
                        relp_val = str(type_of_tuples[i][1])
                        sex_val = str(type_of_tuples[i][2])
                        ten_val = str(attr_intervalization[3][t])
                        puma_val = str(attr_intervalization[4][p])
                        # Assign (PUMA10, TEN) to x_i_val not-yet-assigned
                        # tuples of this type (PUMA10=-1 marks "unassigned").
                        # NOTE(review): SQL built by string concatenation; the
                        # values come from the DB/solver, not user input, but
                        # parameterized queries would be safer — consider
                        # cursor.execute(sql, params).
                        query = "WITH num_tuples_per_x AS (SELECT * FROM V1 WHERE PUMA10=-1 AND AGEP>=" + str(agep_val[0]) + \
                            " AND AGEP<=" + str(agep_val[1]) + " AND RELP=" + relp_val + " AND SEX=" + sex_val + " LIMIT " + \
                            str(x_i_val) + ") UPDATE V1 SET PUMA10=" + puma_val + ", TEN=" + ten_val + \
                            " FROM num_tuples_per_x WHERE V1.p_id=num_tuples_per_x.p_id"
                        cursor.execute(query)
                        print(
                            query.split("AS ")[1].split(
                                " FROM num_tuples_per_x")[0])
        #cursor.execute("SELECT COUNT(*) FROM V1 WHERE PUMA10=-1 OR TEN = -1")
        #print("\n\nNumber of tuples with missing/incomplete V1 assignment = %s\n\n" %(cursor.fetchone()[0]))
        io_stats.write("\tAx=b to V1 ---- end time %s\n\n" %
                       (datetime.datetime.now()))
        # CHECK CC VIOLATIONS FOR CCs
        # err accumulates the total L1 distance between target counts and
        # the counts actually realized in V1.
        err = 0
        with open(CC_q_filename, "r") as q:
            for l in q:
                k_v = l.rstrip().split(":")
                pred = k_v[0]
                target_count = int(k_v[1])
                cursor.execute("SELECT COUNT(*) FROM V1 WHERE " + pred)
                ans = cursor.fetchone()[0]
                if target_count != ans:
                    if err == 0:
                        io_stats.write(
                            "\tCCs with target counts minus counts in V1\n")
                    io_stats.write(
                        "\t%5d - %5d = %5d \t%s\n" %
                        (target_count, ans, target_count - ans, pred))
                    err += abs(target_count - ans)
        io_stats.write("Algo 1.2 ---- end time %s\n" %
                       (datetime.datetime.now()))
        save_time(datetime.datetime.now())
        save_tot_L1_err("%s\n" % (err))
        conn.commit()
        cursor.close()
        conn.close()
    except Exception as e:
        print("Uh oh, can't connect. Invalid dbname, user or password?\n")
        print(e)
    io_stats.write(
        "----------------------------------------------------------------------------\n\n\n\n\n"
    )
    io_stats.close()
V1_CC_filename = "V1_CC_dict_all_10_good.txt" output_filename = "details.txt" lp.main(CC_q_filename, V1_CC_filename, output_filename) io_stats = open(output_filename, "a") io_stats.write( "\n\n\tPartition people and households by PUMA10-TEN ---- start time %s\n" % (datetime.datetime.now())) '''------------------------------------------------------------------------------------------------------------------------ h_id UNKNOWN for our problem ------------------------------------------------------------------------------------------------------------------------''' # CREATE A DICTIONARY THAT PARTITIONS PEOPLE INTO (PUMA, TEN) l_ppl_in_diff_puma_ten = {} try: connect_str = conn_str.get_conn_str() conn = psycopg2.connect(connect_str) cursor = conn.cursor() cursor.execute("SELECT p_id, TEN, PUMA10 FROM V1") rows = cursor.fetchall() for row in rows: p_id = row[0] ten = int(row[1]) puma = row[2] if (puma, ten) not in l_ppl_in_diff_puma_ten: l_ppl_in_diff_puma_ten[(puma, ten)] = [p_id] else: l_ppl_in_diff_puma_ten[(puma, ten)].append(p_id) conn.commit()