def test_dq():
    """Run the dq test scenarios in order, then check the log.

    Initialises the 'test_dq' log and the output directory, runs each group
    of ``td.dq_t`` / ``dq.file_match`` calls under a titled log section and
    finishes with ``u.check_log(td.CL)``.
    """

    def section(title):
        # Titled separator written to the log before each scenario group.
        u.log_print(title, dashes=100)

    def run_set_1(*args, **kwargs):
        # dq_t on the first input/output file set.
        td.dq_t(gl.IN11, gl.IN12, gl.OUT1, *args, **kwargs)

    def run_set_2(*args, **kwargs):
        # dq_t on the second input/output file set.
        td.dq_t(gl.IN21, gl.IN22, gl.OUT2, *args, **kwargs)

    def run_set_3(*args, **kwargs):
        # dq_t on the third input/output file set.
        td.dq_t(gl.IN31, gl.IN32, gl.OUT3, *args, **kwargs)

    u.init_log('test_dq', True)
    # NOTE(review): True is mkdirs' second argument, presumably a
    # "start from a clean directory" flag - confirm against u.mkdirs.
    u.mkdirs(gl.OUT_DIR, True)
    u.log_print()

    section("Test dq no header")
    # ttry receives the callable, an error constant and the call arguments
    # (presumably asserting that the error is raised - confirm in ttry).
    ttry(td.dq_t, g.E_MH, gl.IN_MH, gl.IN12, gl.OUT1)
    ttry(td.dq_t, g.E_DH, gl.IN11, gl.IN_DH, gl.OUT1)

    section("Test dup key")
    td.dq_t(gl.IN_DK, gl.IN12, gl.OUT1, tpd=True)

    section("Test different files comparison")
    dq.file_match(gl.REF1_F, gl.REF2_F, err=False, out_path=gl.OUT_FM)
    dq.file_match(gl.OUT_FM, gl.REF_FDM)

    section("Test dq No. 1")
    run_set_1(gl.REF1, 100, gl.REF_DUP1, sl=10)
    run_set_1(gl.REF1, 15, gl.REF_DUP1)
    run_set_1(gl.REF1_E, eq=True)

    section("Test dq No. 2")
    run_set_2(gl.REF2, 100, gl.REF_DUP2, 2)
    run_set_2(gl.REF2, 15, gl.REF_DUP2, 2)
    run_set_2(gl.REF2_E, eq=True)

    section("Test dq No. 3")
    run_set_3(gl.REF3, 15)
    run_set_3(gl.REF3_E, eq=True)
    run_set_3(gl.REF3, 100, tps=True, mls=6)
    td.file_match(gl.REF_SPLIT_3, gl.OUT_SPLIT_3)

    u.check_log(td.CL)
def init_globals():
    """Create the footprint-specific temp directory and set output paths.

    Calls ``get_footprint()`` first, then builds the directory path from
    ``u.g.dirs['TMP']``, ``gl.TMP_FOLDER`` and ``gl.footprint``, creates it,
    and stores the three output file paths on ``gl``.
    """
    get_footprint()

    base_dir = u.g.dirs['TMP'] + gl.TMP_FOLDER
    run_dir = base_dir + gl.footprint + '/'
    u.mkdirs(run_dir)

    # Output files all live inside the footprint-specific directory.
    gl.OUT_LEFT = run_dir + gl.OUT_LEFT_FILE
    gl.OUT_RIGHT = run_dir + gl.OUT_RIGHT_FILE
    gl.OUT_SQL = run_dir + gl.OUT_SQL_FILE
def init(kwargs):
    """Initialise the module globals and open the connection and cursor.

    Args:
        kwargs: Forwarded to ``u.init_kwargs(gl, kwargs)``.
    """
    from .connect import connect
    from .init import init_gl

    u.init_kwargs(gl, kwargs)
    init_gl()
    u.mkdirs(gl.TMP_DIR)

    # All counters start from zero.
    gl.ref_chunk = gl.c_main = gl.c_chunk = 0

    connection = connect()
    gl.cnx = connection
    gl.c = connection.cursor()
    gl.data = []
def move_tmp_folder():
    """Move every file listed in ``gl.TMP_DIR`` into ``gl.OUT_DIR``.

    Sets ``gl.MERGE_OK`` to False, (re)creates the output directory and
    logs progress before and after the move.
    """
    gl.MERGE_OK = False

    dest_dir = gl.OUT_DIR
    u.mkdirs(dest_dir, True)
    u.log(f"Output folder {dest_dir} created")

    names = u.list_files(gl.TMP_DIR, False)
    u.log(f"Moving {len(names)} files to the output folder....")
    for name in names:
        # Paths are built by plain concatenation, so both directory strings
        # are expected to end with a separator.
        move(gl.TMP_DIR + name, dest_dir + name)

    u.log(f"Files moved to {dest_dir}")
def test_tools():
    """Run the tools test scenarios in order, then check the log.

    Covers the xml, split, duplicate, filter and big-file tools, each under
    a titled log section, and finishes with ``u.check_log(tt.CL)``.
    """

    def section(title):
        # Titled separator written to the log before each scenario.
        u.log_print(title, dashes=100)

    def check_dup_out():
        # Compare the duplicates output with its reference file.
        dq.file_match(gl.DUP_OUT, gl.DUP_OUT_REF)

    u.init_log('test_tools', True)
    u.mkdirs(gl.OUT_DIR, True)
    u.log_print()

    section("Test tools.xml")
    tt.parse_xml()
    dq.file_match(gl.XML_OUT, gl.XML_OUT_REF)

    section("Test toolSplit")
    tt.split()

    section("Test toolDup - to.find_dup simple")
    to.find_dup(gl.DUP_IN, gl.DUP_OUT)
    u.log_print()
    check_dup_out()

    section("Test toolDup - to.find_dup col")
    # NOTE(review): no output path is passed here, yet the check below reads
    # gl.DUP_OUT - confirm find_dup's default output resolves to that path.
    to.find_dup(gl.DUP_COL_IN, col=1)
    u.log_print()
    check_dup_out()

    section("Test toolDup - to.del_dup + shuffle")
    to.shuffle_file(gl.DUP_IN, gl.SHUF_OUT)
    u.log_print()
    to.del_dup(gl.SHUF_OUT, gl.DUP_OUT)
    u.log_print()
    dq.file_match(gl.DUP_OUT, gl.DEL_DUP_OUT_REF)

    section("Test toolDup - to.find_dup_list")
    duplicates = to.find_dup_list(u.load_csv(gl.DUP_IN))
    u.save_csv(duplicates, gl.DUP_OUT)
    check_dup_out()

    section("Test toolFilter")
    tt.flt()

    section("Test BF")
    tt.read_big_file()
    tt.search_big_file()
    bf.sort_big_file(ts.gl.IN, gl.SORT_BF_OUT)
    dq.file_match(ts.gl.IN, gl.SORT_BF_OUT, del_dup=True)

    u.check_log(tt.CL)
def init_find_dup(in_path, out_path, col):
    """Load the values to scan for duplicates and resolve the output path.

    Args:
        in_path: Path of the input file.
        out_path: Output path; when falsy, a default one is built under
            ``u.g.dirs['TMP'] + gl.TMP_FOLDER`` (the directory is created).
        col: 0 to load the file with ``u.load_txt``; otherwise the 1-based
            index of the column to extract from the file loaded with
            ``u.load_csv``.

    Returns:
        A ``(values, out_path)`` tuple.
    """
    if not out_path:
        default_dir = u.g.dirs['TMP'] + gl.TMP_FOLDER
        u.mkdirs(default_dir)
        out_path = default_dir + gl.TMP_OUT

    prefix = "Searching duplicates in "
    if col == 0:
        u.log(f"{prefix} file {in_path}")
        return (u.load_txt(in_path), out_path)

    u.log(f"{prefix}column no. {col} of file {in_path}")
    # col is 1-based, hence the -1 when indexing each row.
    values = [row[col - 1] for row in u.load_csv(in_path)]
    # NOTE(review): header stripping is taken to apply to the column branch
    # only - confirm against the intended nesting.
    if u.has_header(values):
        values = values[1:]
    return (values, out_path)
def recover():
    """Offer to resume from files already present in ``gl.TMP_DIR``.

    Returns immediately when the directory lists no files. Otherwise the
    recovery prompt is either auto-answered (``gl.TEST_RECOVER`` truthy) or
    shown through ``u.log_input``; the answer 'n' resets the temp directory
    and returns. In every other case ``modify_ql`` is called with the file
    list and the new first query is logged.
    """
    pending = u.list_files(gl.TMP_DIR, False)
    if len(pending) == 0:
        return

    prompt = "Work in progress detected. Recover? (y/n)"
    if gl.TEST_RECOVER:
        # Test mode: log the prompt and a simulated "y" instead of asking.
        u.log(prompt)
        u.log_print("y (TEST_RECOVER = True)")
    else:
        answer = u.log_input(prompt)
        if answer == 'n':
            u.mkdirs(gl.TMP_DIR, True)
            return

    modify_ql(pending)
    first_query = gl.QUERY_LIST[0][1]
    u.log("Query list modified according previous work in progress. "
          f"Recovering from query '{first_query}'.")
def test_rl():
    """Run the rl (reqlist) test scenarios in order, then check the log.

    Returns right after log initialisation when
    ``ts.is_test_db_defined()`` is falsy. Otherwise it resets the working
    directories, runs the join tests, uploads the test data, runs the
    error, standard and interruption/recovery reqlist scenarios, cleans the
    test table and calls ``u.check_log(tr.CL)``.
    """

    def section(title):
        # Titled separator written to the log before each scenario.
        u.log_print(title, dashes=100)

    def expect_error(error, in_path, query):
        # Run reqlist through t.ttry together with the given error constant.
        t.ttry(tr.reqlist, error, in_path, gl.OUT_1, query)

    u.init_log('test_rl', True)
    if not ts.is_test_db_defined():
        return

    u.mkdirs(gl.TMP_DIR, True)
    u.mkdirs(ts.gl.TMP_DIR, True)
    u.mkdirs(gl.OUT_DIR, True)
    u.log_print()

    section('Test join')
    tr.left_join_files(gl.LEFT_1, gl.RIGHT_1, gl.OUT_JOIN_REF_1)
    tr.left_join_files(gl.LEFT_2, gl.RIGHT_2, gl.OUT_JOIN_REF_2)
    tr.left_join_files(gl.LEFT_3, gl.RIGHT_3, gl.OUT_JOIN_REF_3)

    section('Preparing DB')
    ts.upload(ts.gl.IN)
    # Keep only the first column of the uploaded file as the rl input.
    first_column = [row[0] for row in u.load_csv(ts.gl.IN)]
    u.save_csv(first_column, gl.IN_1)

    section('Test rl - no sql output')
    expect_error(u.g.E_VA, gl.IN_1, gl.QUERY_NO)

    section('Test rl - no var in query')
    expect_error(u.g.E_MV, gl.IN_1, gl.QUERY_MV)

    section('Test rl - missing header')
    # Dropping the first element yields the input saved as gl.IN_MH.
    u.save_csv(first_column[1:], gl.IN_MH)
    expect_error(u.g.E_MH, gl.IN_MH, gl.QUERY_1)

    section('Test rl - standard')
    tr.reqlist(gl.IN_1, gl.OUT_1, gl.QUERY_1, cnx=1)
    tr.reqlist(gl.OUT_1, gl.OUT_2, gl.QUERY_2)
    dq.file_match(ts.gl.IN, gl.OUT_2, del_dup=True)
    dq.file_match(t.gl.OUT_DUP_TMP, gl.OUT_DUP_REF)

    section('Test rl - interuption and recovery')
    u.mkdirs(gl.TMP_DIR, True)
    u.log_print()
    tr.reqlist_interrupted(gl.OUT_1, gl.OUT_3, gl.QUERY_2, cnx=6)
    tr.reqlist(gl.OUT_1, gl.OUT_3, gl.QUERY_2, True, cnx=6)
    dq.file_match(gl.OUT_2, gl.OUT_3)

    ts.clean_db([ts.gl.T_TEST])
    u.check_log(tr.CL)
def save_mail(HTMLbody):
    """Save the mail body to 'last_sent.html' inside ``gl.mail_dir``.

    Args:
        HTMLbody: Mail body, written as the single element of the list
            passed to ``u.save_list``.

    The resulting path is stored in ``gl.last_sent`` and logged.
    """
    u.mkdirs(gl.mail_dir)

    saved_path = gl.mail_dir + 'last_sent.html'
    gl.last_sent = saved_path

    u.save_list([HTMLbody], saved_path)
    u.log(f"Mail saved to {saved_path}")
def reset():
    """Reset the temp and output directories, logging start and end."""
    u.log("Resetting folders...")
    for folder in (gl.TMP_DIR, gl.OUT_DIR):
        # NOTE(review): True is mkdirs' second argument, presumably a
        # "start from a clean directory" flag - confirm against u.mkdirs.
        u.mkdirs(folder, True)
    u.log("Reset over\n")
def init_tmp_dir():
    """Set ``gl.TMP_DIR`` from the TMP root and ``gl.TMP_FOLDER``, then reset it."""
    tmp_path = u.g.dirs['TMP'] + gl.TMP_FOLDER
    gl.TMP_DIR = tmp_path
    u.mkdirs(tmp_path, True)