def read_big_file(in_path, **kwargs):
    """Reads a potentially big file

    See in partools/tools/gl for other parameters (kwargs)
    See partools/quickstart/tools_bf.py for examples of use
    """
    from .init import init_rbf

    u.log("[toolBF] read_big_file: start")
    init_rbf()
    u.init_kwargs(gl, kwargs)
    with open(in_path, 'r', encoding='utf-8', errors='ignore') as in_file:
        # First line is logged outside the loop, then each subsequent line
        # is read and logged until the reader returns an empty string.
        cur_line = f.read_file(in_file)
        u.log_print(cur_line.strip("\n"))
        while cur_line != "":
            cur_line = f.read_file(in_file)
            u.log_print(cur_line.strip("\n"))
            gl.c_read += 1
            # Stop as soon as the counter check fails
            if not f.check_counter(in_file):
                break
    u.log("[toolBF] read_big_file: end\n")
def left_join_files(lpath='', rpath='', out='', debug=False):
    """Joints two files (lpath and rpath) on the first column of each file"""
    from .init import init_globals
    from .join import left_join_arrays

    u.log("[rl] left_join_files: start")
    t0 = time()
    if debug:
        gl.DEBUG_JOIN = True
    if lpath or rpath:
        init_globals()
        u.log(f"Loading arrays from '{lpath}' and '{rpath}'...")
        gl.ar_in = u.load_csv(lpath)
        ar_right = u.load_csv(rpath)
        u.log("Arrays loaded")
        u.log_print('|')
    else:
        # No explicit paths: left array is already in gl, load right from SQL output
        u.log("Loading right arrays...")
        ar_right = u.load_csv(gl.OUT_SQL)
        u.log("Right array loaded")
    left_join_arrays(gl.ar_in, ar_right)
    out = out or gl.OUT_PATH
    u.log("Saving output file...")
    u.save_csv(gl.out_array, out)
    u.log(f"Output file saved in {out}")
    u.log(f"[rl] left_join_files: end ({u.get_duration_string(t0)})")
    u.log_print('|')
def prepare_bdd():
    """Runs the configured preparation statements (if any) before data injection."""
    from .execute import execute

    if not gl.EXECUTE_KWARGS:
        return
    u.log("Preparing DB before data injection...")
    u.log_print("|")
    execute(**gl.EXECUTE_KWARGS)
def file_match(in1, in2, del_dup=False, err=True, out_path=''):
    """Compares two files and outputs the diff if the files don't match.

    Note that the files are sorted before comparison.
    (more generic than run_dq but doesn't work for big files)

    - del_dup: if true, duplicates are deleted before comparison
    - err: if True, an exception is raised when the files don't match
    - out_path: specifies an output path for file comparison different
      from default
    """
    u.log("[dq] file_match: start")
    if not out_path:
        out_path = u.g.dirs['OUT'] + 'file_match_out.csv'
    u.log(f"Comparing files '{in1}' and '{in2}'...")
    lines_a = u.load_txt(in1)
    lines_b = u.load_txt(in2)
    lines_a.sort()
    lines_b.sort()
    if del_dup:
        lines_a = del_dup_list(lines_a)
        lines_b = del_dup_list(lines_b)
    match = lines_a == lines_b
    u.log("Files match" if match else "Files don't match")
    if not match:
        f.diff_list(lines_a, lines_b, out_path)
        if err:
            u.startfile(out_path)
            assert match is True
    u.log("[dq] file_match: end")
    u.log_print()
def gen_query_list():
    """Builds gl.query_list by chunking the input elements.

    Each entry is a [group, padded_index] pair where the group holds at
    most gl.NB_MAX_ELT_IN_STATEMENT elements, ready to be substituted
    into the base query.

    Fixes vs. previous version: drops the unused counter `i`, factors
    the duplicated "flush current chunk" code into a helper, and guards
    the padding-width computation against an empty element list.
    """
    u.log("Building query list to be input in sql.dowload...")
    gl.query_var = sql.get_query(gl.QUERY_IN)
    check_var(gl.query_var)
    u.log_print(f"Base query:\n{gl.query_var}\n;")
    elt_list = prepare_elt_list(gl.ar_in)
    # Width of the zero-padded group index (e.g. 2 digits for 10-99 groups);
    # guard against log10(0) when elt_list is empty
    n_grp = math.ceil(len(elt_list) / gl.NB_MAX_ELT_IN_STATEMENT)
    size_elt_list = math.floor(math.log10(n_grp)) + 1 if n_grp > 0 else 1

    def _flush_group(chunk, idx):
        # Turns the current chunk into a [group, padded_index] entry
        idx_str = u.extend_str(idx, '0', size_elt_list, True)
        return [gen_group(chunk), idx_str]

    n = 0
    cur_elt_list, query_list = [], []
    for elt in elt_list:
        cur_elt_list.append(elt)
        if len(cur_elt_list) == gl.NB_MAX_ELT_IN_STATEMENT:
            n += 1
            query_list.append(_flush_group(cur_elt_list, n))
            cur_elt_list = []
    if cur_elt_list:
        # Last, partially-filled chunk
        n += 1
        query_list.append(_flush_group(cur_elt_list, n))
    gl.query_list = query_list
    log_gen_query_list(elt_list, query_list)
def check_var(query):
    """Raises u.g.E_MV when the mandatory variable placeholder is absent from query."""
    var = u.g.VAR_DEL + gl.VAR_IN + u.g.VAR_DEL
    if var in query:
        return
    u.log(f"Error: query must contain {var}")
    u.log_print("Query:")
    u.log_print(query)
    raise Exception(u.g.E_MV)
def finish_xml(out_path, start_time):
    """Logs the parse_xml wrap-up line and optionally opens the output file."""
    duration = u.get_duration_string(start_time)
    written = u.big_number(gl.N_WRITE)
    u.log(f"[toolParseXML] parse_xml: end ({written} lines written in {duration})")
    u.log_print()
    if gl.OPEN_OUT_FILE:
        u.startfile(out_path)
def process_query_init(elt, query, th_nb):
    """Logs which query is about to run (mono, single- or multi-connection mode)."""
    if elt == 'MONO':
        u.log("Executing query:")
        u.log_print(query + "\n;")
        return
    if gl.MAX_DB_CNX == 1:
        u.log(f"Executing query '{elt}'...")
    else:
        u.log(f"Executing query '{elt}' (connection no. {th_nb})...")
def init(kwargs):
    # Initialises a job: applies kwargs overrides onto gl, resets globals,
    # validates the input file header, then loads the input CSV into gl.ar_in.
    u.init_kwargs(gl, kwargs)
    init_globals()
    u.check_header(gl.IN_PATH)
    u.log(f"Loading input array from '{gl.IN_PATH}'...")
    gl.ar_in = u.load_csv(gl.IN_PATH)
    u.log("Input array loaded")
    u.log_print('|')
def finish_dq(start_time):
    """Logs the run_dq wrap-up, optionally shows a message box and opens the output."""
    dms, dstr = u.get_duration_string(start_time, True)
    msg = f"[dq] run_dq: end ({dstr})"
    u.log(msg)
    if gl.MSG_BOX_END:
        st.msg_box(msg, "dq", dms, gl.MIN_DUR_TRIGGER)
    u.log_print()
    if gl.OPEN_OUT_FILE:
        u.startfile(gl.paths["out"])
def debug(s, key_l, key_r, out_line=None):
    """Prints join-debugging state when gl.DEBUG_JOIN is set.

    - s: label for the debug dump
    - key_l / key_r: current left / right join keys
    - out_line: output line being built (defaults to an empty list)
    """
    # Fix: avoid the shared mutable default argument (out_line=[]);
    # the list is only printed, so the sentinel swap is behavior-identical.
    if out_line is None:
        out_line = []
    if not gl.DEBUG_JOIN:
        return
    print(s)
    print([gl.c_cl + 2, gl.c_cr + 2])
    print([key_l, key_r])
    print(out_line)
    u.log_array(gl.out_array)
    u.log_print()
def init_dq(kwargs):
    """Initialises a run_dq job: kwargs overrides, tmp dir and file paths."""
    u.log("[dq] run_dq: start")
    u.init_kwargs(gl, kwargs)
    init_tmp_dir()
    set_paths()
    u.log(
        f"run_dq job initialised. Input files {gl.paths['in1']} and {gl.paths['in2']}"
        " are going to be sorted and compared.")
    u.log_print('|')
def test_utils():
    # End-to-end check of the partools utility helpers, then workspace cleanup.
    init_PT()
    msg_box()
    get_duration()
    like()
    u.check_log(cl.CO)
    u.log_print()
    # Restore configuration and remove the test folder.
    # NOTE(review): 'back' is assumed to be a module-level saved copy of
    # pt.cfg.FILES_DIR - confirm it is set before this test runs.
    pt.cfg.FILES_DIR = back
    u.g.init_PT(True)
    u.delete_folder('PT_test_utils/')
def get_duration():
    """Checks u.get_duration_string against a few known durations."""
    u.log_print("Test string.get_duration", dashes=100)
    cases = [
        (0.35, "350 ms"),
        (5.369, "5.3 s"),
        (150, "2 minutes and 30 seconds"),
    ]
    for end, expected in cases:
        dstr = u.get_duration_string(0, end_time=end)
        u.log(dstr)
        assert dstr == expected
    u.log_print()
def sort_big_file(in_path, out_path):
    """Sorts a potentially big csv file according to the first column

    See partools/quickstart/tools_bf.py for examples of use
    """
    from partools.dq.init import init_tmp_dir
    # Alias avoids shadowing this wrapper with the dq implementation
    from partools.dq.sort import sort_big_file as dq_sort_big_file

    # To reinitialise MAX_ROW_LIST value when pytest is run
    reload(dq.gl)
    init_tmp_dir()
    u.log_print()
    dq_sort_big_file(in_path, out_path, main=True)
    u.log_print()
def get_bdd_date(cnx):
    """Fetches the DB-side IUTD reference date, normalised to 'YYYY/MM/DD'.

    - cnx: an open DB-API connection; a cursor is created and always closed.
    """
    c = cnx.cursor()
    try:
        query = get_iutd_query()
        u.log("Executing IUTD query: ")
        u.log_print(query)
        c.execute(query)
        u.log("Query executed")
        row = c.fetchone()
    finally:
        # Fix: the cursor was previously leaked; release it even on error
        c.close()
    # e.g. '2020-01-31 00:00:00' -> '2020/01/31'
    out = str(row[0]).replace('-', '/')
    return out[:10]
def read_tmp_file(tmp_file_path):
    """Reads one tmp file and returns its lines.

    Returns the string "empty" when the file does not exist (callers
    rely on this sentinel). A MemoryError is logged and re-raised.
    """
    try:
        with open(tmp_file_path, 'r', encoding='utf-8') as tmp_file:
            return tmp_file.readlines()
    except FileNotFoundError:
        return "empty"
    except MemoryError as e:
        # Fix: previous version logged the MemoryError class (not the
        # instance), dropped into breakpoint() in production code, and
        # then hit a NameError on the return; log and propagate instead.
        u.log_print(e)
        raise
def complete_dict():
    """Pads short tag columns and aborts on unexpectedly repeated tags."""
    for tag in gl.parse_dict:
        count = len(gl.parse_dict[tag])
        if count < gl.N_ROW - 1:
            # Tag missing on the current row: pad with an empty cell
            gl.parse_dict[tag].append('')
        elif count >= gl.N_ROW and tag != gl.FIRST_TAG:
            rec_id = gl.parse_dict[gl.FIRST_TAG][gl.N_ROW - 2]
            msg = (f"Warning: tag '{tag}' appears more than once (id = {rec_id})."
                   " It must be added to MULTI_TAG_LIST.")
            u.log(msg)
            u.log_print("Execution aborted")
            sys.exit()
def prompt_dup_key(n_dup_key):
    """Warns about duplicate research keys and asks the user how to proceed."""
    u.log_print('|')
    bn = u.big_number(n_dup_key)
    u.log(f"Warning: {bn} different lines with the same research key were identified")
    u.log_example(gl.dup_key_list)
    prompt = ("\nFile comparison may not work correctly. Here are your options:"
              "\na -> save duplicates list and quit"
              "\nb -> quit without saving duplicates list"
              "\nc -> save duplicates list and continue"
              "\nd -> continue without saving duplicates list")
    if gl.TEST_PROMPT_DK:
        u.log_print(prompt)
        u.log_print('c (TEST_PROMPT_DK = True)')
        command = 'c'
    else:
        command = u.log_input(prompt)
    u.log_print('|')
    if command in ('a', 'c'):
        u.save_csv(gl.dup_key_list, gl.OUT_DUP_KEY_FILE)
        u.log(f"List of key duplicates written in file {gl.OUT_DUP_KEY_FILE}")
    if command in ('a', 'b'):
        sys.exit()
def check_internal(recipients):
    """Asks for confirmation when some recipients are not internal addresses."""
    sint = gl.INTERNAL_STR
    u.log(f"Checking if all recipients are internal (ie. contain '{sint}')")
    not_int = [r for r in recipients if sint not in r]
    if not not_int:
        return
    if len(not_int) > 1:
        warning = f'Warning: "{not_int}" are not internal email addresses. Send anyways? (y/n)'
    else:
        warning = f'Warning: "{not_int}" is not an internal email address. Send anyways? (y/n)'
    if gl.TEST:
        u.log(warning)
        u.log_print('y (TEST = True)')
    elif u.log_input(warning) != 'y':
        sys.exit()
def iutd_file(d_now):
    # Compares the current date string (d_now) with the date stored in the
    # IUTD check file; returns True when they match, False otherwise.
    if exists(gl.iutd_path):
        d_old = u.load_txt(gl.iutd_path)[0]
        if d_now == d_old:
            # NOTE(review): 'gls' looks like it may be a typo for 'gl' -
            # confirm which module is meant to carry the 'iutd' flag
            gls.iutd = True
            u.log("IUTD check OK")
            return True
        else:
            u.log_print('|')
            s = "The date found in the check file doesn't match the current date"
            u.log(s)
            return False
    else:
        u.log_print('|')
        u.log("Can't find IUTD check file")
        return False
def recover():
    """Offers to resume a previously-interrupted query run from its tmp files."""
    file_list = u.list_files(gl.TMP_DIR, False)
    if not file_list:
        return
    prompt = "Work in progress detected. Recover? (y/n)"
    if gl.TEST_RECOVER:
        u.log(prompt)
        u.log_print("y (TEST_RECOVER = True)")
    elif u.log_input(prompt) == 'n':
        # User declined: wipe the tmp dir and start fresh
        u.mkdirs(gl.TMP_DIR, True)
        return
    modify_ql(file_list)
    u.log("Query list modified according previous work in progress. "
          f"Recovering from query '{gl.QUERY_LIST[0][1]}'.")
def finish(out_path, start_time):
    """Writes the filtered output file and logs the filter run summary."""
    u.log("Filtering over")
    read_bn = u.big_number(gl.n_r)
    out_bn = u.big_number(gl.n_o)
    u.log(f"{read_bn} lines read in the input file and"
          f" {out_bn} lines to be written in the output file")
    u.log("Writing output file...")
    u.save_csv(gl.out_list, out_path)
    u.log(f"Output file saved in {out_path}")
    duration = u.get_duration_string(start_time)
    u.log(f"[toolFilter] filter: end ({duration})")
    u.log_print()
    if gl.OPEN_OUT_FILE:
        u.startfile(out_path)
def iutd():
    # Exercises the IUTD (is-up-to-date) check of sql.connect in four
    # combinations: check file absent/present x DB date ok/ko.
    prepare_iutd(gl.INSERT_IUTD_OK)
    sql.gl.TEST_IUTD = True
    # Test no iutd file date db ok
    sql.connect()
    # Test iutd file date ok
    sql.connect()
    u.log_print()
    os.remove(sql.gl.iutd_path)
    prepare_iutd(gl.INSERT_IUTD_KO)
    sql.gl.TEST_IUTD = True
    # Test no iutd file date db ko
    sql.connect()
    # Test iutd file date ko
    sql.connect()
    sql.gl.TEST_IUTD = False
def finish(start_time):
    """Wraps up a reqlist run: duplicate check, timing log, optional output opening."""
    import partools.utils as u
    import partools.tools as to
    import partools.utils.sTools as st

    if gl.CHECK_DUP:
        u.log("Checking duplicates on the first column of the output file...")
        to.find_dup(gl.OUT_PATH, col=1)
        u.log_print('|')
    dms, dstr = u.get_duration_string(start_time, True)
    end_msg = f"reqlist: end ({dstr})"
    u.log("[rl] " + end_msg)
    if gl.MSG_BOX_END:
        st.msg_box(end_msg, "rl", dms, gl.MIN_DUR_TRIGGER)
    u.log_print()
    if gl.OPEN_OUT_FILE:
        u.startfile(gl.OUT_PATH)
def lauch_threads():
    """Runs every query of gl.QUERY_LIST, one thread per query, and waits for all."""
    from .connect import gen_cnx_dict

    if gl.range_query:
        rg_list = [item[1] for item in gl.QUERY_LIST]
        u.log(f"Ranges to be queried: {rg_list}")
    workers = []
    # One connection per thread, capped at MAX_DB_CNX
    n_cnx = min(gl.MAX_DB_CNX, len(gl.QUERY_LIST))
    gen_cnx_dict(n_cnx)
    for item in gl.QUERY_LIST:
        worker = Thread(target=process_ql_elt, args=(item, ))
        workers.append(worker)
        worker.start()
    for worker in workers:
        worker.join()
    u.log("All threads are done")
    u.log_print('|')
def check_recover():
    """Returns True when an interrupted injection should resume from the chunk file.

    Returns False (and removes the chunk file) when the user declines or the
    file is unreadable; returns None when no chunk file exists.
    """
    chunk = gl.tmp_file_chunk
    if not os.path.exists(chunk):
        return
    prompt = "Injection running detected. Recover? (y/n)"
    if gl.TEST_RECOVER:
        u.log(prompt)
        u.log_print("y (TEST_RECOVER = True)")
    elif u.log_input(prompt) == "n":
        os.remove(chunk)
        return False
    content = u.load_txt(chunk)
    try:
        gl.ref_chunk = int(content[0])
        return True
    except Exception as e:
        log.recover_fail(e, chunk, content)
        os.remove(chunk)
        return False
def like():
    """Checks the like / like_list / like_dict helper family."""
    u.log_print("Test of like functions", dashes=100)
    sample = '2 test ok?'
    assert u.like(sample, 'test')
    u.log("like simple ok")
    match = u.like(sample, '2 * ok?')
    assert match.group(1) == 'test'
    u.log("like m ok")
    assert u.like_list(sample, ['1', 'test'])
    u.log("like_list ok")
    assert u.like_dict(sample, {'1': 'a', '2': 'test'}) == '2'
    u.log("like_dict ok")
    u.log_print()
def sort_big_file(in_path, out_path, prompt=False, nb=0, main=False):
    # Sorts a potentially big csv file according to the first column.
    # The 'nb' input is used to differentiate input file when main run is dq.
    from .init import init_stf
    from .gstf import gen_sorted_temp_files

    u.log(f"[dq] sort_file: start ({in_path})")
    t0 = time()
    init_stf(in_path, out_path)
    gen_sorted_temp_files(in_path, out_path)
    u.log_print('|')
    tmp_count = gl.c_file
    if tmp_count > 1:
        # Several temporary files were produced: merge them into the output
        u.log(f"Generating sorted output file from {tmp_count} sorted temporary files...")
        merge_sorted_files(out_path)
    finish(out_path, prompt, nb, t0)
    if not main:
        u.log_print('|')
def split_needed():
    """Returns True when the output must be split into several files.

    Prompts the user (auto-accepts in test mode) and exits when refused.
    """
    n_line = gl.c_out
    n_out_files = ceil(n_line / gl.MAX_LINE_SPLIT)
    if n_out_files == 1:
        return False
    # Recompute with n_out_files - 1 extra lines (presumably one repeated
    # line per additional output file - TODO confirm)
    n_out_files = ceil((n_line + n_out_files - 1) / gl.MAX_LINE_SPLIT)
    bn = u.big_number(gl.MAX_LINE_SPLIT)
    prompt = (f"Input file has more than {bn} lines."
              f" It will be split in {n_out_files} files "
              f"(max file nb set to {gl.MAX_FILE_NB_SPLIT}). Continue? (y/n)")
    if gl.TEST_PROMPT_SPLIT:
        u.log(prompt)
        u.log_print('y (TEST_PROMPT_SPLIT = True)')
        return True
    if u.log_input(prompt) == "n":
        sys.exit()
    return True