def left_join_arrays(ar_left_in, ar_right_in):
    """Left-join two in-memory arrays into gl.out_array.

    Both sides are passed through prepare_array (presumably sorting and
    deduplicating them — TODO confirm against prepare_array), saved to disk,
    then merged with a key-based walk over both arrays.
    """
    check_void_right_array(ar_right_in)
    u.log("Preparing left array...")
    (ar_left, first_line_l) = prepare_array(ar_left_in)
    u.save_csv(ar_left, gl.OUT_LEFT)
    log_prepare(gl.OUT_LEFT, u.big_number(len(ar_left)))
    u.log("Preparing right array...")
    (ar_right, first_line_r) = prepare_array(ar_right_in)
    u.save_csv(ar_right, gl.OUT_RIGHT)
    log_prepare(gl.OUT_RIGHT, u.big_number(len(ar_right)))
    u.log("Joining both arrays...")
    init_while_join(first_line_l, first_line_r)
    # Merge walk: loop until both sides are flagged exhausted. The compare_*
    # helpers mutate gl state and feed keys into each other, so their call
    # order matters and must not be rearranged.
    while gl.END_LEFT is False or gl.END_RIGHT is False:
        (key_l, key_r) = update_key(ar_left, ar_right)
        key_l = compare_inf(key_l, key_r, ar_left)
        (key_l, key_r) = compare_sup(key_l, key_r, ar_left, ar_right)
        key_r = compare_equal(key_l, key_r, ar_left, ar_right)
        # incr_c_l returning True signals the walk is over (left side done).
        if incr_c_l(ar_left):
            break
    bn = u.big_number(len(gl.out_array))
    s = f"Output array generated. It has {bn} lines (including header)."
    u.log(s)
def finish(out_path, prompt, nb, start_time):
    """Log final counters, dump duplicate lists and close the sort run.

    - out_path: path of the generated output file (logging only)
    - prompt: when True, key duplicates trigger an interactive prompt
    - nb: non-zero suffix appended to the pure-duplicates file name
    - start_time: run start, used for the duration log line
    """
    dup_count = len(gl.dup_list)
    dup_key_count = len(gl.dup_key_list)
    written = u.big_number(gl.c_tot_out)
    removed = u.big_number(dup_count)
    u.log(
        f"Output file {out_path} successfully generated"
        f" ({written} lines written, {removed} pure duplicates removed)."
    )
    if dup_count:
        suffix = str(nb) if nb != 0 else ""
        out_dup = gl.OUT_DUP_FILE + suffix + gl.FILE_TYPE
        u.save_csv(gl.dup_list, out_dup)
        u.log(f"Pure duplicates list written in {out_dup}")
        u.log_example(gl.dup_list, "pure duplicates")
    if dup_key_count:
        if prompt:
            prompt_dup_key(dup_key_count)
        else:
            u.save_csv(gl.dup_key_list, gl.OUT_DUP_KEY_FILE)
            u.log(f"{dup_key_count} key duplicates found. List written in {gl.OUT_DUP_KEY_FILE}")
    u.log(f"[dq] sort_file: end ({u.get_duration_string(start_time)})")
def gen_last_file(out_path):
    """Handle end of input: sort the remaining in-memory list, then generate
    either the final output file (single-chunk case) or the last temporary
    file (multi-chunk case).

    Fix: removed the redundant `u.log(s.format())` calls — the strings are
    already fully-formatted f-strings, and calling .format() on them would
    raise if they ever contained literal braces; also unified the mixed
    .format()/f-string styles.
    """
    # Generation of the last temporary file
    gl.c_file += 1
    if gl.c_file == 1:
        # The whole input fitted in one in-memory list: write output directly.
        bn = u.big_number(gl.c_sf_read)
        u.log(f"Input file entirely read ({bn} lines). Sorting current list...")
        gl.cur_list.sort()
        u.log("Current list sorted. Generating output file...")
        gen_out_file(out_path)
        u.log(f"Output file saved in {out_path}")
    else:
        if len(gl.cur_list) > 0:
            bn = u.big_number(gl.c_sf_read)
            u.log(f"Input file entirely read ({bn} lines). Sorting last current list...")
            gl.cur_list.sort()
            u.log("Last current list sorted. Generating last temporary file"
                  f" (no. {gl.c_file})...")
            gen_temp_file()
            u.log("Temporary file successfully generated")
        else:
            # Nothing left to flush: undo the speculative counter increment.
            gl.c_file -= 1
        u.log(f"{gl.c_file} temporary files created")
def log_gen_query_list(elt_list, group_list):
    """Log a one-line summary of the generated query list."""
    n_elt = u.big_number(len(elt_list))
    n_grp = u.big_number(len(group_list))
    u.log(
        f"Query list built: {n_elt} elements to be processed distributed"
        f" in {n_grp} groups ({gl.NB_MAX_ELT_IN_STATEMENT} max per group)."
        f" They will be processed in parallel by {gl.MAX_DB_CNX} connection pools."
    )
def finish(out_path):
    """Log the final read/write counters for the run."""
    lines_1 = u.big_number(gl.c_1)
    lines_2 = u.big_number(gl.c_2)
    lines_out = u.big_number(gl.c_out)
    u.log(
        f"Output file successfully generated in {out_path}\n"
        f"\t\t{lines_1} lines read in file 1\n"
        f"\t\t{lines_2} lines read in file 2\n"
        f"\t\t{lines_out} lines written in output file"
    )
def found_msg(i, j):
    """Record the hit row in gl.c_row and log where the string was found.

    - i: 1-based position inside the current list/buffer
    - j: 0-based column of the match (logged 1-based)
    """
    gl.c_row = i
    local_nb = u.big_number(i)
    global_nb = u.big_number(gl.c_main)
    if gl.LINE_PER_LINE:
        msg = (f"String found in line no. {local_nb} of list no. {gl.c_list}"
               f" (global line no. {global_nb}) in col {j + 1}!")
    else:
        msg = (f"String found in buffer no. {global_nb}"
               f" (buffer list no. {gl.c_list}) in col {j + 1}!")
    u.log(msg)
def prompt_dup_key(n_dup_key):
    """Warn about key duplicates and ask the user how to proceed.

    Options a/b quit the program (a saves the list first); c/d continue
    (c saves the list first). Under gl.TEST_PROMPT_DK the answer is
    forced to 'c' without prompting.
    """
    u.log_print('|')
    bn = u.big_number(n_dup_key)
    u.log(f"Warning: {bn} different lines with the same research key were identified")
    u.log_example(gl.dup_key_list)
    menu = ("\nFile comparison may not work correctly. Here are your options:"
            "\na -> save duplicates list and quit"
            "\nb -> quit without saving duplicates list"
            "\nc -> save duplicates list and continue"
            "\nd -> continue without saving duplicates list")
    if gl.TEST_PROMPT_DK:
        u.log_print(menu)
        u.log_print('c (TEST_PROMPT_DK = True)')
        command = 'c'
    else:
        command = u.log_input(menu)
    u.log_print('|')
    if command in ('a', 'c'):
        u.save_csv(gl.dup_key_list, gl.OUT_DUP_KEY_FILE)
        u.log(f"List of key duplicates written in file {gl.OUT_DUP_KEY_FILE}")
    if command in ('a', 'b'):
        sys.exit()
def log_prepare(ar, bn_ar):
    """Log where the prepared array was saved and how many duplicates were dropped."""
    bn_dup = u.big_number(len(gl.dup_list))
    u.log(f"Array prepared and saved in {ar} ({bn_ar} lines, {bn_dup} duplicates dismissed)")
    u.log_example(gl.dup_list)
def finish_del_dup(out_list, out_path, open_out):
    """Persist the deduplicated list and optionally open the result file."""
    u.log(f"Saving list without duplicates in '{out_path}'...")
    u.save_list(out_list, out_path)
    u.log(f"List saved, it has {u.big_number(len(out_list))} lines")
    if open_out:
        u.startfile(out_path)
def finish_this(start_time):
    """Close the DB connection, remove the chunk-tracking temp file and log."""
    gl.cnx.close()
    os.remove(gl.tmp_file_chunk)
    exported = u.big_number(gl.c_main)
    duration = u.get_duration_string(start_time)
    u.log(f"{exported} lines exported")
    u.log(f"[sql] upload: end ({duration})")
def finish(out_path, start_time):
    """Write the filtered output file, log the summary, optionally open it."""
    u.log("Filtering over")
    read_nb = u.big_number(gl.n_r)
    out_nb = u.big_number(gl.n_o)
    u.log(f"{read_nb} lines read in the input file and"
          f" {out_nb} lines to be written in the output file")
    u.log("Writing output file...")
    u.save_csv(gl.out_list, out_path)
    u.log(f"Output file saved in {out_path}")
    u.log(f"[toolFilter] filter: end ({u.get_duration_string(start_time)})")
    u.log_print()
    if gl.OPEN_OUT_FILE:
        u.startfile(out_path)
def inject():
    """Log the start of DB injection, mentioning the recovery point if any."""
    msg = "Injecting data in DB"
    if gl.ref_chunk != 0:
        # Resuming a previous run: report the line we restart from.
        resume_line = u.big_number(gl.ref_chunk * gl.NB_MAX_ELT_INSERT)
        msg += f" (recovering from line {resume_line})"
    msg += "..."
    u.log(msg)
def finish_xml(out_path, start_time):
    """Log the XML-parsing summary and optionally open the output file."""
    written = u.big_number(gl.N_WRITE)
    duration = u.get_duration_string(start_time)
    u.log(f"[toolParseXML] parse_xml: end ({written} lines written in {duration})")
    u.log_print()
    if gl.OPEN_OUT_FILE:
        u.startfile(out_path)
def write_rows_finish(q_name, i, cnx_nb):
    """Log completion of row-writing for a query ('MONO' queries stay silent).

    The connection number is mentioned only when several connections are in
    use and this is not connection 0.
    """
    bn = u.big_number(i)
    if q_name == 'MONO':
        return
    if gl.MAX_DB_CNX == 1 or cnx_nb == 0:
        u.log(f"All lines written for query '{q_name}' ({bn} lines written)")
    else:
        u.log(f"All lines written for query '{q_name}'"
              f" ({bn} lines written, connection no. {cnx_nb})")
def finish_find_dup(dup_list, out_path, open_out):
    """Save the duplicates found (if any) and optionally open the file."""
    n = len(dup_list)
    if n == 0:
        u.log("No duplicates found")
        return
    u.log(f"{u.big_number(n)} duplicates found")
    u.log_example(dup_list)
    u.save_csv(dup_list, out_path)
    u.log(f"List of duplicates saved in {out_path}")
    if open_out:
        u.startfile(out_path)
def del_dup(in_path, out_path, open_out=False):
    """Deletes the duplicates in in_path file"""
    from .finish import finish_del_dup
    u.log("[toolDup] del_dup: start")
    u.log(f"Deleting duplicates in file '{in_path}'...")
    lines = u.load_txt(in_path)
    u.log(f"File loaded, {u.big_number(len(lines))} lines to be analysed")
    # Keep a detected header line out of the deduplication pass.
    if u.has_header(lines):
        result = [lines[0]] + del_dup_list(lines[1:])
    else:
        result = del_dup_list(lines)
    finish_del_dup(result, out_path, open_out)
    u.log("[toolDup] del_dup: end")
def find_dup(in_path, out_path='', open_out=False, col=0):
    """Finds the duplicates in in_path file

    - col: if the file is a csv, the duplicates will be searched
      in this column index
    """
    from .init import init_find_dup
    from .finish import finish_find_dup
    u.log("[toolDup] find_dup: start")
    (lines, out_path) = init_find_dup(in_path, out_path, col)
    u.log(f"File loaded, {u.big_number(len(lines))} lines to be analysed")
    duplicates = find_dup_list(lines)
    finish_find_dup(duplicates, out_path, open_out)
    u.log("[toolDup] find_dup: end")
def search_cur_list():
    """Scan the current temp list for gl.LOOK_FOR.

    Returns True (and sets gl.FOUND) on the first match, False otherwise.
    gl.c_main counts every element examined across all lists.
    """
    u.log(f"Temp list no. {gl.c_list} search", 1)
    for row_nb, elt in enumerate(gl.cur_list, start=1):
        gl.c_main += 1
        col = elt.find(gl.LOOK_FOR)
        if col != -1:
            found_msg(row_nb, col)
            gl.FOUND = True
            return True
    bn = u.big_number(gl.c_main)
    u.log(f"Temp list no. {gl.c_list} search over, string not found"
          f" ({bn} lines read in total)", 1)
    return False
def goto_eof(in_file):
    """Read in_file to EOF, keeping a sliding window of the last lines read.

    The window holds at most gl.N_READ + 1 entries; older entries are
    discarded as new ones arrive. The window is then logged, followed by a
    summary line whose wording depends on gl.LINE_PER_LINE mode.
    """
    cur_list = []
    line = read_file(in_file)
    # NOTE(review): the first line is appended without .strip("\n") while
    # every later line is stripped — looks inconsistent; confirm whether
    # read_file already strips the first read or this is intentional.
    cur_list.append(line)
    while line != "":
        line = read_file(in_file)
        cur_list.append(line.strip("\n"))
        # Cap the window size: drop the oldest entry once over the limit.
        if len(cur_list) > gl.N_READ + 1:
            del cur_list[0]
    u.log_array(cur_list)
    # gl.c_main was advanced past the EOF sentinel, hence the -1.
    bn = u.big_number(gl.c_main - 1)
    if gl.LINE_PER_LINE:
        s = f"EOF reached. {bn} lines read."
    else:
        s = f"EOF reached. {bn} buffers of {gl.BUFFER_SIZE} characters read."
    u.log(s)
def insert(script):
    """Insert the accumulated gl.data chunk into the DB with `script`.

    Chunks below gl.ref_chunk are skipped (recovery mode: they were already
    committed in a previous run). A marker file (gl.tmp_file_chunk) records
    commit progress so an interrupted run can resume.
    """
    if gl.c_chunk >= gl.ref_chunk:
        gl.data = [tuple(line) for line in gl.data]
        gl.c.executemany(script, gl.data)
        gl.c_chunk += 1
        snc = str(gl.c_chunk)
        # Mark the commit as in-flight before committing, then overwrite the
        # marker with the bare chunk number once the commit succeeded — this
        # lets a restart detect a half-finished commit.
        u.save_csv([f"{snc}_COMMIT_RUNNING"], gl.tmp_file_chunk)
        gl.cnx.commit()
        u.save_csv([snc], gl.tmp_file_chunk)
        sn = u.big_number(gl.c_main)
        u.log(f"{sn} lines inserted in total")
        # Recycle the cursor after each committed chunk.
        gl.c.close()
        gl.c = gl.cnx.cursor()
    else:
        # Recovery mode: chunk already committed previously, just count it.
        gl.c_chunk += 1
    # Reset the buffer for the next chunk in both branches.
    # NOTE(review): placement reconstructed from whitespace-mangled source —
    # confirm it is meant to run after the else branch too.
    gl.data = []
def compare_files(in_1, in_2, out_path):
    """Compare two sorted files; return True when they fully match."""
    from .csf import compare_sorted_files
    u.log("[dq] compare_files: start")
    start_time = time()
    u.gen_header(in_1, gl.COMPARE_FIELD, out_path)
    compare_sorted_files(in_1, in_2, out_path)
    matched = gl.c_diff == 0
    if matched:
        u.log("Files match")
    else:
        u.log(f"{u.big_number(gl.c_diff)} differences found")
    u.log(f"[dq] compare_files: end ({u.get_duration_string(start_time)})")
    return matched
def split_needed():
    """Return True when the output must be split into several files.

    When a split is required the user is prompted to continue (answering
    'n' exits); under gl.TEST_PROMPT_SPLIT the answer is forced to 'y'.
    """
    total_lines = gl.c_out
    file_count = ceil(total_lines / gl.MAX_LINE_SPLIT)
    if file_count == 1:
        return False
    # Recompute with file_count - 1 extra lines — presumably the header
    # repeated in each additional output file (TODO confirm).
    file_count = ceil((total_lines + file_count - 1) / gl.MAX_LINE_SPLIT)
    bn = u.big_number(gl.MAX_LINE_SPLIT)
    msg = (f"Input file has more than {bn} lines."
           f" It will be split in {file_count} files "
           f"(max file nb set to {gl.MAX_FILE_NB_SPLIT}). Continue? (y/n)")
    if gl.TEST_PROMPT_SPLIT:
        u.log(msg)
        u.log_print('y (TEST_PROMPT_SPLIT = True)')
        return True
    if u.log_input(msg) == "n":
        sys.exit()
    return True
def init_equal_diff_bool():
    """Initialise the gl.EQUAL / gl.DIFF flags controlling output content.

    When EQUAL_OUT is requested on a file larger than MAX_ROW_EQUAL_OUT
    rows, the user must confirm, since writing matching lines can produce a
    huge output.

    Fix: corrected typos in the user-facing warning ("file ... have" ->
    "files ... have", "paramter" -> "parameter").
    """
    if gl.EQUAL_OUT:
        if gl.c_sf_read <= gl.MAX_ROW_EQUAL_OUT:
            gl.EQUAL = True
            gl.DIFF = gl.DIFF_OUT
        else:
            bn = u.big_number(gl.MAX_ROW_EQUAL_OUT)
            s = (f"Warning: files to be compared have more than {bn} lines"
                 " and EQUAL_OUT parameter is set to True.\n"
                 "Do you want to write matching lines in output file ? (y/n)")
            if u.log_input(s) == "y":
                gl.EQUAL = True
                gl.DIFF = gl.DIFF_OUT
            else:
                gl.EQUAL = False
                gl.DIFF = True
    else:
        gl.EQUAL = False
        gl.DIFF = True
def finish_sbf(out_path, start_time):
    """Wrap up search_big_file: dump the context window around the hit, or
    log that the string was never found.

    Fix: removed the redundant `u.log(s.format())` — the string is an
    already-formatted f-string, and .format() would raise if out_path ever
    contained literal braces. Also replaced the manual clamp with max().
    """
    if gl.FOUND:
        half_window = gl.PRINT_SIZE // 2
        # Window of PRINT_SIZE lines centred on the hit row (1-based c_row).
        low = max(gl.c_row - 1 - half_window, 0)
        high = gl.c_row - 1 + half_window
        u.save_list(gl.cur_list[low:high], out_path)
        u.log(f"Current list written in {out_path}")
        if gl.OPEN_OUT_FILE:
            u.startfile(out_path)
    else:
        bn = u.big_number(gl.c_main)
        u.log(f"EOF reached ({bn} lines, {gl.c_list} temporary lists)"
              f", string '{gl.LOOK_FOR}' not found")
    u.log(f"[toolBF] search_big_file: end ({u.get_duration_string(start_time)})\n")
def check_max_row(counter):
    """Flush gl.cur_list to a sorted temporary file whenever the in-memory
    row counter hits the gl.MAX_ROW_LIST threshold, to avoid a memory error.

    Fix: removed the redundant `u.log(s.format())` on an already-formatted
    f-string (it would raise on literal braces) and the pointless
    list_nb/tmp_nb aliases of gl.c_file.
    """
    # It is checked whether max number of lines of cur_list is not more than
    # fixed limit in module (MAX_ROW_LIST) gl to avoid a memory error
    if counter % gl.MAX_ROW_LIST == 0:
        gl.c_file += 1
        bn = u.big_number(gl.MAX_ROW_LIST)
        u.log(f"Maximum number of lines reached ({bn} lines) for list"
              f" no. {gl.c_file}, sorting...")
        gl.cur_list.sort()
        u.log("Current list sorted. Generating temporary file"
              f" no. {gl.c_file}...")
        gen_temp_file()
        u.log("Temporary file successfully generated, input file reading goes on...")
        # Drop the flushed list and start a fresh one.
        del gl.cur_list
        gl.cur_list = []