Example No. 1
def read_big_file(in_path, **kwargs):
    """Reads a potentially big file

    See partools/tools/gl for the other available parameters (kwargs)

    See partools/quickstart/tools_bf.py for examples of use
    """
    from .init import init_rbf

    u.log("[toolBF] read_big_file: start")
    init_rbf()
    u.init_kwargs(gl, kwargs)
    with open(in_path, 'r', encoding='utf-8', errors='ignore') as in_file:
        line = f.read_file(in_file)
        u.log_print(line.strip("\n"))
        while line != "":
            line = f.read_file(in_file)
            u.log_print(line.strip("\n"))
            gl.c_read += 1
            if not f.check_counter(in_file):
                break

    u.log("[toolBF] read_big_file: end\n")
Example No. 2
def left_join_files(lpath='', rpath='', out='', debug=False):
    """Joints two files (lpath and rpath) on the first column of each file"""
    from .init import init_globals
    from .join import left_join_arrays

    u.log("[rl] left_join_files: start")
    start_time = time()
    if debug:
        gl.DEBUG_JOIN = True
    if lpath or rpath:
        init_globals()
        u.log(f"Loading arrays from '{lpath}' and '{rpath}'...")
        gl.ar_in = u.load_csv(lpath)
        ar_right = u.load_csv(rpath)
        u.log("Arrays loaded")
        u.log_print('|')
    else:
        u.log("Loading right arrays...")
        ar_right = u.load_csv(gl.OUT_SQL)
        u.log("Right array loaded")
    left_join_arrays(gl.ar_in, ar_right)
    if not out:
        out = gl.OUT_PATH
    u.log("Saving output file...")
    u.save_csv(gl.out_array, out)
    s = f"Output file saved in {out}"
    u.log(s)
    dstr = u.get_duration_string(start_time)
    u.log(f"[rl] left_join_files: end ({dstr})")
    u.log_print('|')
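
The join itself is delegated to left_join_arrays, which is not shown here. A minimal sketch of a left join on the first column of two in-memory arrays might look like the following; the function name, the empty padding value and the handling of unmatched rows are assumptions, not the partools implementation.

def left_join_arrays_sketch(ar_left, ar_right, empty=''):
    # Index the right array by its first column, then extend every left
    # row with the matching right columns (padded when there is no match).
    width_right = max((len(row) for row in ar_right), default=1) - 1
    lookup = {row[0]: list(row[1:]) for row in ar_right}
    out = []
    for row in ar_left:
        out.append(list(row) + lookup.get(row[0], [empty] * width_right))
    return out

# left_join_arrays_sketch([['1', 'a'], ['3', 'c']], [['1', 'x'], ['2', 'y']])
# -> [['1', 'a', 'x'], ['3', 'c', '']]
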
Example No. 3
def prepare_bdd():
    from .execute import execute

    if gl.EXECUTE_KWARGS:
        u.log("Preparing DB before data injection...")
        u.log_print("|")
        execute(**gl.EXECUTE_KWARGS)
Example No. 4
def file_match(in1, in2, del_dup=False, err=True, out_path=''):
    """Compares two files and outputs the diff if the files don't match.
    Note that the files are sorted before comparison.
    (more generic than run_dq but doesn't work for big files)

    - del_dup: if True, duplicates are deleted before comparison
    - err: if True, an exception is raised when the files don't match
    - out_path: overrides the default output path for the comparison file
    """

    u.log("[dq] file_match: start")

    if not out_path:
        out_path = u.g.dirs['OUT'] + 'file_match_out.csv'

    s = f"Comparing files '{in1}' and '{in2}'..."
    u.log(s)
    l1, l2 = u.load_txt(in1), u.load_txt(in2)
    l1.sort()
    l2.sort()
    if del_dup:
        l1, l2 = del_dup_list(l1), del_dup_list(l2)

    res = l1 == l2
    s = "Files match" if res else "Files don't match"
    u.log(s)

    if not res:
        f.diff_list(l1, l2, out_path)
        if err:
            u.startfile(out_path)
            assert res is True

    u.log("[dq] file_match: end")
    u.log_print()
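
The core of file_match (sort, optionally deduplicate, compare, dump a diff) can be sketched without the partools helpers. The names below and the diff format (one file holding the symmetric difference) are illustrative assumptions; f.diff_list may do something richer.

def file_match_sketch(path1, path2, out_path):
    # Compare two text files line by line after sorting; when they differ,
    # write the lines present in only one of the files to out_path.
    with open(path1, encoding='utf-8') as f1, open(path2, encoding='utf-8') as f2:
        l1, l2 = sorted(f1.readlines()), sorted(f2.readlines())
    if l1 == l2:
        return True
    set1, set2 = set(l1), set(l2)
    with open(out_path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(sorted((set1 - set2) | (set2 - set1)))
    return False
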
Example No. 5
def gen_query_list():
    u.log("Building query list to be input in sql.dowload...")

    gl.query_var = sql.get_query(gl.QUERY_IN)
    check_var(gl.query_var)
    u.log_print(f"Base query:\n{gl.query_var}\n;")

    elt_list = prepare_elt_list(gl.ar_in)
    n_grp = math.ceil(len(elt_list) / gl.NB_MAX_ELT_IN_STATEMENT)
    size_elt_list = math.floor(math.log10(n_grp)) + 1
    i, n = 0, 0
    cur_elt_list, query_list = [], []
    for elt in elt_list:
        cur_elt_list.append(elt)
        i += 1
        if len(cur_elt_list) % gl.NB_MAX_ELT_IN_STATEMENT == 0:
            n += 1
            n_str = u.extend_str(n, '0', size_elt_list, True)
            grp = gen_group(cur_elt_list)
            query_list.append([grp, n_str])
            cur_elt_list = []
    if len(cur_elt_list) > 0:
        n += 1
        n_str = u.extend_str(n, '0', size_elt_list, True)
        grp = gen_group(cur_elt_list)
        query_list.append([grp, n_str])

    gl.query_list = query_list
    log_gen_query_list(elt_list, query_list)
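
Stripped of the partools globals, the loop above is a "chunk and label" routine: split a list into groups of at most NB_MAX_ELT_IN_STATEMENT elements and pair each group with a zero-padded index. A standalone sketch (names are illustrative, not the partools code):

import math

def chunk_and_label(elt_list, max_per_group):
    # Split elt_list into groups of at most max_per_group elements and
    # pair each group with a zero-padded group number ('1', '2', ... or
    # '01', '02', ... depending on how many groups there are).
    n_grp = math.ceil(len(elt_list) / max_per_group)
    width = max(len(str(n_grp)), 1)
    return [[elt_list[i * max_per_group:(i + 1) * max_per_group],
             str(i + 1).zfill(width)]
            for i in range(n_grp)]

# chunk_and_label(['a', 'b', 'c', 'd', 'e'], 2)
# -> [[['a', 'b'], '1'], [['c', 'd'], '2'], [['e'], '3']]
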
Example No. 6
def check_var(query):
    var = u.g.VAR_DEL + gl.VAR_IN + u.g.VAR_DEL
    if var not in query:
        s = f"Error: query must contain {var}"
        u.log(s)
        u.log_print("Query:")
        u.log_print(query)
        raise Exception(u.g.E_MV)
Example No. 7
def finish_xml(out_path, start_time):

    dstr = u.get_duration_string(start_time)
    bn = u.big_number(gl.N_WRITE)
    s = f"[toolParseXML] parse_xml: end ({bn} lines written in {dstr})"
    u.log(s)
    u.log_print()
    if gl.OPEN_OUT_FILE:
        u.startfile(out_path)
Example No. 8
def process_query_init(elt, query, th_nb):

    if elt == 'MONO':
        u.log("Executing query:")
        u.log_print(query + "\n;")
    elif gl.MAX_DB_CNX == 1:
        u.log(f"Executing query '{elt}'...")
    else:
        u.log(f"Executing query '{elt}' (connection no. {th_nb})...")
Example No. 9
def init(kwargs):

    u.init_kwargs(gl, kwargs)
    init_globals()
    u.check_header(gl.IN_PATH)
    u.log(f"Loading input array from '{gl.IN_PATH}'...")
    gl.ar_in = u.load_csv(gl.IN_PATH)
    u.log("Input array loaded")
    u.log_print('|')
Example No. 10
def finish_dq(start_time):

    (dms, dstr) = u.get_duration_string(start_time, True)
    s = f"[dq] run_dq: end ({dstr})"
    u.log(s)
    if gl.MSG_BOX_END:
        st.msg_box(s, "dq", dms, gl.MIN_DUR_TRIGGER)
    u.log_print()
    if gl.OPEN_OUT_FILE:
        u.startfile(gl.paths["out"])
Example No. 11
def debug(s, key_l, key_r, out_line=[]):
    if not gl.DEBUG_JOIN:
        return

    print(s)
    print([gl.c_cl + 2, gl.c_cr + 2])
    print([key_l, key_r])
    print(out_line)
    u.log_array(gl.out_array)
    u.log_print()
Example No. 12
def init_dq(kwargs):
    u.log("[dq] run_dq: start")
    u.init_kwargs(gl, kwargs)
    init_tmp_dir()
    set_paths()
    s = (
        f"run_dq job initialised. Input files {gl.paths['in1']} and {gl.paths['in2']}"
        " are going to be sorted and compared.")
    u.log(s)
    u.log_print('|')
Example No. 13
def test_utils():
    init_PT()
    msg_box()
    get_duration()
    like()
    u.check_log(cl.CO)
    u.log_print()

    pt.cfg.FILES_DIR = back
    u.g.init_PT(True)
    u.delete_folder('PT_test_utils/')
Example No. 14
def get_duration():
    u.log_print("Test string.get_duration", dashes=100)
    dstr = u.get_duration_string(0, end_time=0.35)
    u.log(dstr)
    assert dstr == "350 ms"
    dstr = u.get_duration_string(0, end_time=5.369)
    u.log(dstr)
    assert dstr == "5.3 s"
    dstr = u.get_duration_string(0, end_time=150)
    u.log(dstr)
    assert dstr == "2 minutes and 30 seconds"
    u.log_print()
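
The three assertions above pin down the output format of u.get_duration_string. A hedged reimplementation that satisfies exactly those cases (and nothing more) could look like this; it is not the partools code, and the truncation rule for seconds is inferred from the 5.369 -> "5.3 s" assertion.

import time

def duration_string(start_time, end_time=None):
    # Format an elapsed time the way the assertions above expect:
    # milliseconds under one second, one truncated decimal under a
    # minute, and "X minutes and Y seconds" beyond that.
    if end_time is None:
        end_time = time.time()
    elapsed = end_time - start_time
    if elapsed < 1:
        return f"{round(elapsed * 1000)} ms"
    if elapsed < 60:
        return f"{int(elapsed * 10) / 10} s"
    minutes, seconds = divmod(int(elapsed), 60)
    return f"{minutes} minutes and {seconds} seconds"

# duration_string(0, 0.35) -> '350 ms'
# duration_string(0, 5.369) -> '5.3 s'
# duration_string(0, 150) -> '2 minutes and 30 seconds'
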
Example No. 15
def sort_big_file(in_path, out_path):
    """Sorts a potentially big csv file according to the first column

    See partools/quickstart/tools_bf.py for examples of use
    """
    from partools.dq.init import init_tmp_dir
    from partools.dq.sort import sort_big_file

    reload(dq.gl)  # To reinitialise MAX_ROW_LIST value when pytest is run
    init_tmp_dir()
    u.log_print()
    sort_big_file(in_path, out_path, main=True)
    u.log_print()
Example No. 16
def get_bdd_date(cnx):

    c = cnx.cursor()
    query = get_iutd_query()
    u.log("Executing IUTD query: ")
    u.log_print(query)
    c.execute(query)
    u.log("Query executed")
    out = c.fetchone()
    out = str(out[0]).replace('-', '/')
    out = out[:10]

    return out
Example No. 17
def read_tmp_file(tmp_file_path):
    # Reading one tmp file

    try:
        with open(tmp_file_path, 'r', encoding='utf-8') as tmp_file:
            tmp_file_list = tmp_file.readlines()
    except FileNotFoundError:
        tmp_file_list = "empty"
    except MemoryError as e:
        u.log_print(e)
        breakpoint()

    return tmp_file_list
Example No. 18
def complete_dict():

    for tag in gl.parse_dict:
        n = len(gl.parse_dict[tag])
        if n < gl.N_ROW - 1:
            gl.parse_dict[tag].append('')
        elif n >= gl.N_ROW and tag != gl.FIRST_TAG:
            id = gl.parse_dict[gl.FIRST_TAG][gl.N_ROW - 2]
            s = (f"Warning: tag '{tag}' appears more than once (id = {id})."
                 " It must be added to MULTI_TAG_LIST.")
            u.log(s)
            u.log_print("Execution aborted")
            sys.exit()
Example No. 19
def prompt_dup_key(n_dup_key):

    u.log_print('|')
    bn = u.big_number(n_dup_key)
    s = f"Warning: {bn} different lines with the same research key were identified"
    u.log(s)
    u.log_example(gl.dup_key_list)

    s = ("\nFile comparison may not work correctly. Here are your options:"
         "\na -> save duplicates list and quit"
         "\nb -> quit without saving duplicates list"
         "\nc -> save duplicates list and continue"
         "\nd -> continue without saving duplicates list")
    if gl.TEST_PROMPT_DK:
        u.log_print(s)
        u.log_print('c (TEST_PROMPT_DK = True)')
        command = 'c'
    else:
        command = u.log_input(s)
    u.log_print('|')
    if command in ('a', 'c'):
        u.save_csv(gl.dup_key_list, gl.OUT_DUP_KEY_FILE)
        s = f"List of key duplicates written in file {gl.OUT_DUP_KEY_FILE}"
        u.log(s)
    if command in ('a', 'b'):
        sys.exit()
Example No. 20
def check_internal(recipients):

    sint = gl.INTERNAL_STR
    u.log(f"Checking if all recipients are internal (ie. contain '{sint}')")
    not_int = [elt for elt in recipients if sint not in elt]
    if not_int:
        if len(not_int) > 1:
            s = f'Warning: "{not_int}" are not internal email addresses. Send anyways? (y/n)'
        else:
            s = f'Warning: "{not_int}" is not an internal email address. Send anyways? (y/n)'

        if gl.TEST:
            u.log(s)
            u.log_print('y (TEST = True)')
        elif not u.log_input(s) == 'y':
            sys.exit()
Example No. 21
def iutd_file(d_now):
    if exists(gl.iutd_path):
        d_old = u.load_txt(gl.iutd_path)[0]
        if d_now == d_old:
            gls.iutd = True
            u.log("IUTD check OK")
            return True
        else:
            u.log_print('|')
            s = "The date found in the check file doesn't match the current date"
            u.log(s)
            return False
    else:
        u.log_print('|')
        u.log("Can't find IUTD check file")
        return False
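
The check-file pattern behind iutd_file (store a reference date in a small text file, compare it with the current date on the next run, refresh it when it is stale) can be sketched as follows. The function name, the file layout and the use of the local date instead of the DB date are assumptions.

from datetime import date
from os.path import exists

def check_date_file(check_path):
    # Return True when check_path already holds today's date; otherwise
    # (re)write today's date into it and return False.
    d_now = date.today().strftime('%Y/%m/%d')
    if exists(check_path):
        with open(check_path, encoding='utf-8') as check_file:
            d_old = check_file.readline().strip()
        if d_old == d_now:
            return True
    with open(check_path, 'w', encoding='utf-8') as check_file:
        check_file.write(d_now)
    return False
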
Example No. 22
def recover():

    file_list = u.list_files(gl.TMP_DIR, False)
    a = len(file_list)
    if a == 0:
        return

    s = "Work in progress detected. Recover? (y/n)"
    if gl.TEST_RECOVER:
        u.log(s)
        u.log_print("y (TEST_RECOVER = True)")
    elif u.log_input(s) == 'n':
        u.mkdirs(gl.TMP_DIR, True)
        return

    modify_ql(file_list)
    u.log("Query list modified according previous work in progress. "
          f"Recovering from query '{gl.QUERY_LIST[0][1]}'.")
Example No. 23
def finish(out_path, start_time):

    u.log("Filtering over")
    bn1 = u.big_number(gl.n_r)
    bn2 = u.big_number(gl.n_o)
    s = (f"{bn1} lines read in the input file and"
         f" {bn2} lines to be written in the output file")
    u.log(s)

    u.log("Writing output file...")
    u.save_csv(gl.out_list, out_path)
    s = f"Output file saved in {out_path}"
    u.log(s)
    dstr = u.get_duration_string(start_time)
    u.log(f"[toolFilter] filter: end ({dstr})")
    u.log_print()
    if gl.OPEN_OUT_FILE:
        u.startfile(out_path)
Example No. 24
def iutd():
    prepare_iutd(gl.INSERT_IUTD_OK)
    sql.gl.TEST_IUTD = True

    # Test: no IUTD check file, DB date OK
    sql.connect()
    # Test: IUTD check file present, date OK
    sql.connect()

    u.log_print()
    os.remove(sql.gl.iutd_path)
    prepare_iutd(gl.INSERT_IUTD_KO)
    sql.gl.TEST_IUTD = True

    # Test: no IUTD check file, DB date KO
    sql.connect()
    # Test: IUTD check file present, date KO
    sql.connect()
    sql.gl.TEST_IUTD = False
Example No. 25
def finish(start_time):
    import partools.utils as u
    import partools.tools as to
    import partools.utils.sTools as st

    if gl.CHECK_DUP:
        s = "Checking duplicates on the first column of the output file..."
        u.log(s)
        to.find_dup(gl.OUT_PATH, col=1)
        u.log_print('|')

    (dms, dstr) = u.get_duration_string(start_time, True)
    s = f"reqlist: end ({dstr})"
    u.log("[rl] " + s)
    if gl.MSG_BOX_END:
        st.msg_box(s, "rl", dms, gl.MIN_DUR_TRIGGER)
    u.log_print()
    if gl.OPEN_OUT_FILE:
        u.startfile(gl.OUT_PATH)
Example No. 26
def lauch_threads():
    from .connect import gen_cnx_dict

    if gl.range_query:
        rg_list = [elt[1] for elt in gl.QUERY_LIST]
        u.log(f"Ranges to be queried: {rg_list}")
    thread_list = []
    n_cnx = min(gl.MAX_DB_CNX, len(gl.QUERY_LIST))
    gen_cnx_dict(n_cnx)
    for elt in gl.QUERY_LIST:
        th = Thread(target=process_ql_elt, args=(elt, ))
        thread_list.append(th)
        th.start()

    for th in thread_list:
        th.join()

    u.log("All threads are done")
    u.log_print('|')
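
The concurrency pattern used above is simply "one thread per queued query, then join them all". A standalone sketch (the worker and item names are placeholders; the real code also caps the number of DB connections via gen_cnx_dict):

from threading import Thread

def run_in_threads(work_items, worker):
    # Start one thread per work item, as lauch_threads does above,
    # then block until every thread has finished.
    threads = []
    for item in work_items:
        th = Thread(target=worker, args=(item,))
        threads.append(th)
        th.start()
    for th in threads:
        th.join()

# run_in_threads(['query_01', 'query_02'], print)
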
Example No. 27
def check_recover():

    chunk = gl.tmp_file_chunk
    if os.path.exists(chunk):
        s = "Injection running detected. Recover? (y/n)"
        if gl.TEST_RECOVER:
            u.log(s)
            u.log_print("y (TEST_RECOVER = True)")
        elif u.log_input(s) == "n":
            os.remove(chunk)
            return False

        txt = u.load_txt(chunk)
        try:
            gl.ref_chunk = int(txt[0])
            return True
        except Exception as e:
            log.recover_fail(e, chunk, txt)
            os.remove(chunk)
            return False
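
The recovery mechanism relies on a small marker file holding the index of the last chunk processed. A hedged sketch of that idea (file format and helper names are assumptions, not the partools code):

import os

def save_chunk_marker(marker_path, chunk_index):
    # Persist the index of the last successfully processed chunk.
    with open(marker_path, 'w', encoding='utf-8') as marker:
        marker.write(str(chunk_index))

def load_chunk_marker(marker_path):
    # Return the saved chunk index, or 0 when there is nothing to
    # recover or the marker file is unreadable.
    if not os.path.exists(marker_path):
        return 0
    try:
        with open(marker_path, encoding='utf-8') as marker:
            return int(marker.readline().strip())
    except (ValueError, OSError):
        os.remove(marker_path)
        return 0
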
Example No. 28
def like():

    u.log_print("Test of like functions", dashes=100)

    s = '2 test ok?'
    assert u.like(s, 'test')
    u.log("like simple ok")

    m = u.like(s, '2 * ok?')
    assert m.group(1) == 'test'
    u.log("like m ok")

    lst = ['1', 'test']
    assert u.like_list(s, lst)
    u.log("like_list ok")

    dct = {'1': 'a', '2': 'test'}
    assert u.like_dict(s, dct) == '2'
    u.log("like_dict ok")
    u.log_print()
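
The assertions above constrain what u.like, u.like_list and u.like_dict return without showing their code. A hypothetical reimplementation consistent with these tests, treating '*' as a captured wildcard, might look like this (a guess at the behaviour, not the partools source):

import re

def like(s, pattern):
    # Return a match object when 'pattern' occurs in 's', with '*'
    # acting as a captured wildcard; None otherwise.
    regex = re.escape(pattern).replace(r'\*', '(.*)')
    return re.search(regex, s)

def like_list(s, patterns):
    # True when any pattern in the list matches.
    return any(like(s, p) for p in patterns)

def like_dict(s, dct):
    # Return the key of the first value that matches, else None.
    for key, pattern in dct.items():
        if like(s, pattern):
            return key
    return None
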
Example No. 29
def sort_big_file(in_path, out_path, prompt=False, nb=0, main=False):
    # Sorts a potentially big csv file according to the first column.
    # The 'nb' argument is used to differentiate the input files when the main run is dq.

    from .init import init_stf
    from .gstf import gen_sorted_temp_files

    u.log(f"[dq] sort_file: start ({in_path})")
    start_time = time()
    init_stf(in_path, out_path)
    gen_sorted_temp_files(in_path, out_path)
    u.log_print('|')
    nb_files = gl.c_file
    if nb_files > 1:
        s = f"Generating sorted output file from {nb_files} sorted temporary files..."
        u.log(s)
        merge_sorted_files(out_path)
    finish(out_path, prompt, nb, start_time)
    if not main:
        u.log_print('|')
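
The technique behind sort_big_file is an external merge sort: cut the input into chunks that fit in memory, sort and write each chunk to a temporary file, then merge the sorted temporary files into the output. A compact sketch using heapq.merge (it sorts whole lines rather than only the first column, and all names are illustrative, not the partools implementation):

import heapq
import os
import tempfile

def external_sort_sketch(in_path, out_path, max_rows=100_000):
    # Phase 1: write sorted temporary chunks of at most max_rows lines.
    tmp_paths = []
    with open(in_path, encoding='utf-8') as in_file:
        chunk = []
        for line in in_file:
            chunk.append(line)
            if len(chunk) >= max_rows:
                tmp_paths.append(dump_sorted_chunk(chunk))
                chunk = []
        if chunk:
            tmp_paths.append(dump_sorted_chunk(chunk))
    # Phase 2: merge the sorted chunks into the output file.
    tmp_files = [open(p, encoding='utf-8') for p in tmp_paths]
    with open(out_path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(heapq.merge(*tmp_files))
    for tmp_file in tmp_files:
        tmp_file.close()
    for p in tmp_paths:
        os.remove(p)

def dump_sorted_chunk(chunk):
    # Sort one in-memory chunk and write it to a temporary file.
    fd, path = tempfile.mkstemp(text=True)
    with os.fdopen(fd, 'w', encoding='utf-8') as tmp_file:
        tmp_file.writelines(sorted(chunk))
    return path
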
Example No. 30
def split_needed():
    n_line = gl.c_out
    n_out_files = ceil(n_line / gl.MAX_LINE_SPLIT)
    if n_out_files == 1:
        return False

    n_line_2 = n_line + n_out_files - 1
    n_out_files = ceil(n_line_2 / gl.MAX_LINE_SPLIT)
    bn = u.big_number(gl.MAX_LINE_SPLIT)
    s = (f"Input file has more than {bn} lines."
         f" It will be split in {n_out_files} files "
         f"(max file nb set to {gl.MAX_FILE_NB_SPLIT}). Continue? (y/n)")
    if gl.TEST_PROMPT_SPLIT:
        u.log(s)
        u.log_print('y (TEST_PROMPT_SPLIT = True)')
        return True
    if u.log_input(s) == "n":
        sys.exit()

    return True
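
The arithmetic in split_needed can be read as a two-pass count: first estimate the number of output files from the line count alone, then recount after adding one extra line per additional file (presumably a copied header line, which is an assumption). Standalone, with illustrative names:

from math import ceil

def count_split_files(n_lines, max_lines_per_file):
    # First pass: number of files ignoring the extra per-file lines.
    n_files = ceil(n_lines / max_lines_per_file)
    if n_files <= 1:
        return 1
    # Second pass: the n_files - 1 extra lines can push the total over
    # another boundary, so recount with them included.
    return ceil((n_lines + n_files - 1) / max_lines_per_file)

# count_split_files(250, 100) -> 3
# count_split_files(200, 100) -> 3 (the extra lines push past a boundary)
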