Example #1
#standard-library and package imports needed by this excerpt;
#_execute is a helper defined elsewhere in the same module
import copy
import datetime
import os
import time

import coconnect.tools
from coconnect.tools.logger import Logger  #assumed location of Logger within the coconnect package


def _process_dict_data(ctx):
    logger = Logger("_process_dict_data")
    logger.info("ETL process has begun")

    interactive = ctx.obj['interactive']
    data = ctx.obj['data']
    clean = ctx.obj['clean']
    rules = ctx.obj['rules']
    bclink_helpers = ctx.obj['bclink_helpers']

    bclink_helpers.print_summary()

    #calculate the amount of time to wait before checking for changes
    tdelta = None
    if 'watch' in data:
        watch = data['watch']
        tdelta = datetime.timedelta(**watch)

    #get the input folder to watch
    input_folder = data['input']
    #get the root output folder
    output_folder = data['output']

    i = 0
    while True:
        #find subfolders containing data dumps
        subfolders = coconnect.tools.get_subfolders(input_folder)
        # if len(subfolders)>0:
        #     logger.info(f"Found {len(subfolders)} subfolders at path '{input_folder}'")
        # if interactive and len(subfolders)>0:
        #     questions = [
        #         inquirer.Checkbox('folders',
        #                           message="Confirm processing the following subfolders.. ",
        #                           choices=subfolders,
        #                           default=subfolders
        #                           )
        #         ]
        #     answers = inquirer.prompt(questions)
        #     if answers == None:
        #         os.kill(os.getpid(), signal.SIGINT)

        #     subfolders = {k:v for k,v in subfolders.items() if k in answers['folders']}
        #     logger.info(f"selected {subfolders}")

        logger.debug(
            f"Found and checking {len(subfolders)} subfolders")
        if len(subfolders) > 0:
            logger.debug(list(subfolders.values()))

        njobs = 0
        #print (reversed(sorted(subfolders.items(),key=lambda x: os.path.getmtime(x[1]))))
        for name, path in sorted(subfolders.items(),
                                 key=lambda x: os.path.getmtime(x[1])):
            output_folder_exists = os.path.exists(f"{output_folder}/{name}")

            inputs = coconnect.tools.get_files(path, type='csv')
            filtered_rules = coconnect.tools.remove_missing_sources_from_rules(
                rules, inputs)

            if output_folder_exists:
                output_tables = [
                    os.path.splitext(os.path.basename(x))[0]
                    for x in coconnect.tools.get_files(
                        f"{output_folder}/{name}", type='tsv')
                ]

                expected_outputs = list(filtered_rules['cdm'].keys())
                to_process = list(set(expected_outputs) - set(output_tables))

                if len(to_process) == 0:
                    continue

                filtered_rules = coconnect.tools.filter_rules_by_destination_tables(
                    filtered_rules, to_process)

            logger.debug(f"New data found!")
            logger.info(f"Creating a new task for processing {path}")

            if len(inputs) == 0:
                logger.critical(f"Subfolder contains no .csv files!")
                continue

            tables = list(filtered_rules['cdm'].keys())
            logger.debug(f'inputs: {inputs}')
            logger.info(f'cdm tables: {tables}')

            _data = copy.deepcopy(data)
            _data['input'] = inputs
            _data['output'] = f"{output_folder}/{name}"

            _execute(ctx,
                     data=_data,
                     rules=filtered_rules,
                     clean=clean if (i == 0 and njobs == 0) else False)
            njobs += 1

        if tdelta is None:
            break

        if njobs > 0 or i == 0:
            logger.info(
                f"Refreshing {input_folder} every {tdelta} to look for new subfolders...."
            )
            if len(subfolders.values()) == 0:
                logger.warning("No subfolders for data dumps yet found...")

        i += 1
        #wait for the configured interval before scanning again
        time.sleep(tdelta.total_seconds())
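For context, _process_dict_data is driven entirely by the payload stored on ctx.obj: the 'data' mapping must provide an 'input' folder to watch and an 'output' root, while the optional 'watch' block is unpacked directly into datetime.timedelta. A minimal sketch of that mapping follows, with hypothetical paths ('rules' and 'bclink_helpers' are built elsewhere and omitted here):

import datetime

#sketch of the 'data' mapping consumed by _process_dict_data;
#all paths are hypothetical placeholders
data = {
    'input': '/data/dumps',        #folder scanned for subfolders of csv dumps
    'output': '/data/processed',   #root folder for the per-dump outputs
    'watch': {'minutes': 5},       #kwargs passed straight to datetime.timedelta
}

#'watch' keys must therefore be valid timedelta keyword arguments
tdelta = datetime.timedelta(**data['watch'])
assert tdelta.total_seconds() == 300

If 'watch' is omitted, tdelta stays None and the while loop runs exactly once instead of polling.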
Example #2
#relies on the same imports as Example #1; _load_config and _execute
#are helpers defined elsewhere in the same module
def _process_list_data(ctx):
    logger = Logger("_process_list_data")
    logger.info("ETL process has begun")

    interactive = ctx.obj['interactive']
    data = []
    clean = ctx.obj['clean']
    rules = ctx.obj['rules']
    bclink_helpers = ctx.obj['bclink_helpers']
    config_file = ctx.obj['conf']
    conf = _load_config(config_file)
    rules_file = conf['rules']
    rules_file_last_modified = os.path.getmtime(rules_file)

    bclink_helpers.print_summary()
    display_msg = True
    _clean = clean

    while True:

        re_execute = False
        change_in_rules = False  #initialise so the combined check below is safe
        try:
            conf = _load_config(config_file)
        except Exception as e:
            #report the error only once until the config file is fixed
            if not display_msg:
                logger.critical(e)
                logger.error(
                    f"Your config file '{config_file}' is misconfigured! Please fix it!"
                )
            time.sleep(5)
            display_msg = True
            continue

        current_rules_file = conf['rules']
        new_rules_file = rules_file != current_rules_file
        if new_rules_file:
            #a different rules file has been configured
            logger.info(
                f"Detected a new rules file: old was '{rules_file}' and new is '{current_rules_file}'"
            )
            rules_file = current_rules_file
            rules = coconnect.tools.load_json_delta(rules_file, rules)
            rules_file_last_modified = os.path.getmtime(rules_file)
            re_execute = True
        else:
            #otherwise check for changes in the existing file
            new_rules_file_last_modified = os.path.getmtime(current_rules_file)
            change_in_rules = rules_file_last_modified != new_rules_file_last_modified
            if change_in_rules:
                logger.info(
                    f"Detected a change/update in the rules file '{rules_file}'"
                )
                rules = coconnect.tools.load_json_delta(
                    current_rules_file, rules)
                re_execute = True

        current_data = conf['data']
        if data != current_data:
            logger.debug(f"old {data}")
            logger.debug(f"new {current_data}")
            new_data = [obj for obj in current_data if obj not in data]
            logger.info(f"New data found! {new_data}")
            re_execute = True
        else:
            new_data = data

        logger.debug(f"re-execute {re_execute}")
        if re_execute:
            current_data = copy.deepcopy(new_data)
            #loop over any new data
            for item in new_data:
                if isinstance(item['input'], list):
                    inputs = item['input']
                else:
                    input_folder = item['input']
                    if not os.path.isdir(input_folder):
                        raise Exception(
                            f"{input_folder} is not a directory containing files!"
                        )
                    inputs = coconnect.tools.get_files(input_folder,
                                                       type='csv')
                filtered_rules = coconnect.tools.remove_missing_sources_from_rules(
                    rules, inputs)

                _execute(ctx, data=item, rules=filtered_rules, clean=_clean)
                _clean = False

            data += [x for x in current_data if x not in data]
            display_msg = True

        if new_rules_file or change_in_rules:
            #if there's a new rules file or a rules delta,
            #pick up the full rules for the next loop in case
            #we insert new data
            # --> we don't want to just apply the delta to the new data
            rules = coconnect.tools.load_json(current_rules_file)

        if not ctx.obj['listen_for_changes']:
            break

        if display_msg:
            logger.info(
                f"Finished!... Listening for changes to data in {config_file}")
            display_msg = False

        #poll the config file again after a short delay
        time.sleep(5)
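For reference, _load_config(config_file) must return a mapping with at least a 'rules' path and a 'data' list; each data item's 'input' may be either a directory of csv files or an explicit list of files. A minimal sketch of such a return value, with hypothetical paths ('output' is assumed by analogy with Example #1, since this function only reads 'input' before handing the item to _execute):

#sketch of the structure _load_config is expected to return;
#all paths are hypothetical placeholders
conf = {
    'rules': '/etl/rules.json',   #re-read every loop; changes trigger re-execution
    'data': [
        {'input': '/data/dump_2021_06_01',
         'output': '/data/processed/dump_2021_06_01'},
        {'input': ['/data/a.csv', '/data/b.csv'],
         'output': '/data/processed/manual'},
    ],
}

Because the loop compares the 'data' list against what it has already processed, appending a new item to the config file is enough to schedule a new ETL job on the next 5-second poll.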