Example No. 1
def drop_duplicates(ctx):
    bclink_helpers = ctx['bclink_helpers']
    logger = Logger("drop_duplicates")

    logger.info("checking which tables exist")
    for cdm_table, bclink_table in bclink_helpers.table_map.items():
        #don't do this for the person table:
        #a person with the same sex and date of birth isn't necessarily a duplicate
        if cdm_table == "person":
            continue
        logger.info(f"Looking for duplicates in {cdm_table} ({bclink_table})")

        #if the table hasn't been created yet, skip it
        exists = bclink_helpers.check_table_exists(bclink_table)
        if not exists:
            continue
        #the helper works out the primary key and drops any duplicate rows
        dropped_duplicates = bclink_helpers.drop_duplicates(bclink_table)
        if len(dropped_duplicates) > 0:
            logger.warning(
                f"Found and dropped {len(dropped_duplicates)} duplicates in {bclink_table}"
            )
Example No. 2
def _execute(ctx, rules=None, data=None, clean=None, bclink_helpers=None):

    if data is None:
        data = ctx.obj['data']
    if clean is None:
        clean = ctx.obj['clean']
    if rules is None:
        rules = ctx.obj['rules']
    if bclink_helpers is None:
        bclink_helpers = ctx.obj['bclink_helpers']

    interactive = ctx.obj['interactive']
    steps = ctx.obj['steps']

    #only listen for changes when the full ETL (extract, transform, load) is being run
    ctx.obj['listen_for_changes'] = all(step in steps
                                        for step in ['extract', 'transform', 'load'])

    check_and_drop_duplicates = 'drop_duplicates' in steps

    logger = Logger("execute")
    logger.info(f"Executing steps {steps}")

    if clean and 'clean' in steps:
        logger.info("cleaning existing bclink tables")
        ctx.invoke(clean_tables, data=data)

    tables = list(rules['cdm'].keys())
    if interactive and ('extract' in steps or 'transform' in steps):
        choices = []
        for table in tables:
            source_tables = [
                f"{data['input']}/{x}"
                for x in coconnect.tools.get_source_tables_from_rules(
                    rules, table)
            ]
            choices.append((f"{table} ({source_tables})", table))
        questions = [
            inquirer.Checkbox('tables',
                              message="Confirm executing ETL for ... ",
                              choices=choices,
                              default=tables)
        ]
        answers = inquirer.prompt(questions)
        if answers is None:
            os.kill(os.getpid(), signal.SIGINT)
        tables = answers['tables']
        if len(tables) == 0:
            logger.info("no tables selected, skipping..")
            return
        rules = coconnect.tools.filter_rules_by_destination_tables(
            rules, tables)
        logger.info(f'cdm tables: {tables}')

    logger.info("Executing ETL...")

    #call any extracting of data
    #----------------------------------
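    #_extract returns the indexer, any existing global ids and the
    #(possibly updated) data configuration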
    extract_data = _extract(ctx, data, rules, bclink_helpers)
    indexer = extract_data.get('indexer')
    existing_global_ids = extract_data.get('existing_global_ids')
    data = extract_data.get('data')

    #----------------------------------

    inputs = data['input']
    output_folder = data['output']

    #call transform
    #----------------------------------
    _transform(ctx, rules, inputs, output_folder, indexer, existing_global_ids)
    #----------------------------------
    #remove this lookup file once done with it
    if existing_global_ids and os.path.exists(existing_global_ids):
        os.remove(existing_global_ids)

    if 'load' not in steps:
        logger.info("done!")
        return

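    #gather the tsv files written out by the transform step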
    cdm_tables = coconnect.tools.get_files(output_folder, type='tsv')
    if interactive:
        choices = []
        for x in cdm_tables:
            tab = os.path.splitext(os.path.basename(x))[0]
            bctab = bclink_helpers.get_bclink_table(tab)
            text = f"{x} --> {bctab} ({tab})"
            choices.append((text, x))
        options = [
            inquirer.Checkbox('cdm_tables',
                              message="Choose which CDM tables to load..",
                              choices=choices,
                              default=cdm_tables),
        ]
        answers = inquirer.prompt(options)
        if answers is None:
            os.kill(os.getpid(), signal.SIGINT)
        cdm_tables = answers['cdm_tables']
        if len(cdm_tables) == 0:
            logger.info("No tables chosen to be loaded..")
            return
        else:
            logger.info("Chosen to load...")
            logger.warning(cdm_tables)

    cdm_tables = [os.path.splitext(os.path.basename(x))[0] for x in cdm_tables]

    #separate out the 'global_ids' table (if present) so it can be
    #passed to the load step separately from the cdm tables
    try:
        idx_global_ids = cdm_tables.index('global_ids')
        global_ids = cdm_tables.pop(idx_global_ids)
    except ValueError:
        global_ids = None

    #call load
    #----------------------------------
    _load(ctx, output_folder, cdm_tables, global_ids, bclink_helpers)

    if check_and_drop_duplicates:
        #final check for duplicates
        logger.info("looking for duplicates and deleting any")
        ctx.invoke(drop_duplicates)

    bclink_helpers.print_report()
    logger.info("done!")
Example No. 3
def _extract(ctx, data, rules, bclink_helpers):
    if 'extract' not in ctx.obj['steps']:
        return {'data': data}

    logger = Logger("extract")
    logger.info("starting extraction processes")

    inputs = data['input']
    if isinstance(inputs, str):
        #keep the original path so the error messages refer to the
        #directory rather than the (possibly empty) list of files
        input_dir = inputs
        if not os.path.exists(input_dir):
            raise Exception(f"{input_dir} is not an existing path")
        if not os.path.isdir(input_dir):
            raise Exception(f"{input_dir} is not a dir!")
        inputs = coconnect.tools.get_files(input_dir)
        if len(inputs) == 0:
            raise Exception(f"No .csv files found in {input_dir}")

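    #the optional 'pseudonymise' block in the data config controls whether
    #the input files are pseudonymised before extraction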
    do_pseudonymise = False
    _pseudonymise = {}
    if 'pseudonymise' in data:
        _pseudonymise = data['pseudonymise']
        do_pseudonymise = True
        if 'do' in _pseudonymise:
            do_pseudonymise = _pseudonymise['do']

    if do_pseudonymise:
        chunksize = 1000
        if 'chunksize' in _pseudonymise:
            chunksize = _pseudonymise['chunksize']

        output = "./pseudonymised_input_data/"
        if 'output' in _pseudonymise:
            output = _pseudonymise['output']

        if 'salt' not in _pseudonymise:
            raise Exception("To use pseudonymise a salt must be provided!")
        salt = _pseudonymise['salt']

        logger.info(f"Running pseudonymisation on the input data {data}")
        if not isinstance(rules, dict):
            rules = coconnect.tools.load_json(rules)
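        #a map of each source table to its person id field, taken from the rules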
        person_id_map = coconnect.tools.get_person_ids(rules)

        input_map = {os.path.basename(x): x for x in inputs}

        inputs = []
        for table, person_id in person_id_map.items():
            if table not in input_map:
                logger.warning(f"Could not find table {table} in input_map")
                logger.warning(input_map)
                continue
            fin = input_map[table]

            logger.debug(fin)

            fout = ctx.invoke(pseudonymise,
                              input=fin,
                              output_folder=output,
                              chunksize=chunksize,
                              salt=salt,
                              person_id=person_id)
            inputs.append(fout)

        data.pop('pseudonymise')
        data['input'] = inputs

    #retrieve any existing global ids from bclink into a lookup file
    _dir = data['output']
    f_global_ids = f"{_dir}/existing_global_ids.tsv"
    f_global_ids = bclink_helpers.get_global_ids(f_global_ids)

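    #retrieve the current table indices; _execute passes this indexer
    #on to the transform step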
    indexer = bclink_helpers.get_indicies()
    return {
        'indexer': indexer,
        'data': data,
        'existing_global_ids': f_global_ids
    }
Example No. 4
def _process_dict_data(ctx):
    logger = Logger("_process_dict_data")
    logger.info("ETL process has begun")

    interactive = ctx.obj['interactive']
    data = ctx.obj['data']
    clean = ctx.obj['clean']
    rules = ctx.obj['rules']
    bclink_helpers = ctx.obj['bclink_helpers']

    bclink_helpers.print_summary()

    #calculate the amount of time to wait before checking for changes
    tdelta = None
    if 'watch' in data:
        watch = data['watch']
        tdelta = datetime.timedelta(**watch)

    #get the input folder to watch
    input_folder = data['input']
    #get the root output folder
    output_folder = data['output']

    i = 0
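    #poll the input folder indefinitely; if no 'watch' interval is
    #configured (tdelta is None) the loop runs only once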
    while True:
        #find subfolders containing data dumps
        subfolders = coconnect.tools.get_subfolders(input_folder)
        # if len(subfolders)>0:
        #     logger.info(f"Found {len(subfolders)} subfolders at path '{input_folder}'")
        # if interactive and len(subfolders)>0:
        #     questions = [
        #         inquirer.Checkbox('folders',
        #                           message="Confirm processing the following subfolders.. ",
        #                           choices=subfolders,
        #                           default=subfolders
        #                           )
        #         ]
        #     answers = inquirer.prompt(questions)
        #     if answers == None:
        #         os.kill(os.getpid(), signal.SIGINT)

        #     subfolders = {k:v for k,v in subfolders.items() if k in answers['folders']}
        #     logger.info(f"selected {subfolders}")

        logger.debug(f"Found and checking {len(subfolders)} subfolders")
        if len(subfolders) > 0:
            logger.debug(list(subfolders.values()))

        njobs = 0
        #process the subfolders in order of modification time (oldest first)
        for name, path in sorted(subfolders.items(),
                                 key=lambda x: os.path.getmtime(x[1])):
            output_folder_exists = os.path.exists(f"{output_folder}/{name}")

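            #drop any rules whose source files are not present in this subfolder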
            inputs = coconnect.tools.get_files(path, type='csv')
            filtered_rules = coconnect.tools.remove_missing_sources_from_rules(
                rules, inputs)

            if output_folder_exists:
                output_tables = [
                    os.path.splitext(os.path.basename(x))[0]
                    for x in coconnect.tools.get_files(
                        f"{output_folder}/{name}", type='tsv')
                ]

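                #any expected cdm table without an output file still needs processing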
                expected_outputs = list(filtered_rules['cdm'].keys())
                to_process = list(set(expected_outputs) - set(output_tables))

                if len(to_process) == 0:
                    continue

                filtered_rules = coconnect.tools.filter_rules_by_destination_tables(
                    filtered_rules, to_process)

            logger.debug("New data found!")
            logger.info(f"Creating a new task for processing {path}")

            if len(inputs) == 0:
                logger.critical("Subfolder contains no .csv files!")
                continue

            tables = list(filtered_rules['cdm'].keys())
            logger.debug(f'inputs: {inputs}')
            logger.info(f'cdm tables: {tables}')

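            #run the ETL for this subfolder, writing into its own output folder;
            #only pass clean=True for the very first job of the first iteration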
            _data = copy.deepcopy(data)
            _data['input'] = inputs
            _data['output'] = f"{output_folder}/{name}"

            _execute(ctx,
                     data=_data,
                     rules=filtered_rules,
                     clean=clean if (i == 0 and njobs == 0) else False)
            njobs += 1

        if tdelta is None:
            break

        if njobs > 0 or i == 0:
            logger.info(
                f"Refreshing {input_folder} every {tdelta} to look for new subfolders...."
            )
            if len(subfolders.values()) == 0:
                logger.warning("No subfolders for data dumps yet found...")

        i += 1
        time.sleep(tdelta.total_seconds())