def execute(ctx, run_as_daemon):
    logger = Logger("Execute")
    if run_as_daemon and daemon is None:
        raise ImportError(
            "You are trying to run in daemon mode, "
            "but the package 'daemon' hasn't been installed. "
            "pip install python-daemon. \n"
            "If you are running on a Windows machine, this package is not supported")

    if run_as_daemon and daemon is not None:
        stderr = ctx.obj['log']
        stdout = f'{stderr}.out'

        logger.info(f"running as a daemon process, logging to {stderr}")
        pidfile = TimeoutPIDLockFile('etl.pid', -1)
        logger.info(f"process_id in {pidfile}")

        with open(stdout, 'w+') as stdout_handle, open(stderr, 'w+') as stderr_handle:
            d_ctx = daemon.DaemonContext(working_directory=os.getcwd(),
                                         stdout=stdout_handle,
                                         stderr=stderr_handle,
                                         pidfile=pidfile)
            with d_ctx:
                _process_data(ctx)
    else:
        _process_data(ctx)
def check_tables(ctx):
    bclink_helpers = ctx['bclink_helpers']
    logger = Logger("check_tables")

    retval = {}
    logger.info("checking if tables exist")
    for bclink_table in bclink_helpers.table_map.values():
        retval[bclink_table] = bclink_helpers.check_table_exists(bclink_table)
    if bclink_helpers.global_ids:
        retval[bclink_helpers.global_ids] = bclink_helpers.check_table_exists(
            bclink_helpers.global_ids)
    logger.info(json.dumps(retval, indent=6))
    return retval
def delete_data(ctx):
    bclink_helpers = ctx['bclink_helpers']
    logger = Logger("delete_data")
    logger.info("deleting data...")

    data = ctx['data']
    input_data = data['input']
    output_data = data['output']

    folders = coconnect.tools.get_subfolders(output_data)
    options = [
        inquirer.Checkbox('folders',
                          message="Which data-dump do you want to remove?",
                          choices=list(folders.values())),
    ]
    selected_folders = inquirer.prompt(options)
    if selected_folders is None:
        os.kill(os.getpid(), signal.SIGINT)
    selected_folders = selected_folders["folders"]

    for selected_folder in selected_folders:
        files = coconnect.tools.get_files(selected_folder, type='tsv')
        options = [
            inquirer.Checkbox('files',
                              message="Confirm the removal of the following tsv files.. ",
                              choices=files,
                              default=files),
        ]
        selected_files = inquirer.prompt(options)
        if selected_files is None:
            os.kill(os.getpid(), signal.SIGINT)
        selected_files = selected_files["files"]
        for f in selected_files:
            bclink_helpers.remove_table(f)
            click.echo(f"Deleting {f}")
            os.remove(f)
def remove_missing_sources_from_rules(rules, tables):
    logger = Logger("remove_missing_sources_from_rules")
    tables = [os.path.basename(x) for x in tables]

    rules_copy = copy.deepcopy(rules)
    for destination_table, cdm_table in rules['cdm'].items():
        for table_name, sub_table in cdm_table.items():
            # look up the source table used by the first field of this rule set
            first = list(sub_table.keys())[0]
            source_table = sub_table[first]['source_table']
            if source_table not in tables:
                rules_copy['cdm'][destination_table].pop(table_name)
                logger.debug(f"removed {table_name} from rules")
        if not rules_copy['cdm'][destination_table]:
            rules_copy['cdm'].pop(destination_table)
            logger.debug(f"removed cdm table '{destination_table}' from rules")
    return rules_copy
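# Illustrative sketch (not part of the original module): one way
# remove_missing_sources_from_rules() could be used to trim a rules document
# down to the source files actually present in a data dump. The example paths
# ('rules.json', 'data/dump_2021/') are hypothetical, and the bare
# load_json/get_files helpers are assumed to be available in this module,
# as in the surrounding code.
def _example_filter_rules_for_dump():
    rules = load_json('rules.json')                     # full rules document
    inputs = get_files('data/dump_2021/', type='csv')   # csv files in this dump
    filtered = remove_missing_sources_from_rules(rules, inputs)
    # only destination tables whose source files exist remain in 'filtered'
    return filtered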
def diff_csv(file1, file2, separator=None, nrows=None):
    logger = Logger("CSV File Diff")
    if separator is None:
        sep1 = get_separator_from_filename(file1)
        sep2 = get_separator_from_filename(file2)
    else:
        sep1 = separator
        sep2 = separator

    df1 = pd.read_csv(file1, sep=sep1, nrows=nrows)
    df2 = pd.read_csv(file2, sep=sep2, nrows=nrows)

    exact_match = df1.equals(df2)
    if exact_match:
        return

    df = pd.concat([df1, df2]).drop_duplicates(keep=False)

    if len(df) > 0:
        logger.error(" ======== Differing Rows ========== ")
        logger.error(df)
        m = df1.merge(df2,
                      on=df.columns[0],
                      how='outer',
                      suffixes=['', '_'],
                      indicator=True)[['_merge']]
        m = m[~m['_merge'].str.contains('both')]
        file1 = file1.split('/')[-1]
        file2 = file2.split('/')[-1]
        m['_merge'] = m['_merge'].map({'left_only': file1, 'right_only': file2})
        m = m.rename(columns={'_merge': 'Only Contained Within'})
        m.index.name = 'Row Number'
        logger.error(m.reset_index().to_dict(orient='records'))
        raise DifferingRows("Something not right with the rows, changes detected.")
    elif len(df1.columns) != len(df2.columns):
        raise DifferingColumns('in df1 but not df2', list(set(df1.columns) - set(df2.columns)), '\n',
                               'in df2 but not df1', list(set(df2.columns) - set(df1.columns)))
    else:
        logger.error(" ======= Rows are likely in a different order ====== ")
        for i in range(len(df1)):
            # a row is reported only when none of its column values match
            if not (df1.iloc[i] == df2.iloc[i]).any():
                print('Row', i, 'is in a different location')
        raise Exception("differences detected")
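# Illustrative sketch (not part of the original module): comparing two
# delimited files with diff_csv(). The file names are hypothetical; the
# separator is inferred from the file extension unless passed explicitly.
def _example_compare_outputs():
    try:
        diff_csv('person.tsv', 'person_reference.tsv', nrows=1000)
    except DifferingRows as err:
        # row-level differences were found and have already been logged
        print(f"row mismatch: {err}")
    except DifferingColumns as err:
        # the two files do not share the same set of columns
        print(f"column mismatch: {err}")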
def coconnect(ctx, version, log_level, cprofile):
    if ctx.invoked_subcommand is None:
        if version:
            click.echo(cc.__version__)
        else:
            click.echo(ctx.get_help())
        return

    cc.params['debug_level'] = int(log_level)
    log = Logger("coconnect")

    if cprofile:
        import cProfile
        log.info("Profiling...")
        pr = cProfile.Profile()
        pr.enable()

        def callback():
            pr.disable()
            log.info("Profiling completed")
            pr.dump_stats('coconnect.prof')

        ctx.call_on_close(callback)
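# Illustrative sketch (not part of the original CLI): when --cprofile is used,
# the callback above dumps profiling stats to 'coconnect.prof'. The standard
# library's pstats module can then be used to inspect the hotspots.
def _example_read_profile(path='coconnect.prof'):
    import pstats
    stats = pstats.Stats(path)
    stats.sort_stats('cumulative').print_stats(20)  # top 20 calls by cumulative time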
def _load(ctx, output_folder, cdm_tables, global_ids, bclink_helpers):
    if 'load' not in ctx.obj['steps']:
        return
    logger = Logger("load")
    logger.info("starting loading data processes")

    logger.info("starting loading global ids")
    if global_ids:
        bclink_helpers.load_global_ids(output_folder)

    logger.info("starting loading cdm tables")
    bclink_helpers.load_tables(output_folder, cdm_tables)
def drop_duplicates(ctx):
    bclink_helpers = ctx['bclink_helpers']
    logger = Logger("drop_duplicates")

    retval = {}
    logger.info("checking tables for duplicates")
    for cdm_table, bclink_table in bclink_helpers.table_map.items():
        # don't do this for the person table:
        # a person with the same sex and date of birth isn't a duplicate
        if cdm_table == "person":
            continue
        logger.info(f"Looking for duplicates in {cdm_table} ({bclink_table})")
        # if the table hasn't been created, skip it
        exists = bclink_helpers.check_table_exists(bclink_table)
        if not exists:
            continue
        # find out what the primary key is and drop rows duplicating it
        dropped_duplicates = bclink_helpers.drop_duplicates(bclink_table)
        if len(dropped_duplicates) > 0:
            logger.warning(
                f"Found and dropped {len(dropped_duplicates)} duplicates in {bclink_table}")
def clean_tables(ctx, skip_local_folder, data=None):
    bclink_helpers = ctx['bclink_helpers']
    interactive = ctx['interactive']
    logger = Logger("clean_tables")

    if data is None:
        data = ctx['data']
    if isinstance(data, dict):
        output_folders = [data['output']]
    else:
        output_folders = [x['output'] for x in data]

    if interactive:
        tables = list(bclink_helpers.table_map.values())
        choices = [(f"{v} ({k})", v) for k, v in bclink_helpers.table_map.items()]
        tables.append(bclink_helpers.global_ids)
        choices.append((f'{bclink_helpers.global_ids} (global_ids)',
                        bclink_helpers.global_ids))
        questions = [
            inquirer.Checkbox('clean_tables',
                              message="Clean all-rows in which BCLink tables?",
                              choices=choices,
                              default=tables)
        ]
        answers = inquirer.prompt(questions)
        if answers is None:
            os.kill(os.getpid(), signal.SIGINT)
        tables = answers['clean_tables']
        bclink_helpers.clean_tables(tables)
    else:
        bclink_helpers.clean_tables()

    if skip_local_folder:
        return

    if interactive:
        questions = [
            inquirer.Checkbox('output_folders',
                              message="Clean the following output folders?",
                              choices=output_folders,
                              default=output_folders)
        ]
        answers = inquirer.prompt(questions)
        if answers is None:
            os.kill(os.getpid(), signal.SIGINT)
        output_folders = answers['output_folders']

    for output_folder in output_folders:
        if not os.path.exists(output_folder):
            logger.info(f"No folder to remove at '{output_folder}'")
            continue
        if os.path.exists(output_folder) and os.path.isdir(output_folder):
            logger.info(f"removing {output_folder}")
            shutil.rmtree(output_folder)
def load_json_delta(f_in, original):
    logger = Logger("load_json_delta")
    logger.info(f"loading a json from '{f_in}' as a delta")

    data = load_json(f_in)

    original_date = original['metadata']['date_created']
    data_date = data['metadata']['date_created']
    logger.info(f"Original JSON date: {original_date}")
    logger.info(f"New JSON date: {data_date}")

    _data = copy.deepcopy(data)
    for destination_table, rule_set in data['cdm'].items():
        for name, rules in rule_set.items():
            exists_in_original_rules = None
            if destination_table in original['cdm']:
                if name in original['cdm'][destination_table]:
                    exists_in_original_rules = original['cdm'][destination_table][name]

            if exists_in_original_rules:
                _data['cdm'][destination_table].pop(name)
            else:
                logger.info(f"Detected a new rule for {name}")
                logger.debug(json.dumps(rules, indent=6))
        if not _data['cdm'][destination_table]:
            _data['cdm'].pop(destination_table)

    logger.info(json.dumps(_data, indent=6))
    return _data
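# Illustrative sketch (not part of the original module): computing the delta
# between an updated rules file and the rules already processed, so that only
# the newly added rules are applied. The file names are hypothetical.
def _example_rules_delta():
    original = load_json('rules_v1.json')
    delta = load_json_delta('rules_v2.json', original)
    # 'delta' now contains only destination tables/rules that are present in
    # rules_v2.json but absent from rules_v1.json
    return delta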
def manual(ctx, rules, inputs, output_folder, clean, table_map, gui_user, user, database, dry_run):
    _rules = coconnect.tools.load_json(rules)
    destination_tables = list(_rules['cdm'].keys())

    data = {'input': list(inputs), 'output': output_folder}

    table_map = _get_table_map(table_map, destination_tables)
    bclink_settings = {
        'user': user,
        'gui_user': gui_user,
        'database': database,
        'dry_run': dry_run,
        'tables': table_map,
    }

    logger = Logger("Manual")
    logger.info(f'Rules: {rules}')
    logger.info(f'Inputs: {data["input"]}')
    logger.info(f'Output: {data["output"]}')
    logger.info(f'Clean Tables: {clean}')
    logger.info(f'Processing {destination_tables}')
    logger.info('BCLink settings:')
    logger.info(json.dumps(bclink_settings, indent=6))

    bclink_helpers = BCLinkHelpers(**bclink_settings)
    # pass the loaded rules document (not the file path) to the executor
    _execute(ctx, _rules, data, clean, bclink_helpers)
def _execute(ctx, rules=None, data=None, clean=None, bclink_helpers=None):
    if data is None:
        data = ctx.obj['data']
    if clean is None:
        clean = ctx.obj['clean']
    if rules is None:
        rules = ctx.obj['rules']
    if bclink_helpers is None:
        bclink_helpers = ctx.obj['bclink_helpers']

    interactive = ctx.obj['interactive']
    steps = ctx.obj['steps']
    ctx.obj['listen_for_changes'] = all(
        step in steps for step in ['extract', 'transform', 'load'])

    check_and_drop_duplicates = 'drop_duplicates' in steps

    logger = Logger("execute")
    logger.info(f"Executing steps {steps}")

    if clean and 'clean' in steps:
        logger.info("cleaning existing bclink tables")
        ctx.invoke(clean_tables, data=data)

    tables = list(rules['cdm'].keys())
    if interactive and ('extract' in steps or 'transform' in steps):
        choices = []
        # location = f"{output_folder}/{name}"
        for table in tables:
            source_tables = [
                f"{data['input']}/{x}"
                for x in coconnect.tools.get_source_tables_from_rules(rules, table)
            ]
            choices.append((f"{table} ({source_tables})", table))
        questions = [
            inquirer.Checkbox('tables',
                              message="Confirm executing ETL for ... ",
                              choices=choices,
                              default=tables)
        ]
        answers = inquirer.prompt(questions)
        if answers is None:
            os.kill(os.getpid(), signal.SIGINT)
        tables = answers['tables']
        if len(tables) == 0:
            logger.info("no tables selected, skipping..")
            return
        rules = coconnect.tools.filter_rules_by_destination_tables(rules, tables)

    logger.info(f'cdm tables: {tables}')
    logger.info("Executing ETL...")

    # call any extracting of data
    # ----------------------------------
    extract_data = _extract(ctx, data, rules, bclink_helpers)
    indexer = extract_data.get('indexer')
    existing_global_ids = extract_data.get('existing_global_ids')
    data = extract_data.get('data')
    # ----------------------------------

    inputs = data['input']
    output_folder = data['output']

    # call transform
    # ----------------------------------
    _transform(ctx, rules, inputs, output_folder, indexer, existing_global_ids)
    # ----------------------------------

    # remove this lookup file once done with it
    if existing_global_ids and os.path.exists(existing_global_ids):
        os.remove(existing_global_ids)

    if 'load' not in steps:
        logger.info("done!")
        return

    cdm_tables = coconnect.tools.get_files(output_folder, type='tsv')
    if interactive:
        choices = []
        for x in cdm_tables:
            tab = os.path.splitext(os.path.basename(x))[0]
            bctab = bclink_helpers.get_bclink_table(tab)
            text = f"{x} --> {bctab} ({tab})"
            choices.append((text, x))
        options = [
            inquirer.Checkbox('cdm_tables',
                              message="Choose which CDM tables to load..",
                              choices=choices,
                              default=cdm_tables),
        ]
        answers = inquirer.prompt(options)
        if answers is None:
            os.kill(os.getpid(), signal.SIGINT)
        tables_to_load = answers['cdm_tables']
        cdm_tables = tables_to_load
        if len(cdm_tables) == 0:
            logger.info("No tables chosen to be loaded..")
            return
        else:
            logger.info("Chosen to load...")
            logger.warning(cdm_tables)

    cdm_tables = [os.path.splitext(os.path.basename(x))[0] for x in cdm_tables]
    try:
        idx_global_ids = cdm_tables.index('global_ids')
        global_ids = cdm_tables.pop(idx_global_ids)
    except ValueError:
        global_ids = None

    # call load
    # ----------------------------------
    _load(ctx, output_folder, cdm_tables, global_ids, bclink_helpers)

    if check_and_drop_duplicates:
        # final check for duplicates
        logger.info("looking for duplicates and deleting any")
        ctx.invoke(drop_duplicates)

    bclink_helpers.print_report()
    logger.info("done!")
def _transform(ctx, rules, inputs, output_folder, indexer, existing_global_ids):
    if 'transform' not in ctx.obj['steps']:
        return
    logger = Logger("transform")
    logger.info("starting data transform processes")

    if isinstance(inputs, str):
        inputs = [inputs]

    logger.info(f"inputs: {inputs}")
    logger.info(f"output_folder: {output_folder}")
    logger.info(f"indexer: {indexer}")
    logger.info(f"existing_global_ids: {existing_global_ids}")

    ctx.invoke(run,
               rules=rules,
               inputs=inputs,
               output_folder=output_folder,
               indexing_conf=indexer,
               person_id_map=existing_global_ids)
def _process_dict_data(ctx):
    logger = Logger("_process_dict_data")
    logger.info("ETL process has begun")

    interactive = ctx.obj['interactive']
    data = ctx.obj['data']
    clean = ctx.obj['clean']
    rules = ctx.obj['rules']
    bclink_helpers = ctx.obj['bclink_helpers']

    bclink_helpers.print_summary()

    # calculate the amount of time to wait before checking for changes
    tdelta = None
    if 'watch' in data:
        watch = data['watch']
        tdelta = datetime.timedelta(**watch)

    # get the input folder to watch
    input_folder = data['input']
    # get the root output folder
    output_folder = data['output']

    i = 0
    while True:
        # find subfolders containing data dumps
        subfolders = coconnect.tools.get_subfolders(input_folder)

        # if len(subfolders)>0:
        #     logger.info(f"Found {len(subfolders)} subfolders at path '{input_folder}'")
        # if interactive and len(subfolders)>0:
        #     questions = [
        #         inquirer.Checkbox('folders',
        #                           message="Confirm processing the following subfolders.. ",
        #                           choices=subfolders,
        #                           default=subfolders)
        #     ]
        #     answers = inquirer.prompt(questions)
        #     if answers == None:
        #         os.kill(os.getpid(), signal.SIGINT)
        #     subfolders = {k: v for k, v in subfolders.items() if k in answers['folders']}
        #     logger.info(f"selected {subfolders}")

        logger.debug(f"Found and checking {len(subfolders.values())} subfolders")
        logger.debug(list(subfolders.values()))

        if len(subfolders.values()) > 0:
            logger.debug(f"{list(subfolders.values())}")

        njobs = 0
        # print(reversed(sorted(subfolders.items(), key=lambda x: os.path.getmtime(x[1]))))
        # process the oldest data dumps first
        for name, path in sorted(subfolders.items(),
                                 key=lambda x: os.path.getmtime(x[1])):
            output_folder_exists = os.path.exists(f"{output_folder}/{name}")
            inputs = coconnect.tools.get_files(path, type='csv')
            filtered_rules = coconnect.tools.remove_missing_sources_from_rules(rules, inputs)

            if output_folder_exists:
                output_tables = [
                    os.path.splitext(os.path.basename(x))[0]
                    for x in coconnect.tools.get_files(f"{output_folder}/{name}", type='tsv')
                ]
                expected_outputs = list(filtered_rules['cdm'].keys())
                to_process = list(set(expected_outputs) - set(output_tables))
                if len(to_process) == 0:
                    continue
                filtered_rules = coconnect.tools.filter_rules_by_destination_tables(
                    filtered_rules, to_process)

            logger.debug("New data found!")
            logger.info(f"Creating a new task for processing {path}")

            if len(inputs) == 0:
                logger.critical("Subfolder contains no .csv files!")
                continue

            tables = list(filtered_rules['cdm'].keys())
            logger.debug(f'inputs: {inputs}')
            logger.info(f'cdm tables: {tables}')

            _data = copy.deepcopy(data)
            _data['input'] = inputs
            _data['output'] = f"{output_folder}/{name}"

            _execute(ctx,
                     data=_data,
                     rules=filtered_rules,
                     clean=clean if (i == 0 and njobs == 0) else False)
            njobs += 1

        if tdelta is None:
            break

        if njobs > 0 or i == 0:
            logger.info(f"Refreshing {input_folder} every {tdelta} to look for new subfolders....")
        if len(subfolders.values()) == 0:
            logger.warning("No subfolders for data dumps yet found...")

        i += 1
        time.sleep(tdelta.total_seconds())
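# Illustrative sketch (not part of the original module): the shape of the
# 'data' block that _process_dict_data() expects, based on the keys it reads
# above. The values shown are hypothetical; 'watch' is optional and its keys
# are passed straight to datetime.timedelta.
_EXAMPLE_DICT_DATA = {
    'input': '/data/incoming/',    # folder watched for data-dump subfolders
    'output': '/data/processed/',  # root folder for per-dump outputs
    'watch': {'minutes': 30},      # re-scan the input folder every 30 minutes
}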
def load_csv(_map, chunksize=None, dtype=str, nrows=None, lower_col_names=False,
             load_path="", rules=None, sep=',', na_values=['']):

    if isinstance(_map, list):
        _map = {os.path.basename(x): x for x in _map}

    logger = Logger("coconnect.tools.load_csv")

    if isinstance(_map, list) or isinstance(_map, tuple):
        _map = {x: x for x in _map}
    elif isinstance(_map, str):
        _map = {_map: _map}

    if rules is not None:
        logger.debug("rules .json file supplied")
        if not isinstance(rules, dict):
            rules = load_json(rules)
        source_map = get_mapped_fields_from_rules(rules)

        inputs_from_json = list(source_map.keys())
        inputs_from_cli = list(_map.keys())

        if len(inputs_from_cli) == 0:
            raise MissingInputFiles("You haven't loaded any input files!")

        logger.debug(f"{len(inputs_from_cli)} input files loaded")
        logger.debug(f"{inputs_from_cli}")

        missing_inputs = list(set(inputs_from_json) - set(inputs_from_cli))
        if len(missing_inputs) > 0:
            raise MissingInputFiles(
                f"Found the following files {missing_inputs} in the json file, "
                f"that are not in the loaded file list... {inputs_from_cli}")

        # reduce the mapping of inputs, if we don't need them all
        _map = {
            k: {'file': v, 'fields': source_map[k]}
            for k, v in _map.items() if k in source_map
        }

    if nrows is not None:
        chunksize = nrows if chunksize is None else chunksize

    retval = local.LocalDataCollection(chunksize=chunksize)

    for key, obj in _map.items():
        fields = None
        if isinstance(obj, str):
            fname = obj
        else:
            fname = obj['file']
            fields = obj['fields']

        df = pd.read_csv(load_path + fname,
                         chunksize=chunksize,
                         # iterator=True,
                         nrows=nrows,
                         sep=sep,
                         keep_default_na=False,
                         na_values=na_values,
                         dtype=dtype,
                         usecols=fields)
        df.attrs = {'original_file': load_path + fname}

        if isinstance(df, pd.DataFrame):
            # this should be removed
            if lower_col_names:
                df.columns = df.columns.str.lower()

        retval[key] = local.DataBrick(df, name=key)

    return retval
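# Illustrative sketch (not part of the original module): loading a set of csv
# files, restricted to the fields a rules file actually maps, in chunks of
# 100k rows. The file names and rules path are hypothetical.
def _example_load_inputs():
    inputs = ['demographics.csv', 'observations.csv']
    data = load_csv(inputs, rules='rules.json', chunksize=100000)
    # 'data' is a LocalDataCollection keyed by file basename, each entry
    # wrapping a DataBrick around the (possibly chunked) dataframe
    return data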
def load(ctx):
    logger = Logger("Load")
    logger.info("doing load only")
    ctx.obj['steps'] = ['load']
    ctx.invoke(execute)


def transform(ctx):
    logger = Logger("Transform")
    logger.info("doing transform only")
    ctx.obj['steps'] = ['transform']
    ctx.invoke(execute)


def extract(ctx):
    logger = Logger("Extract")
    logger.info("doing extract only")
    ctx.obj['steps'] = ['extract']
    ctx.invoke(execute)
def _extract(ctx, data, rules, bclink_helpers):
    if 'extract' not in ctx.obj['steps']:
        return {'data': data}

    logger = Logger("extract")
    logger.info("starting extraction processes")

    inputs = data['input']
    if isinstance(inputs, str):
        input_folder = inputs
        if not os.path.exists(input_folder):
            raise Exception(f"{input_folder} is not an existing path")
        if not os.path.isdir(input_folder):
            raise Exception(f"{input_folder} is not a dir!")
        inputs = coconnect.tools.get_files(input_folder)
        if len(inputs) == 0:
            raise Exception(f"No .csv files found in {input_folder}")

    do_pseudonymise = False
    _pseudonymise = {}
    if 'pseudonymise' in data:
        _pseudonymise = data['pseudonymise']
        do_pseudonymise = True
        if 'do' in _pseudonymise:
            do_pseudonymise = _pseudonymise['do']

    if do_pseudonymise:
        chunksize = 1000
        if 'chunksize' in _pseudonymise:
            chunksize = _pseudonymise['chunksize']

        output = "./pseudonymised_input_data/"
        if 'output' in _pseudonymise:
            output = _pseudonymise['output']

        if 'salt' not in _pseudonymise:
            raise Exception("To use pseudonymise a salt must be provided!")
        salt = _pseudonymise['salt']

        logger.info(f"Called do_pseudonymisation on input data {data} ")
        if not isinstance(rules, dict):
            rules = coconnect.tools.load_json(rules)
        person_id_map = coconnect.tools.get_person_ids(rules)

        input_map = {os.path.basename(x): x for x in inputs}

        inputs = []
        for table, person_id in person_id_map.items():
            if table not in input_map:
                logger.warning(f"Could not find table {table} in input_map")
                logger.warning(input_map)
                continue
            fin = input_map[table]
            print(fin)
            fout = ctx.invoke(pseudonymise,
                              input=fin,
                              output_folder=output,
                              chunksize=chunksize,
                              salt=salt,
                              person_id=person_id)
            inputs.append(fout)

        data.pop('pseudonymise')
        data['input'] = inputs

    _dir = data['output']
    f_global_ids = f"{_dir}/existing_global_ids.tsv"
    f_global_ids = bclink_helpers.get_global_ids(f_global_ids)

    indexer = bclink_helpers.get_indicies()

    return {
        'indexer': indexer,
        'data': data,
        'existing_global_ids': f_global_ids
    }
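# Illustrative sketch (not part of the original module): the optional
# 'pseudonymise' block that _extract() looks for in the data config, based on
# the keys it reads above. The values are hypothetical; 'salt' is required
# whenever pseudonymisation is switched on.
_EXAMPLE_PSEUDONYMISE_BLOCK = {
    'do': True,                               # toggle pseudonymisation on/off
    'salt': '00ff11ee',                       # salt used when hashing person ids
    'chunksize': 1000,                        # rows processed per chunk
    'output': './pseudonymised_input_data/',  # where masked csv files are written
}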
def _process_list_data(ctx):
    logger = Logger("_process_list_data")
    logger.info("ETL process has begun")

    interactive = ctx.obj['interactive']
    data = []
    clean = ctx.obj['clean']
    rules = ctx.obj['rules']
    bclink_helpers = ctx.obj['bclink_helpers']
    config_file = ctx.obj['conf']

    conf = _load_config(config_file)
    rules_file = conf['rules']
    rules_file_last_modified = os.path.getmtime(rules_file)

    bclink_helpers.print_summary()

    display_msg = True
    _clean = clean

    while True:
        re_execute = False
        try:
            conf = _load_config(config_file)
        except Exception as e:
            if not display_msg:
                logger.critical(e)
                logger.error(f"You've misconfigured your file '{config_file}'!! Please fix!")
            time.sleep(5)
            display_msg = True
            continue

        current_rules_file = conf['rules']
        new_rules_file = rules_file != current_rules_file
        change_in_rules = False

        if new_rules_file:
            # if there's a new rules file
            logger.info(f"Detected a new rules file.. old was '{rules_file}' "
                        f"and new is '{current_rules_file}'")
            rules_file = current_rules_file
            rules = coconnect.tools.load_json_delta(rules_file, rules)
            rules_file_last_modified = os.path.getmtime(rules_file)
            re_execute = True
        else:
            # otherwise check for changes in the existing file
            new_rules_file_last_modified = os.path.getmtime(current_rules_file)
            change_in_rules = rules_file_last_modified != new_rules_file_last_modified
            if change_in_rules:
                logger.info(f"Detected a change/update in the rules file '{rules_file}'")
                rules = coconnect.tools.load_json_delta(current_rules_file, rules)
                re_execute = True

        current_data = conf['data']
        if not data == current_data:
            logger.debug(f"old {data}")
            logger.debug(f"new {current_data}")
            new_data = [obj for obj in current_data if obj not in data]
            logger.info(f"New data found! {new_data}")
            re_execute = True
        else:
            new_data = data

        logger.debug(f"re-execute {re_execute}")
        if re_execute:
            current_data = copy.deepcopy(new_data)
            # loop over any new data
            for item in new_data:
                if isinstance(item['input'], list):
                    inputs = item['input']
                else:
                    input_folder = item['input']
                    if not os.path.isdir(input_folder):
                        raise Exception(f"{input_folder} is not a directory containing files!")
                    inputs = coconnect.tools.get_files(input_folder, type='csv')
                filtered_rules = coconnect.tools.remove_missing_sources_from_rules(rules, inputs)
                _execute(ctx, data=item, rules=filtered_rules, clean=_clean)
                _clean = False

            data += [x for x in current_data if x not in data]
            display_msg = True

            if new_rules_file or change_in_rules:
                # if there's a new rules file or a rules delta, pick up the full
                # rules again for the next loop, in case new data is inserted
                # --> we don't want to just apply the delta to the new data
                rules = coconnect.tools.load_json(current_rules_file)

        if not ctx.obj['listen_for_changes']:
            break

        if display_msg:
            logger.info(f"Finished!... Listening for changes to data in {config_file}")
            display_msg = False

        time.sleep(5)
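# Illustrative sketch (not part of the original module): the shape of the
# config file that _process_list_data() polls, based on the keys it reads
# above ('rules' plus a list of 'data' items, each with an 'input' and an
# 'output'). The paths are hypothetical.
_EXAMPLE_LIST_CONFIG = {
    'rules': '/etc/etl/rules.json',
    'data': [
        {'input': '/data/site_a/', 'output': '/data/site_a_processed/'},
        {'input': ['a.csv', 'b.csv'], 'output': '/data/site_b_processed/'},
    ],
}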