Пример #1
0
def create_summary_diagnosis(join_tables_output):
    """Run the summary discharge-diagnosis SQL after the previous node.

    Args:
        join_tables_output: Output of the previous pipeline node. ``None``
            is treated as "previous node did not complete" and nothing is
            executed.

    Returns:
        A dict with ``status`` and ``message`` keys on success (returned so
        that Kedro has an output and does not raise a data error), or
        ``None`` when ``join_tables_output`` is ``None``.

    Raises:
        SystemExit: With exit code 1 when building or running the SQL fails.
            A failure record is appended to ``cron_log_file`` first.
    """
    try:
        # Only proceed when the previous node completed successfully.
        if join_tables_output is None:
            logging.error(
                "Creating Summary Diagnosis Did Not Execute To Completion")
            return None

        sql_script = summary_discharge_diagnosis_query()
        inject_sql(sql_script, "create-summary-diagnosis")
        # Return a value so Kedro does not throw a data error.
        return dict(
            status='Success',
            message="Creating Summary Diagnosis Complete",
        )

    except Exception as e:
        logging.error("!!! An error occured creating summary diagnosis: ")
        # The context manager closes the handle even if the write itself fails.
        with open(cron_log_file, "a+") as cron_log:
            cron_log.write(
                "StartTime: {0}   Instance: {1}   Status: Failed   Stage: Creating Summary Diagnosis "
                .format(cron_time, mode))
        logging.error(formatError(e))
        sys.exit(1)
def create_summary_vitalsigns():
    """Build the vital-signs summary tables when vital-signs data exists.

    The four summary queries (overall, day 1, day 2, day 3) are run only if
    the ``derived.vitalsigns`` table exists and ``table_data_count`` reports
    more than zero rows; otherwise the function does nothing.

    Returns:
        None.

    Raises:
        SystemExit: With exit code 1 when any step fails. A failure record is
            appended to ``cron_log_file`` first.
    """
    try:
        vital_signs_count = 0
        if table_exists('derived', 'vitalsigns'):
            vital_signs_count = table_data_count('derived', 'vitalsigns')

        if vital_signs_count > 0:
            # Build every script before running any, so a failure while
            # building leaves all of the summary tables untouched.
            summary_vitals_script = summary_vital_signs_query()
            summary_vitals_day1_script = summary_day_one_vitals_query()
            summary_vitals_day2_script = summary_day_two_vitals_query()
            summary_vitals_day3_script = summary_day_three_vitals_query()

            # Run Summary Vital Signs Query
            inject_sql(summary_vitals_script, "create-summary-vital-signs")
            # Run Day1 Summary Vital Signs Query
            inject_sql(summary_vitals_day1_script,
                       "create-summary-day1-vital-signs")
            # Run Day2 Summary Vital Signs Query
            inject_sql(summary_vitals_day2_script,
                       "create-summary-day2-vital-signs")
            # Run Day3 Summary Vital Signs Query
            inject_sql(summary_vitals_day3_script,
                       "create-summary-day3-vital-signs")

    except Exception as e:
        logging.error("!!! An error occured creating Vital Signs Summaries: ")
        # The stage name previously said "Maternal Outcomes" (copy-paste),
        # which pointed failure triage at the wrong pipeline step.
        with open(cron_log_file, "a+") as cron_log:
            cron_log.write(
                "StartTime: {0}   Instance: {1}   Status: Failed   Stage: Creating Summary Vital Signs "
                .format(cron_time, mode))
        logging.error(formatError(e))
        sys.exit(1)
Пример #3
0
def create_summary_baseline(join_tables_output):
    """Run the summary baseline SQL when the baseline table is available.

    Args:
        join_tables_output: Output of the previous pipeline node. ``None``
            is treated as "previous node did not complete".

    Returns:
        * ``dict(status='Skipped')`` when ``derived.baseline`` does not exist.
        * ``None`` when the table exists but ``join_tables_output`` is
          ``None``.
        * A dict with ``status='Success'`` and a ``message`` once the summary
          query has run (returned so Kedro does not raise a data error).

    Raises:
        Exception: Any error raised while checking the table or running the
            query is re-raised after a failure record is appended to
            ``cron_log_file`` and the error detail is logged.
    """
    try:
        # Test if the table exists before executing the query.
        if not table_exists('derived', 'baseline'):
            return dict(status='Skipped')

        # Only proceed when the previous node completed successfully.
        if join_tables_output is None:
            logging.error(
                "Creating Summary Baseline Did Not Execute To Completion")
            return None

        sql_script = summary_baseline_query()
        inject_sql(sql_script, "create-summary-baseline")
        # Return a value so Kedro does not throw a data error.
        return dict(status='Success',
                    message="Creating Summary Baseline Complete")

    except Exception as e:
        logging.error("!!! An error occured creating summary baseline: ")
        with open(cron_log_file, "a+") as cron_log:
            cron_log.write(
                "StartTime: {0}   Instance: {1}   Status: Failed   Stage: Creating Summary Baseline "
                .format(cron_time, mode))
        # Log the detail *before* re-raising; previously the raise came first,
        # so this log call (and the sys.exit after it) could never run.
        logging.error(formatError(e))
        raise
Пример #4
0
def create_maternal_completeness_summary():
    """Build the maternal completeness summary when source data exists.

    The summary query runs only if the ``derived.maternity_completeness``
    table exists and ``table_data_count`` reports more than zero rows;
    otherwise the function does nothing.

    Returns:
        None.

    Raises:
        SystemExit: With exit code 1 when any step fails. A failure record is
            appended to ``cron_log_file`` first.
    """
    try:
        maternal_completeness_count = 0
        if table_exists('derived', 'maternity_completeness'):
            maternal_completeness_count = table_data_count(
                'derived', 'maternity_completeness')

        if maternal_completeness_count > 0:
            sql_script = summary_maternal_completeness_query()
            inject_sql(sql_script, "create-summary-maternal-completeness")

    except Exception as e:
        # The message previously said "Vital Signs Summaries" (copy-paste),
        # which misattributed failures of this step.
        logging.error(
            "!!! An error occured creating Maternal Completeness Summary: ")
        with open(cron_log_file, "a+") as cron_log:
            cron_log.write(
                "StartTime: {0}   Instance: {1}   Status: Failed   Stage: Creating Summary Maternal Completeness "
                .format(cron_time, mode))
        logging.error(formatError(e))
        sys.exit(1)
def grant_privileges(create_summary_counts_output):
    """Grant usage on the generated tables and record the run duration.

    This function reads ``start``, ``cron_log``, ``cron_time`` and ``mode``
    without defining them, so they are expected to exist at module level.

    Args:
        create_summary_counts_output: Output of the previous pipeline node.
            Any falsy value is treated as "previous node did not complete".

    Returns:
        A dict with ``status`` and ``message`` keys after the grant script
        has run, or ``None`` when the upstream output is falsy.

    Raises:
        SystemExit: With exit code 1 when anything fails; a failure record
            is written to ``cron_log`` before exiting.
    """
    try:
        # Bail out early when the previous node has not completed.
        if not create_summary_counts_output:
            logging.error(
                "Granting Priviledges Complete Did Not Execute To Completion")

            return None

        grant_script = grant_usage_query()
        inject_sql_procedure(grant_script, "grant-usage-on-tables")

        # Elapsed wall-clock time since the module-level ``start`` timestamp.
        elapsed = time.time() - start
        if elapsed > 0:
            whole_minutes = round(elapsed // 60)
            leftover_seconds = round(elapsed % 60)
            success_record = "StartTime: {0}   Instance: {1}   Status: Success  ExecutionTime: {2} mins {3} seconds \n".format(
                cron_time, mode, whole_minutes, leftover_seconds)
            cron_log.write(success_record)
            cron_log.close()

        # Hand a value back so Kedro does not raise a data error.
        return dict(
            status='Success',
            message="Granting Priviledges Complete",
        )

    except Exception as e:
        logging.error(
            "!!! An error occured Granting Priviledges: ")
        failure_record = "StartTime: {0}   Instance: {1}   Status: Failed Stage: Granting Privileges".format(
            cron_time, mode)
        cron_log.write(failure_record)
        cron_log.close()
        logging.error(formatError(e))
        sys.exit(1)
Пример #6
0
def inject_sql_procedure(sql_script, file_name):
    """Execute a SQL script on the module-level engine in AUTOCOMMIT mode.

    Args:
        sql_script: The SQL to execute; passed straight to ``execute``.
        file_name: Label used only in the success log message.

    Returns:
        None.

    Raises:
        SystemExit: With exit code 1 when connecting or executing fails.
    """
    try:
        # Use the connection as a context manager so it is released even
        # when execution fails; previously it was never closed.
        with engine.connect() as connection:
            connection.execution_options(
                isolation_level="AUTOCOMMIT").execute(sql_script)
    except Exception as e:
        logging.error('Something went wrong with the SQL file')
        logging.error(formatError(e))
        # Exit non-zero: a bare sys.exit() reports success (status 0) to the
        # calling shell/cron even though the script failed.
        sys.exit(1)
    logging.info('... {0} has successfully run'.format(file_name))
def manually_fix_admissions(tidy_data_output):
    """Run the manual admissions-fix SQL after the previous node.

    Args:
        tidy_data_output: Output of the previous pipeline node. ``None`` is
            treated as "previous node did not complete" and nothing is
            executed.

    Returns:
        A dict with ``status`` and ``message`` keys on success (returned so
        Kedro does not raise a data error), or ``None`` when
        ``tidy_data_output`` is ``None``.

    Raises:
        SystemExit: With exit code 1 when building or running the SQL fails.
            A failure record is appended to ``cron_log_file`` first.
    """
    try:
        # Only proceed when the previous node completed successfully.
        if tidy_data_output is None:
            logging.error(
                "Manual Fixing Of Admissions Did Not Execute To Completion")
            return None

        sql_script = manually_fix_admissions_query()
        inject_sql(sql_script, "manually-fix-admissions")
        # Return a value so Kedro does not throw a data error.
        return dict(status='Success',
                    message="Manual Fixing Of Admissions Complete")

    except Exception as e:
        logging.error("!!! An error occured manually fixing admissions: ")
        # The context manager closes the handle even if the write itself fails.
        with open(cron_log_file, "a+") as cron_log:
            cron_log.write(
                "StartTime: {0}   Instance: {1}   Status: Failed Stage: Manually Fixing Admissions "
                .format(cron_time, mode))
        logging.error(formatError(e))
        sys.exit(1)
def restructure_new_format(k, v, mcl):
    """Flatten one new-format entry into a ``(key, value, mcl)`` triple.

    Args:
        k: The entry key.
        v: The entry payload; ``v['values']`` must hold ``'label'`` and
            ``'value'`` sequences.
        mcl: List of multi-value column names. It is appended to in place
            and also returned.

    Returns:
        ``(k, v, mcl)`` where:

        * more than one label: ``v`` becomes ``v['values']`` and ``k`` is
          appended to ``mcl``;
        * exactly one usable label/value pair: ``k`` is stringified and
          stripped, ``v`` becomes ``{'label': ..., 'value': ...}`` built from
          the first elements, and ``k`` is appended to ``mcl`` if it ends
          with ``'Oth'`` or equals ``'AdmReason'``;
        * otherwise ``k`` and ``v`` are returned unchanged.

        If an exception occurs it is logged and ``None`` is returned
        implicitly.
    """
    try:
        payload = v['values']
        labels = payload['label']

        # Multi-value column: keep the whole label/value lists.
        if len(labels) > 1:
            mcl.append(k)
            return k, payload, mcl

        if len(labels) > 0 and len(payload['value']) > 0:
            k = str(k).strip()
            # Unpack the single label/value pair.
            v = {'label': labels[0], 'value': payload['value'][0]}
            # "Other" free-text columns are also registered for exploding.
            if k.endswith('Oth') or k == "AdmReason":
                mcl.append(k)

        return k, v, mcl
    except Exception as ex:
        logging.error(v)
        logging.error(formatError(ex))
Пример #9
0
def get_key_values(data_raw):
    """Restructure raw session rows into flat dicts of metadata and entries.

    Args:
        data_raw: A pandas DataFrame; each row must provide ``uid`` and
            ``entries`` and may provide ``appVersion``, ``facility``,
            ``started_at``, ``completed_at`` and the ``ingested_at*`` columns.

    Returns:
        A tuple ``(data_new, mcl)`` where ``data_new`` is a list with one
        dict per successfully processed row and ``mcl`` is the set of
        multi-value column names collected by the restructure helpers.
        Rows that raise during processing are logged and skipped.
    """
    # Entry keys that may carry a replacement uid when the row's uid is null.
    uid_fallback_keys = ('NeoTreeID', 'NUID_BC', 'NUID_M', 'NUID_S')
    mcl = []
    # Final list of uid, ingested_at and restructured key-value pairs.
    data_new = []
    for _, row in data_raw.iterrows():
        try:
            # Holds all restructured keys and values for this row.
            new_entry = {}

            app_version = row['appVersion'] if 'appVersion' in row else None
            if app_version is not None and app_version != '':
                # Keep only the digits, e.g. "4.5.5" -> 455.
                app_version = int(''.join(d for d in app_version
                                          if d.isdigit()))

            if 'facility' in row:
                new_entry['facility'] = row['facility']

            # Convert all uids to upper case.
            # NOTE(review): str(None).upper() yields 'NONE', so the
            # "uid is None" fallback below will not fire for rows whose uid
            # is null on input -- confirm the intended behaviour.
            new_entry['uid'] = str(row['uid']).upper()

            # Later assignments win: admission < discharge < ingested_at.
            if 'ingested_at_admission' in row:
                new_entry['ingested_at'] = row['ingested_at_admission']
            if 'ingested_at_discharge' in row:
                new_entry['ingested_at'] = row['ingested_at_discharge']
            if 'started_at' in row:
                new_entry['started_at'] = row['started_at']
            if 'completed_at' in row:
                new_entry['completed_at'] = row['completed_at']
            if 'ingested_at' in row:
                new_entry['ingested_at'] = row['ingested_at']

            # Records written by app versions above 454 (or whose version
            # starts with a digit >= 5) use the new entry format. The version
            # is fixed per row, so decide once rather than per entry.
            is_new_format = (app_version is not None and app_version != ''
                             and (app_version > 454
                                  or int(str(app_version)[:1]) >= 5))

            entries = row['entries']
            for c in entries:
                if is_new_format:
                    # In the new format ``c`` is the key into ``entries``.
                    k, v, mcl = restructure_new_format(c, entries[c], mcl)
                else:
                    k, v, mcl = restructure(c, mcl)

                # Set the uid for discharges that arrive with a null uid.
                if k in uid_fallback_keys and new_entry['uid'] is None:
                    # New-format values are plain dicts, which have no
                    # ``.value`` attribute; read the key instead.
                    new_entry['uid'] = (v['value'] if isinstance(v, dict)
                                        else v.value)
                new_entry[k] = v

            # Keep the row only once every entry has been restructured.
            data_new.append(new_entry)
        except Exception as ex:
            logging.error(formatError(ex))

    return data_new, set(mcl)
# Schedule the Kedro data pipeline as a recurring cron job for the current user.
# Environment name read from the loaded parameters; it is interpolated into the
# cron command below and into the cron log records written by functions in
# this file.
mode = params['env']
# Default number of hours between runs, used when 'cron_interval' is not set.
interval = 1
# Directory the cron command changes into: the current working directory.
cronDir = os.getcwd()
# The number of hours before the next execution of the job, as set in the
# database.ini file (overrides the default above).
if 'cron_interval' in params:
    interval = int(params['cron_interval'])

try:
    # Set The User To Run The Cron Job
    cron = CronTab(user=True)
    # Set The Command To Run The Data Pipeline script and activate the virtual environment
    # NOTE(review): os.getcwd() returns a path string, so the else branch
    # below is not expected to run -- confirm whether a configured directory
    # was intended here instead.
    if cronDir is not None:
        job = cron.new(
            command='cd {0} && env/bin/python -m kedro run --env={1}'.format(
                cronDir, mode))
    else:
        logging.info(
            'Please specify directory to find your kedro project in your database.ini file'
        )
        sys.exit()
    # Set The Time For The Cron Job
    # Use job.minute for quick testing
    job.every(interval).hours()
    # Write the Job To CronTab
    # NOTE(review): a new job is created on every execution of this script;
    # nothing visible here removes a previously registered job -- verify
    # that duplicates cannot accumulate.
    cron.write(user=True)

except Exception as e:
    # Any failure while building or writing the crontab is logged and the
    # process exits with status 1.
    logging.error("!!Cron Job Failed To Start Due To Errors: ")
    logging.error(formatError(e))
    sys.exit(1)
Пример #11
0
# Ordered rules for recoding free-text "other organism" values:
# (lower-case substrings to look for, Org1.label to set, Org1.value to set).
# Every matching rule is applied in order, so a later match overrides an
# earlier one.
_NEOLAB_OTHER_ORGANISM_RULES = (
    (("staphyloc", "coagulase negative", "stapgylococcus"),
     'Coagulase negative staphylococcus', 'CONS'),
    (("klesiella", "klebsiella", "kleb"),
     'Klebsiella sp.', 'KLS'),
    (("streptococcus pyogenes", "streptococcus pygenes",
      "streptococcus pyoges", "s payogenes", "strptococcus pyogenes",
      "b-haemolytic strep", "streptococcus agalactiae"),
     'Streptococcus pyogenes (Group A Beta haemolytic Strep)', 'StrepPy'),
    (("streptococcus species",),
     'Streptococcus sp.', 'StrepSp'),
    (("s.aureus",),
     'Staphylococcus aureus', 'SA'),
    (("citrobacter", "citribacter"),
     'Citrobacter sp.', 'Cit'),
    (("proteus", "ptoteus"),
     'Proteus sp.', 'Prot'),
    (("yeasts excluding candida albicans", "yeasts"),
     'Yeasts (excluding candida)', 'Yea'),
    (("enterobacter",),
     'Enterobacter sp.', 'Ent'),
    (("group d",),
     'Group D Strep', 'GDS'),
    (("non-haemolytic strep",),
     'Non haemolytic streptococcus', 'NHS'),
    # Previously this rule tested "non-haemolytic strep" (copy-paste), which
    # relabelled every non-haemolytic strep as a non-lactose fermenter.
    (("non-lactose ferment",),
     'Non-lactose fermenting coliform', 'NLFC'),
    (("pseudomonas",),
     'Pseudomonas aeruginosa', 'Pseud'),
    (("viridans",),
     'Viridans streptococcus', 'VirSt'),
)


def neolab_cleanup(df: pd.DataFrame, position):
    """Normalise the first-organism columns of one neolab row, in place.

    Nothing happens unless ``df`` has an ``Org1.label`` column. Otherwise:

    1. A label containing "coagulase negative staph" (case-insensitive) is
       replaced by the canonical 'Coagulase negative staphylococcus'.
    2. If ``Org1.value`` is 'Oth' and an ``OtherOrg1.value`` column exists,
       the lower-cased free text is matched against the known organism
       spellings and misspellings, and ``Org1.label`` / ``Org1.value`` are
       recoded for every rule that matches (the last match wins).
    3. Otherwise surrounding whitespace is stripped from ``Org1.label``.

    Args:
        df: DataFrame holding the neolab records; modified in place.
        position: Row label passed to ``df.at``.

    Returns:
        None.

    Raises:
        SystemExit: With exit code 1 if anything fails, e.g. ``Org1.value``
            is missing from ``df``.
    """
    try:
        if "Org1.label" not in df.columns:
            return

        current_label = str(df.at[position, "Org1.label"]).lower().strip()
        if "coagulase negative staph" in current_label:
            df.at[position,
                  "Org1.label"] = 'Coagulase negative staphylococcus'

        if (df.at[position, "Org1.value"] == 'Oth'
                and "OtherOrg1.value" in df.columns):
            other_text = str(df.at[position, "OtherOrg1.value"]).lower()
            for needles, label, code in _NEOLAB_OTHER_ORGANISM_RULES:
                # Plain substring membership; the old ``find(...)`` call for
                # "stapgylococcus" lacked ``> -1`` and so was truthy for
                # almost any text, recoding unrelated organisms as CONS.
                if any(needle in other_text for needle in needles):
                    df.at[position, "Org1.label"] = label
                    df.at[position, "Org1.value"] = code
        else:
            # Remove surrounding white space from the label.
            df.at[position,
                  "Org1.label"] = str(df.at[position, "Org1.label"]).strip()

    except Exception as ex:
        logging.error("Something Happened Cleaning Up Neolab")
        logging.error(formatError(ex))
        sys.exit(1)
def tidy_tables():

    # try:
    #     tuples = fix_duplicate_uid()
    #     duplicate_df = pd.DataFrame(tuples,columns=['id','uid','DateAdmission']);
    #     if not duplicate_df.empty:
    #         unique_uids = duplicate_df['uid'].copy().unique();
           
    #         alphabet = "0A1B2C3D4E5F6789"
    #         for ind in unique_uids:
    #            dup_df = duplicate_df[(duplicate_df['uid'] == str(ind))].copy().reset_index(drop=True)

    #            if not dup_df.empty and len(dup_df)>1:
    #                prev_record = None;
    #                for dup_index, dup in dup_df.iterrows():
    #                    if dup_index >=1 and dup['DateAdmission'] is not None:
    #                        adm_date = str(dup['DateAdmission'])
    #                        prev_adm_date = None
    #                        if prev_record is not None and prev_record['DateAdmission'] is not None:
    #                             prev_adm_date = str(prev_record['DateAdmission'])
    #                        if adm_date == prev_adm_date:
    #                            # RECORD IS A DUPLICATE AND WILL BE DELT WITH DURING DEDUPLICATION PROCESS ON NEXT RUN OF PIPELINE
    #                            pass;
                        
    #                        else:
    #                         #    #GENERATE NEW UID
    #                             uid = '78'.join((random.choice(alphabet)) for x in range(2))+'-'+str(random.randint(1000,9999));
    #                             update_uid('public','sessions',dup['id'],uid); 
    #                    prev_record = dup;    
    #     logging.info("...DONE WITH UPDATE......")
    #     sys.exit()
        
    # except Exception as ex:
    #     raise ex;

    # Read the raw admissions and discharge data into dataframes
    logging.info("... Fetching raw admission and discharge data")
    
    try:
        #Read Admisiions From The Kedro Catalog
        adm_raw = catalog.load('read_admissions');
        #Read Discharges From The Kedro Catalog
        dis_raw = catalog.load('read_discharges');
        #Read Maternal OutComes from Kedro Catalog
        mat_outcomes_raw = catalog.load('read_maternal_outcomes')
        #Read Vital Signs from Kedro Catalog
        vit_signs_raw = catalog.load('read_vital_signs')
        #Read Neo Lab Data from Kedro Catalog
        neolab_raw = catalog.load('read_neolab_data')
        #Read Baseline Data from Kedro Catalog
        baseline_raw = catalog.load('read_baseline_data')
        #Read Diagnoses Data from Kedro Catalog
        diagnoses_raw = catalog.load('read_diagnoses_data') 
        #Read Maternity Completeness Data from Kedro Catalog
        mat_completeness_raw = catalog.load('read_mat_completeness_data') 


    
    except Exception as e:
        logging.error("!!! An error occured fetching the data: ")
        logging.error(formatError(e))

    # Now let's fetch the list of properties recorded in that table
    logging.info("... Extracting keys")
    try:
        
        
        adm_new_entries, adm_mcl = get_key_values(adm_raw)
        dis_new_entries, dis_mcl = get_key_values(dis_raw)
        mat_outcomes_new_entries,mat_outcomes_mcl = get_key_values(mat_outcomes_raw)
        vit_signs_new_entries,vit_signs_mcl = get_key_values(vit_signs_raw)
        neolab_new_entries,noelab_mcl = get_key_values(neolab_raw)
        baseline_new_entries,baseline_mcl = get_key_values(baseline_raw)
        diagnoses_new_entries = get_diagnoses_key_values(diagnoses_raw)
        mat_completeness_new_entries,mat_completeness_mcl = get_key_values(mat_completeness_raw)

    except Exception as e:
        logging.error("!!! An error occured extracting keys: ")
        logging.error(formatError(e))

    # Create the dataframe (df) where each property is pulled out into its own colum
    logging.info(
        "... Creating normalized dataframes - one for admissions and one for discharges")
    try:

        adm_df = pd.json_normalize(adm_new_entries)
        if "uid" in adm_df:
            adm_df.set_index(['uid'])
        dis_df = pd.json_normalize(dis_new_entries)
        if "uid" in dis_df:
            dis_df.set_index(['uid'])
        mat_outcomes_df =pd.json_normalize(mat_outcomes_new_entries)
        if "uid" in mat_outcomes_df:
            mat_outcomes_df.set_index(['uid'])
        vit_signs_df = pd.json_normalize(vit_signs_new_entries)
        if "uid" in vit_signs_df:
            vit_signs_df.set_index(['uid'])
        neolab_df = pd.json_normalize(neolab_new_entries)
       
        baseline_df = pd.json_normalize(baseline_new_entries)
        if "uid" in baseline_df:
            baseline_df.set_index(['uid'])

        diagnoses_df = pd.json_normalize(diagnoses_new_entries)
        # if "uid" in diagnoses_df:
        #     diagnoses_df.set_index(['uid'])

        mat_completeness_df = pd.json_normalize(mat_completeness_new_entries)
        if "uid" in mat_completeness_df:
            mat_completeness_df.set_index(['uid'])

        # INITIALISE THE EPISODE COLUMN ON NEOAB DF SO THAT THE COLUMN GETS CREATED
        
 
        # ADD TIME SPENT TO ALL DFs
        if "started_at" in adm_df and 'completed_at' in adm_df :
            format_date_without_timezone(adm_df,'started_at'); 
            format_date_without_timezone(adm_df,'completed_at'); 
            adm_df['time_spent'] = (adm_df['completed_at'] - adm_df['started_at']).astype('timedelta64[m]')
        else:
            adm_df['time_spent'] = None
        
        if "started_at" in dis_df and 'completed_at' in dis_df :
            format_date_without_timezone(dis_df,'started_at'); 
            format_date_without_timezone(dis_df,'completed_at'); 
            dis_df['time_spent'] = (dis_df['completed_at'] -dis_df['started_at']).astype('timedelta64[m]')
        else:
            dis_df['time_spent'] = None
        
        if "started_at" in mat_outcomes_df and 'completed_at' in mat_outcomes_df :
            format_date_without_timezone(mat_outcomes_df,'started_at'); 
            format_date_without_timezone(mat_outcomes_df,'completed_at'); 
            mat_outcomes_df['time_spent'] = (mat_outcomes_df['completed_at'] - mat_outcomes_df['started_at']).astype('timedelta64[m]')
        else:
            mat_outcomes_df['time_spent'] = None

        if "started_at" in vit_signs_df and 'completed_at' in vit_signs_df :
            format_date_without_timezone(vit_signs_df,'started_at'); 
            format_date_without_timezone(vit_signs_df,'completed_at'); 
            vit_signs_df['time_spent'] = (vit_signs_df['completed_at']-vit_signs_df['started_at']).astype('timedelta64[m]')
        else:
            vit_signs_df['time_spent'] = None
        
        if "started_at" in neolab_df and 'completed_at' in neolab_df :
            format_date_without_timezone(neolab_df,'started_at'); 
            format_date_without_timezone(neolab_df,'completed_at'); 
            neolab_df['time_spent'] = (neolab_df['completed_at'] - neolab_df['started_at']).astype('timedelta64[m]')
        else:
            neolab_df['time_spent'] = None

        if "started_at" in baseline_df and 'completed_at' in baseline_df :
            format_date_without_timezone(baseline_df,'started_at'); 
            format_date_without_timezone(baseline_df,'completed_at'); 
            baseline_df['time_spent'] = (baseline_df['completed_at'] -baseline_df['started_at']).astype('timedelta64[m]')
        else:
            baseline_df['time_spent'] = None
        
        if ("DateBCR.value" in neolab_df and 'DateBCT.value' in neolab_df and 
            neolab_df['DateBCR.value'] is not None and neolab_df['DateBCT.value'] is not None):
            
            neolab_df['BCReturnTime'] = (pd.to_datetime(neolab_df['DateBCR.value'], format='%Y-%m-%dT%H:%M:%S',utc=True).astype('datetime64[ns]') -
                                        pd.to_datetime(neolab_df['DateBCT.value'], format='%Y-%m-%dT%H:%M:%S',utc=True).astype('datetime64[ns]')).astype('timedelta64[h]')
        else:
            neolab_df['BCReturnTime'] = None

        baseline_df['LengthOfStay.value'] = None
        baseline_df['LengthOfStay.label'] = None
        baseline_df['LengthOfLife.value'] = None
        baseline_df['LengthOfLife.label'] = None
        
        #Length of Life and Length of Stay on Baseline Data
        date_format = "%Y-%m-%d"
        for index, row in baseline_df.iterrows():

            baseline_df['LengthOfStay.label'].iloc[index] ="Length of Stay"
            if (is_date(str(row['DateTimeDischarge.value']))
                and is_date(str(row['DateTimeAdmission.value']))):
                DateTimeDischarge = dt.strptime(str(str(row['DateTimeDischarge.value']))[:10].strip().replace('T',''),date_format)
                DateTimeAdmission = dt.strptime(str(str(row['DateTimeAdmission.value']))[:10].strip().replace('T',''),date_format)
                delta_los = DateTimeDischarge-DateTimeAdmission
                baseline_df['LengthOfStay.value'].iloc[index] = delta_los.days

            else:
                baseline_df['LengthOfStay.value'].iloc[index] = None
        
            baseline_df['LengthOfLife.label'].iloc[index] ="Length of Life"
            if 'DateTimeDeath.value' in row and (is_date(str(row['DateTimeDeath.value']))
                and is_date(str(row['DateTimeAdmission.value']))): 
                DateTimeDeath = dt.strptime(str(str(row['DateTimeDeath.value']))[:10].strip().replace('T',''), date_format)
                DateTimeAdmission = dt.strptime(str(str(row['DateTimeAdmission.value']))[:10].strip().replace('T',''), date_format)
                delta_lol = DateTimeDeath - DateTimeAdmission
                baseline_df['LengthOfLife.value'].iloc[index] = delta_lol.days;
            else:
                baseline_df['LengthOfLife.value'].iloc[index] = None;



        # watch out for time zone (tz) issues if you change code (ref: https://github.com/pandas-dev/pandas/issues/25571)
        set_key_to_none(adm_df,'DateHIVtest.value')
        set_key_to_none(adm_df,'DateHIVtest.label')
        set_key_to_none(adm_df,'HIVtestResult.value')
        set_key_to_none(adm_df,'HIVtestResult.label')
        set_key_to_none(adm_df,'ANVDRLDate.value')
        set_key_to_none(adm_df,'ANVDRLDate.label')
        set_key_to_none(adm_df,'HAART.value')
        set_key_to_none(adm_df,'HAART.label')
        set_key_to_none(adm_df,'LengthHAART.value')
        set_key_to_none(adm_df,'LengthHAART.label')
        set_key_to_none(adm_df,'NVPgiven.value')
        set_key_to_none(adm_df,'NVPgiven.label')
        set_key_to_none(adm_df,'DateTimeAdmission.value')
        set_key_to_none(adm_df,'DateTimeAdmission.label')
        set_key_to_none(adm_df,'ROMlength.label')
        set_key_to_none(adm_df,'ROMlength.value')
        set_key_to_none(adm_df,'ROMLength.label')
        set_key_to_none(adm_df,'ROMLength.value')

        #Format Dates Admissions Tables
        format_date(adm_df,'DateTimeAdmission.value')
        format_date(adm_df,'EndScriptDatetime.value')
        format_date(adm_df,'DateHIVtest.value')
        format_date(adm_df,'ANVDRLDate.value')

        #Format Dates Discharge Table
        format_date(dis_df,'DateAdmissionDC.value')  
        format_date(dis_df,'DateDischVitals.value')
        format_date(dis_df,'DateDischWeight.value')
        format_date(dis_df,'DateTimeDischarge.value')
        format_date(dis_df,'EndScriptDatetime.value')
        format_date(dis_df,'DateWeaned.value')
        format_date(dis_df,'DateTimeDeath.value')
        format_date(dis_df,'DateAdmission.value')
        format_date(dis_df,'BirthDateDis.value')
        format_date(dis_df,'DateHIVtest.value')
        format_date(dis_df,'DateVDRLSameHIV.value')

        # Maternal Outcomes
        set_key_to_none(mat_outcomes_df,'TypeBirth.label')
        set_key_to_none(mat_outcomes_df,'Presentation.label')
        set_key_to_none(mat_outcomes_df,'BabyNursery.label')
        set_key_to_none(mat_outcomes_df,'Reason.label')
        set_key_to_none(mat_outcomes_df,'ReasonOther.label')
        set_key_to_none(mat_outcomes_df,'CryBirth.label')
        set_key_to_none(mat_outcomes_df,'Apgar1.value')
        set_key_to_none(mat_outcomes_df,'Apgar5.value') 
        set_key_to_none(mat_outcomes_df,'Apgar10.value')
        set_key_to_none(mat_outcomes_df,'PregConditions.label')
        set_key_to_none(mat_outcomes_df,'BirthDateDis.value')

        # Baselines Tables
        format_date(baseline_df,'DateTimeAdmission.value')
        format_date(baseline_df,'DateTimeDischarge.value')
        format_date(baseline_df,'DateTimeDeath.value')

        set_key_to_none(baseline_df,'AWGroup.value')
        set_key_to_none(baseline_df,'BWGroup.value') 
        #Vital Signs Table
        format_date(vit_signs_df,'D1Date.value')
        format_date(vit_signs_df,'TimeTemp1.value')
        format_date(vit_signs_df,'TimeTemp2.value')
        format_date(vit_signs_df,'EndScriptDatetime.value')
        
        # CREATE AGE CATEGORIES

        if not adm_df.empty:
           
            for position,admission in adm_df.iterrows():

                age_list =[]
                period = 0

                if 'Age.value' in admission and str(admission['Age.value']).isdigit():
                    period = admission['Age.value']
                else:
                    if 'Age.value' in admission and str(admission['Age.value']) != 'nan':
                    # Get The Value which is a string e.g  3 days, 4 hours
                        age_list = str(admission['Age.value']).split(",")
                    else:
                        if 'AgeB.value' in admission and str(admission['AgeB.value']) != 'nan':
                            age_list = str(admission['AgeB.value']).split(",")
                    # Initialise Hours
                    hours = 0
                    
                    # If size of List is 1 it either means its days only or hours only
                
                    if len(age_list) == 1:
                        age = age_list[0]
                        # Check if hours or Days
                        if 'hour' in age:
                        
                            hours= [int(s) for s in age.replace("-","").split() if s.isdigit()]
                            # Check if value contains figures
                            if len(hours) >0:
                                period = hours[0]
                            else:
                                if "an" in age:
                                    # IF AN HOUR 
                                    period = 1

                        elif 'day' in age:
                            hours = [int(s) for s in age.replace("-","").split() if s.isdigit()]
                            if len(hours) >0:
                                period = hours[0] * 24
                        elif 'second' in age:
                            # FEW SECONDS CAN BE ROUNDED OFF 1 HOUR
                            period = 1
                        elif 'minute' in age:
                            # MINUTES CAN BE ROUNDED OFF 1 HOUR
                            period = 1
                            pass;     
                    # Contains Both Hours and Days        
                    elif len(age_list) == 2:
                        age_days = age_list[0]
                        age_hours = age_list[1]
                        if 'day' in age_days and 'hour' in age_hours:
                            number_hours_days= [int(s) for s in age_days.split() if s.isdigit()]
                            number_hours = [int(s) for s in age_hours.split() if s.isdigit()]
                            if (len(number_hours) >0 and len(number_hours_days)>0):
                                period = (number_hours_days[0]) * 24 +(number_hours[0])

                    else:
                        pass;  

                if period>0:
                    adm_df.loc[position,'Age.value'] = period
                    if period< 2:
                        adm_df.loc[position,'AgeCategory'] = 'Fresh Newborn (< 2 hours old)'
                    elif period>2 and period<=23:
                        adm_df.loc[position,'AgeCategory'] = 'Newborn (2 - 23 hrs old)'
                    elif period>23 and period<=47:
                        adm_df.loc[position,'AgeCategory']= 'Newborn (1 day - 1 day 23 hrs old)'
                    elif period>47 and period<= 71:
                        adm_df.loc[position,'AgeCategory']= 'Infant (2 days - 2 days 23 hrs old)' 
                    else:
                        adm_df.loc[position,'AgeCategory'] = 'Infant (> 3 days old)' 
                ########################## UPDATE ADMISSION SCRIPT WITH NEW KEYS ########################
                if  "BirthWeight.value" in admission and str(admission["BirthWeight.value"]) != 'nan' and admission["BirthWeight.value"] is not None:
                    pass;
                else:
                    key_change(adm_df,admission,position,'BW.value','BirthWeight.value')
                if "Convulsions.value" in admission and str(admission["Convulsions.value"]) != 'nan' and admission["Convulsions.value"] is not None:
                    pass;
                else:
                    key_change(adm_df,admission,position,'Conv.value','Convulsions.value')
                if ('SymptomReviewNeurology.value' in admission and str(admission["SymptomReviewNeurology.value"]) != 'nan' 
                    and admission["SymptomReviewNeurology.value"] is not None):
                    pass;
                else:
                    key_change(adm_df,admission,position,'SRNeuroOther.value','SymptomReviewNeurology.value')
                if 'LowBirthWeight.value' in admission and str(admission["LowBirthWeight.value"]) !='nan' and admission["LowBirthWeight.value"] is not None:
                    pass;
                else:
                    key_change(adm_df,admission,position,'LBW.value','LowBirthWeight.value')
                if 'AdmissionWeight.value' in admission and str(admission["AdmissionWeight.value"]) != 'nan' and admission["AdmissionWeight.value"] is not None :
                    pass;
                else:
                    key_change(adm_df,admission,position,'AW.value','AdmissionWeight.value')
                #Fix differences in Column data type definition
                if 'BSUnitmg.value' in admission and str(admission["BSUnitmg.value"]) !='nan' and admission["BSUnitmg.value"] is not None:
                    pass;
                else:
                    key_change(adm_df,admission,position,'BSmgdL.value','BSUnitmg.value')
                if 'BSmmol.value' in admission and str(admission["BSmmol.value"])!='nan' and admission["BSmmol.value"] is not None:
                    
                    key_change(adm_df,admission,position,'BSmmol.value','BloodSugarmmol.value');

                if 'BSmg.value' in admission and str(admission["BSmg.value"])!='nan' and admission["BSmg.value"] is not None:
                    key_change(adm_df,admission,position,'BSmg.value','BloodSugarmg.value')
                    
                if  "ROMlength.value" in admission and str(admission["ROMlength.value"]) != 'nan' and admission["ROMlength.value"] is not None:
                    key_change(adm_df,admission,position,'ROMlength.value','ROMLength.value');
                
                if  "ROMlength.label" in admission and str(admission["ROMlength.label"]) != 'nan' and admission["ROMlength.label"] is not None:
                    key_change(adm_df,admission,position,'ROMlength.label','ROMLength.label');

            if "Age.value" in adm_df:
                adm_df['Age.value'] = pd.to_numeric(adm_df['Age.value'], errors='coerce')
            if 'AdmissionWeight.value' in adm_df:
                 adm_df['AdmissionWeight.value'] = pd.to_numeric(adm_df['AdmissionWeight.value'], errors='coerce')
            if 'BirthWeight.value' in adm_df:
                adm_df['BirthWeight.value'] = pd.to_numeric(adm_df['BirthWeight.value'], errors='coerce')

        if not dis_df.empty:
            for position,discharge in dis_df.iterrows():
                if 'BirthWeight.value' in discharge and str(discharge['BirthWeight.value'])!='nan' and discharge['BirthWeight.value'] is not None:
                    pass;
                else:
                    key_change(dis_df,discharge,position,'BWTDis.value','BirthWeight.value')
                if 'DOBTOB.value' in discharge and str(discharge['DOBTOB.value'])!='nan' and discharge['DOBTOB.value'] is not None:
                    pass;
                else:
                    key_change(dis_df,discharge,position,'BirthDateDis.value','DOBTOB.value')
                if 'ModeDelivery.value' in discharge and str(discharge['ModeDelivery.value'])!='nan' and discharge['ModeDelivery.value'] is not None:
                    pass;
                else:
                    key_change(dis_df,discharge,position,'Delivery.value','ModeDelivery.value')
                if 'Temperature.value' in discharge and str(discharge['Temperature.value'])!='nan' and discharge['Temperature.value'] is not None:
                    pass;
                else: 
                    key_change(dis_df,discharge,position,'NNUAdmTemp.value','Temperature.value') 
                if  'Gestation.value' in discharge and str(discharge['Gestation.value'])!='nan' and discharge['Gestation.value'] is not None:
                    pass;
                else:
                    key_change(dis_df,discharge,position,'GestBirth.value','Gestation.value')
                if 'AdmReason.value' in discharge and str(discharge['AdmReason.value'])!='nan' and discharge['AdmReason.value'] is not None:
                    pass;
                else:
                    key_change(dis_df,discharge,position,'PresComp.value','AdmReason.value')
       
        # Join Maternal Completeness and Maternal Outcomes /A Case For Malawi
        if not mat_outcomes_df.empty and not mat_completeness_df.empty: 
               latest_mat_outcomes_df = mat_outcomes_df[pd.to_datetime(mat_outcomes_df['DateAdmission.value']) >='2021-10-01']
               previous_mat_outcomes_df = mat_completeness_df[pd.to_datetime(mat_completeness_df['DateAdmission.value']) <='2021-09-30']
               mat_completeness_df = pd.concat([latest_mat_outcomes_df, previous_mat_outcomes_df], ignore_index=True)

        # Create Episode Column for Neolab Data
        if not neolab_df.empty:
            # Initialise the column
            neolab_df['episode'] = 0
            # Initialise BCR TYPE
            neolab_df['BCType']= None
            neolab_df['DateBCT.value']=pd.to_datetime(neolab_df['DateBCT.value'])
       
            for index,row in neolab_df.iterrows():
                # Data Cleaning
                neolab_cleanup(neolab_df,index)  
                #Set Episodes
                control_df = neolab_df[neolab_df['uid'] == row['uid']].copy().sort_values(by=['DateBCT.value']).reset_index(drop=True)
                if not control_df.empty:
                    episode =1;
                    if neolab_df.at[index,'episode'] ==0:
                        for innerIndex, innerRow in control_df.iterrows() :
                            
                            if innerIndex == 0:
                            #Episode Remains 1 
                                pass;
                            else:
                                control_df_date_bct = control_df.at[innerIndex,'DateBCT.value']
                                prev_control_df_date_bct = control_df.at[innerIndex-1,'DateBCT.value']
                                if len(str(control_df_date_bct)) >9 and len(str(prev_control_df_date_bct)) > 9 :
                                    if str(control_df_date_bct)[:10] == str(prev_control_df_date_bct)[:10]:
                                        # Episode Remains the same as previous Episode
                                        pass;
                                    
                                    else:
                                        episode = episode+1;
                            # Set The Episode Value For All Related Episodes in the Main DF 
                            control_df.loc[innerIndex,'episode']= episode;
                            neolab_df.loc[(neolab_df['uid']
                                ==control_df.at[innerIndex,'uid']) & (neolab_df['DateBCT.value']
                                ==control_df.at[innerIndex,'DateBCT.value']) & (neolab_df['DateBCR.value']
                                == control_df.at[innerIndex,'DateBCR.value']),'episode'] = episode                              

                        #Add BCR TYPE TO CONTROL DF
                        # Loop is necessary since BCType is dependant on the set episodes

                        for control_index, bct_row in control_df.iterrows() :  
                            bct_type_df = control_df[(control_df['uid'] == bct_row['uid']) & (control_df['episode'] == bct_row['episode'])].copy().sort_values(by=['DateBCR.value']).reset_index(drop=True)
                            
                            if not bct_type_df.empty:
                                preliminary_index= 1;
                                for bct_index, row in bct_type_df.iterrows():
                                    bct_value = None;
                                    bct_values_from_df = neolab_df.loc[(neolab_df['uid']
                                            ==bct_type_df.at[bct_index,'uid']) & (neolab_df['DateBCT.value']
                                            ==bct_type_df.at[bct_index,'DateBCT.value']) & (neolab_df['DateBCR.value']
                                            == bct_type_df.at[bct_index,'DateBCR.value'])]['BCType'].values
                                    if len(bct_values_from_df) >0:
                                        bct_value = bct_values_from_df[0]

                                    if bct_value is None:
                                        if (bct_type_df.at[bct_index,'BCResult.value'] != 'Pos' and bct_type_df.at[bct_index,'BCResult.value'] != 'Neg'
                                            and bct_type_df.at[bct_index,'BCResult.value'] != 'PC'):
                                            bct_type_df.loc[bct_index,'BCType'] = "PRELIMINARY-"+str(preliminary_index);
                                            preliminary_index=preliminary_index+1
        
                                        else:
                                            if bct_index == len(bct_type_df)-1:
                                                bct_type_df.loc[bct_index,'BCType'] = "FINAL";
                                            else:
                                                bct_type_df.loc[bct_index,'BCType'] = "PRELIMINARY-"+str(preliminary_index);
                                                preliminary_index = preliminary_index+1;

                                    # Set The BCType For All Related Records in the Main DF
                                    if bct_type_df.at[bct_index,'BCType'] is not None:
                                        neolab_df.loc[(neolab_df['uid']
                                            ==bct_type_df.at[bct_index,'uid']) & (neolab_df['DateBCT.value']
                                            ==bct_type_df.at[bct_index,'DateBCT.value']) & (neolab_df['DateBCR.value']
                                            == bct_type_df.at[bct_index,'DateBCR.value']),'BCType'] = bct_type_df.at[bct_index,'BCType']

        # Make changes to admissions and baseline data to match fields in power bi                                    
        if not adm_df.empty:
            adm_df = create_columns(adm_df)
        if not baseline_df.empty:
            baseline_df = create_columns(baseline_df)

    except Exception as e:
        logging.error(
            "!!! An error occured normalized dataframes/changing data types: ")
        logging.error(formatError(e))

    # Now write the cleaned up admission and discharge tables back to the database
    logging.info(
        "... Writing the tidied admission and discharge back to the database")
    try:
       
    
        #Save Derived Admissions To The DataBase Using Kedro
        if not adm_df.empty:
            adm_df.columns = adm_df.columns.str.replace(r"[()-]", "_")
            catalog.save('create_derived_admissions',adm_df)
        #Save Derived Discharges To The DataBase Using Kedro
        if not dis_df.empty:
            catalog.save('create_derived_discharges',dis_df)
        #Save Derived Maternal Outcomes To The DataBase Using Kedro
        if not mat_outcomes_df.empty:
            catalog.save('create_derived_maternal_outcomes',mat_outcomes_df)
         #Save Derived Vital Signs To The DataBase Using Kedro
        if not vit_signs_df.empty:
            catalog.save('create_derived_vital_signs',vit_signs_df)
        #Save Derived NeoLab To The DataBase Using Kedro
        if not neolab_df.empty:
            #SET INDEX 
            if "uid" in neolab_df:
                neolab_df.set_index(['uid'])
                if ("episode" in neolab_df):
                    neolab_df.sort_values(by=['uid','episode'])  
            catalog.save('create_derived_neolab',neolab_df)
        #Save Derived Baseline To The DataBase Using Kedro
        if not baseline_df.empty:
            catalog.save('create_derived_baselines',baseline_df)

         #Save Derived Diagnoses To The DataBase Using Kedro
        if not diagnoses_df.empty:
            catalog.save('create_derived_diagnoses',diagnoses_df)

         #Save Derived Maternity Completeness To The DataBase Using Kedro
        if not mat_completeness_df.empty:
            catalog.save('create_derived_maternity_completeness',mat_completeness_df)


    except Exception as e:
        logging.error(
            "!!! An error occured writing admissions and discharge output back to the database: ")
        logging.error(formatError(e))

    logging.info("... Creating MCL count tables")
    try:
        if not adm_df.empty:
            explode_column(adm_df, adm_mcl,"")
        if not dis_df.empty:
            explode_column(dis_df, dis_mcl,"disc_")
        if not mat_outcomes_df.empty:
            explode_column(mat_outcomes_df,mat_outcomes_mcl,"mat_")
        if not vit_signs_df.empty:
            explode_column(vit_signs_df,vit_signs_mcl,"vit_")

        if not baseline_df.empty:
            explode_column(baseline_df,baseline_mcl,"bsl_")
        
        if not mat_completeness_df.empty:
            explode_column(mat_completeness_df,mat_completeness_mcl,"matcomp_")
       
    except Exception as e:
        logging.error("!!! An error occured exploding MCL  columns: ")
        logging.error(formatError(e))