def mark_duplicates(cls, df_spark, duplicate_dict):
    """
    Mark each row whose primary key occurs in more than one row.

    For every table named in ``duplicate_dict`` a ``flag_duplication``
    column is added to the matching dataframe (1 when the key combination
    is shared by several rows, 0 otherwise) and one summary line
    ``mark_duplicate - <table> - <rows> - <flagged rows> - <percent>``
    is written to the log.

    @input
        df_spark -- dictionary of spark dataframes, keyed by table name
        duplicate_dict -- dictionary of primary keys (list of column
                          names) for each table
    @output
        The ``df_spark`` dictionary; the entries listed in
        ``duplicate_dict`` are replaced by their flagged dataframes
    """
    for filename, key_cols in duplicate_dict.items():
        # A temporary "count" column holds the size of each key group and is
        # dropped again once the flag has been derived from it.
        # NOTE(review): assuming SQL is pyspark.sql.functions, count() skips
        # rows whose first key column is null -- confirm that is intended.
        key_window = Window.partitionBy(key_cols)
        flagged = (
            df_spark[filename]
            .withColumn("count", SQL.count(key_cols[0]).over(key_window))
            .withColumn(
                "flag_duplication",
                SQL.when(SQL.col("count") > 1, 1).otherwise(0))
            .drop("count")
        )
        df_spark[filename] = flagged

        dup_total = flagged.count()
        dup_count = flagged.where(SQL.col("flag_duplication") == 1).count()
        # An empty table has no duplicates; guard the division so it does
        # not raise ZeroDivisionError.
        if dup_total:
            dup_percent = round(float(dup_count) / dup_total * 100, 2)
        else:
            dup_percent = 0.0

        message = "mark_duplicate - {} - {} - {} - {}".format(
            filename, dup_total, dup_count, dup_percent)
        log.info(message)
    return df_spark
def qc_lastdate(cls, dfs, last_dt):
    """
    Check for date values that are greater than the last possible date for
    a patient, as described by the ``last_dt`` specs.

    Keys read from ``last_dt``:
        dt_type      -- 'float' casts dates to float64, anything else to
                        datetime64[ns]
        table        -- name of the table in ``dfs`` holding the last date
        id_column    -- column used to join every table to the last date
        dt_column    -- column holding the last date
        dt_value     -- substring identifying date columns by name
        ignore_value -- when truthy, date columns containing this substring
                        are skipped
        ignorecols   -- explicit column names to skip, or None

    Tables with at least one date column left to check are replaced in
    ``dfs`` by a version joined (outer) to the last-date table and carrying
    a ``last_dt_flag`` column (1 for rows selected by the inequality query,
    0 otherwise). When rows are selected they are also written to
    ``qc_lastdt_<table>.csv`` under ``cls.output_loc``.

    Returns the (mutated) ``dfs`` dictionary.
    """
    print("\nCheck dates that come after last date values...")

    dt_type = 'float64' if last_dt['dt_type'] == 'float' else 'datetime64[ns]'

    reference_tbl = dfs[last_dt['table']]
    id_col = last_dt['id_column']
    last_col = last_dt['dt_column']
    lastdt_df = reference_tbl[[id_col, last_col]].drop_duplicates()
    lastdt_df[last_col] = lastdt_df[last_col].astype(dt_type)

    for name in dfs:
        work = dfs[name].copy()

        # Date columns are recognised by name only.
        skip_token = last_dt["ignore_value"]
        if skip_token:
            cols = [c for c in work.columns
                    if last_dt["dt_value"] in c and skip_token not in c]
        else:
            cols = [c for c in work.columns if last_dt["dt_value"] in c]

        # The last-date column itself is never compared; it is dropped here
        # and comes back through the merge with lastdt_df below.
        if last_col in cols:
            cols.remove(last_col)
            work = work.drop(columns=last_col)

        if last_dt['ignorecols'] is not None:
            for skipped in last_dt['ignorecols']:
                if skipped in cols:
                    cols.remove(skipped)
                    work = work.drop(columns=skipped)

        if not cols:
            continue

        # 'UK' inside a date value is replaced by '15' before casting.
        work[cols] = work[cols].replace('UK', '15', regex=True)
        for col in cols:
            work[col] = work[col].astype(dt_type)

        inter_df = work.merge(lastdt_df, how='outer', on=id_col)
        # NOTE(review): inequality_dt_query is defined elsewhere; presumably
        # it builds "last_col < col | ..." over all date columns -- confirm.
        querystring = inequality_dt_query(last_col, cols, '<', '|')
        df_last_dt_obs = (
            inter_df.query(querystring, engine='python')
            .assign(last_dt_flag=1)
        )

        # Joining on every column attaches the flag to the matching rows.
        flagged = inter_df.merge(
            df_last_dt_obs, how='outer', on=list(inter_df.columns))
        flagged["last_dt_flag"] = flagged["last_dt_flag"].fillna(0)
        dfs[name] = flagged.drop(columns=[last_col])

        if df_last_dt_obs.shape[0] == 0:
            continue

        print("\t Table: ", name, ">>>FAIL")
        csv_name = 'qc_lastdt_' + name + '.csv'
        report_path = os.path.join(cls.output_loc, csv_name)
        file_to_write = pdToStringIO(df_last_dt_obs)
        DataIO.write(
            DataIO.RepoType.LOCAL,
            'file://' + report_path,
            content=file_to_write,
            query=None,
            username=None,
            password=None,
            write_type=DataIO.WriteType.NEW)
        message = "qc_lastdate - {} - {}".format(name, report_path)
        log.info(message)
        print("report location for {}: {}".format(name, report_path))

    print("...Done")
    return dfs
def qc_dup(cls, dfs, dup_dict):
    """
    Check duplications for specific columns.

    Duplication is defined as multiple observations on the same day for the
    same patient. For every table in ``dup_dict`` that has duplicated keys,
    the table in ``dfs`` is replaced by a version carrying ``dup_count`` and
    ``dup_flag`` columns (0 for rows that are not duplicated), the duplicated
    observations are written to ``qc_dup_<table>.csv`` under
    ``cls.output_loc`` and a summary line is logged. Tables without
    duplicates are left untouched.

    :param dict dfs: A dictionary that stores dataframe as the value and
        name as the key.
    :param dict dup_dict: Specify which tables and columns to check. Table
        name is as key and the list of duplicated keys as values.
    :return: the ``dfs`` dictionary (modified in place); prints a summary
        and writes csv output

    :example:
        dup_dict = {
            'Stem_Cell_Transplant': ['PATCOD','SCTYN','SCTDAT'],
            'Serum_M-Protein': ['PATCOD','LBDAT']}
    """
    print("\nCheck duplicated values...")
    for name, cols in dup_dict.items():
        print("\t Table: ", name)
        df = dfs[name]
        df_dup_cts = return_multiple_records(df, cols)

        # The rule object is still invoked for whatever reporting it does
        # itself; its returned event is not used here.
        rule_obj = UniquePKRule(df, cols)
        rule_obj.result_message(rule_obj.validation_result)
        rule_obj.result_to_event("QC_Engine", "001")

        if df_dup_cts.shape[0] == 0:
            continue

        print("\t >>>FAIL")
        df_dup_obs = df_dup_cts.merge(df, how='inner', on=cols)
        # Only the two bookkeeping columns are zero-filled. Filling the
        # whole frame would overwrite genuinely missing values in the data
        # with 0 before the table is stored back into ``dfs``.
        final_df = (
            df_dup_cts.rename(columns={"counts": "dup_count"})
            .assign(dup_flag=1)
            .merge(df, how='outer', on=cols)
            .fillna({"dup_count": 0, "dup_flag": 0})
        )
        dup_total = final_df['dup_flag'].sum()
        total = len(final_df.index)
        dup_percent = round(float(dup_total) / total * 100, 2)
        dfs[name] = final_df

        csv_name = 'qc_dup_' + name + '.csv'
        report_path = os.path.join(cls.output_loc, csv_name)
        file_to_write = pdToStringIO(df_dup_obs)
        DataIO.write(
            DataIO.RepoType.LOCAL,
            'file://'+report_path,
            content=file_to_write,
            query=None,
            username=None,
            password=None,
            write_type=DataIO.WriteType.NEW)
        message = "mark_duplicate - {} - {} - {} - {} - {}".format(
            name, total, dup_total, dup_percent, report_path)
        log.info(message)
        print("report location for {}: {}".format(name, report_path))
    print("...Done")
    return dfs
def qc_string_in_num(cls, dfs, string_tables, outdir=None):
    """
    Check if there are string values in numeric fields.

    Every column whose name contains ``string_tables["value"]`` is treated
    as a numeric field. Its values are cast to ``str`` and passed to
    ``is_not_positive_digit``. Tables that have such columns are replaced in
    ``dfs`` by a version with a ``value_flag`` column (1 when any numeric
    field of the row was reported, 0 otherwise). The distinct reported
    values per table and column, with NAs left out, are written to
    ``qc_value.csv`` under ``cls.output_loc``.

    NOTE(review): NAs are excluded from the report, but whether NA rows get
    ``value_flag`` = 1 depends on ``is_not_positive_digit('nan')`` -- confirm.

    dfs -- dictionary of dataframes, keyed by table name
    string_tables -- dictionary; only the "value" entry is read
    outdir -- not used by this function

    Returns the (mutated) ``dfs`` dictionary.
    """
    print("\nCheck string values in numeric fields...")
    output_list = []
    for name, table in dfs.items():
        cols = [c for c in table.columns if string_tables["value"] in c]
        print("\n" + name)
        print(cols)
        if not cols:
            continue

        df = table.copy()
        flag_cols = [col + '_num' for col in cols]
        for col, flag_col in zip(cols, flag_cols):
            is_bad = df[col].astype(str).apply(is_not_positive_digit)
            df[flag_col] = is_bad
            # ``s == s`` is False only for NaN, which is kept out of the
            # report.
            strs = [s for s in df.loc[is_bad, col].unique() if s == s]
            if strs:
                output_list.append([name, col, strs])

        # Use the helper flag columns explicitly (not every bool-typed
        # column) so boolean columns already in the data cannot flag a row.
        # ``axis`` must be passed by keyword on pandas >= 2.0.
        flagged_rows = (
            df[df[flag_cols].any(axis=1)]
            .drop(columns=flag_cols)
            .assign(value_flag=1)
        )
        # Joining on every original column attaches the flag to the rows.
        merged = table.merge(
            flagged_rows, how='outer', on=list(table.columns))
        merged["value_flag"] = merged["value_flag"].fillna(0)
        dfs[name] = merged

    output_df = pd.DataFrame(
        output_list, columns=['tbl', 'column', 'strings'])
    report_path = os.path.join(cls.output_loc, 'qc_value.csv')
    file_to_write = pdToStringIO(output_df)
    DataIO.write(
        DataIO.RepoType.LOCAL,
        'file://' + report_path,
        content=file_to_write,
        query=None,
        username=None,
        password=None,
        write_type=DataIO.WriteType.NEW)
    message = "check_value - {}".format(report_path)
    log.info(message)
    print("report location: {}".format(report_path))
    print("...Done")
    return dfs
def qc_missing(cls, dfs, missing_dict=None, outdir=None):
    """
    Return the summary of missing values for all tables, all columns.

    For each table the columns listed in ``missing_dict`` (or every column
    when the table is not listed) are checked; empty or whitespace-only
    strings count as missing. A per-table summary produced by
    ``check_missing_per_tbl`` is collected into ``qc_missing.csv`` under
    ``cls.output_loc``.

    dfs -- dictionary of dataframes, keyed by table name
    missing_dict -- optional dictionary mapping a table name to the columns
                    to check; None means every column of every table
    outdir -- not used by this function

    Returns a new dictionary mapping each table name to its dataframe with
    a ``missing_flag`` column added (0 when all checked columns are
    present, 1 otherwise). ``dfs`` itself is not modified.
    """
    print("\nCheck missing values...")
    # The signature advertises None as the default, so it must not crash on
    # the membership test below.
    if missing_dict is None:
        missing_dict = {}

    df_list = []
    output_dict = {}
    for name, df in dfs.items():
        df_cols = list(df.columns)
        cols = missing_dict[name] if name in missing_dict else df_cols

        # Find complete rows on a blank->NaN copy, but take them from the
        # ORIGINAL frame by position. Otherwise a row with a blank in an
        # unchecked column no longer equals itself in the join below and
        # is emitted twice, once with each flag value.
        complete_pos = (
            df.replace(r'^\s*$', np.nan, regex=True)
            .reset_index(drop=True)
            .dropna(axis=0, how='any', subset=cols)
            .index.values
        )
        filterdf = df.iloc[complete_pos].assign(missing_flag=0)
        filterdf = filterdf.merge(df, how='outer', on=df_cols)
        # Rows that only came from the right-hand side are the incomplete
        # ones.
        filterdf["missing_flag"] = filterdf["missing_flag"].fillna(1)
        output_dict[name] = filterdf

        summary = (check_missing_per_tbl(df, cols)
                   .reset_index(drop=False)
                   .assign(tbl=name)
                   .rename(columns={'index': 'column'}))
        df_list.append(summary)

    df_qc_missing = pd.concat(df_list)
    report_path = os.path.join(cls.output_loc, 'qc_missing.csv')
    file_to_write = pdToStringIO(df_qc_missing)
    DataIO.write(
        DataIO.RepoType.LOCAL,
        'file://' + report_path,
        content=file_to_write,
        query=None,
        username=None,
        password=None,
        write_type=DataIO.WriteType.NEW)
    message = "mark_missing - {}".format(report_path)
    log.info(message)
    print("report location: {}".format(report_path))
    print("...Done")
    return output_dict