def process_master_patient_table(table: Table,
                                 engine: Engine,
                                 progargs: Any) -> None:
    crate_col_nhs_number = Column(CRATE_COL_NHS_NUMBER, BigInteger,
                                  nullable=True)
    table.append_column(crate_col_nhs_number)
    add_columns(engine, table, [crate_col_nhs_number])
    if progargs.rcep:
        nhscol = RCEP_COL_NHS_NUMBER
    else:
        nhscol = RIO_COL_NHS_NUMBER
    log.info("Table {}: updating column {}".format(repr(table.name),
                                                   repr(nhscol)))
    ensure_columns_present(engine, tablename=table.name,
                           column_names=[nhscol])
    if not progargs.print:
        ensure_columns_present(engine, tablename=table.name,
                               column_names=[CRATE_COL_NHS_NUMBER])
    execute(engine, """
        UPDATE {tablename} SET
            {nhs_number_int} = CAST({nhscol} AS BIGINT)
        WHERE {nhs_number_int} IS NULL
    """.format(
        tablename=table.name,
        nhs_number_int=CRATE_COL_NHS_NUMBER,
        nhscol=nhscol,
    ))

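# Illustration (not from the original source; names are hypothetical): for a
# master patient table "ClientIndex" with CRATE_COL_NHS_NUMBER =
# "crate_nhs_number_int" and nhscol = "NNN", the UPDATE above would render as
#
#   UPDATE ClientIndex SET
#       crate_nhs_number_int = CAST(NNN AS BIGINT)
#   WHERE crate_nhs_number_int IS NULL
#
# The WHERE ... IS NULL guard makes the statement idempotent: re-running the
# preprocessor converts only rows not yet populated.
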
def process_nonpatient_table(table: Table,
                             engine: Engine,
                             progargs: Any) -> None:
    if progargs.rcep:
        return
    log.info("Preprocessing non-patient table {}".format(repr(table.name)))
    pk_col = get_effective_int_pk_col(table)
    other_pk_col = pk_col if pk_col != CRATE_COL_PK else None
    if other_pk_col:  # table has a primary key already
        crate_pk_col = Column(CRATE_COL_PK, BigInteger, nullable=True)
    else:
        crate_pk_col = make_bigint_autoincrement_column(CRATE_COL_PK,
                                                        engine.dialect)
    table.append_column(crate_pk_col)  # must be Table-bound, as above
    add_columns(engine, table, [crate_pk_col])
    if not progargs.print:
        ensure_columns_present(engine, tablename=table.name,
                               column_names=[CRATE_COL_PK])
    if other_pk_col:
        execute(engine, """
            UPDATE {tablename} SET {crate_pk} = {rio_pk}
            WHERE {crate_pk} IS NULL
        """.format(tablename=table.name,
                   crate_pk=CRATE_COL_PK,
                   rio_pk=other_pk_col))
    add_indexes(engine, table, [{'index_name': CRATE_IDX_PK,
                                 'column': CRATE_COL_PK,
                                 'unique': True}])

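# A note on the two branches above (illustrative; assumes SQL Server): when
# the table lacks a usable integer PK, make_bigint_autoincrement_column()
# produces a self-populating column whose DDL is along the lines of
#
#   crate_pk BIGINT IDENTITY(1, 1) NOT NULL
#
# whereas a table with an existing integer PK gets a plain nullable BIGINT
# that the idempotent UPDATE then backfills from the original PK.
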
def add_geography_to_view(columns: List[str],
                          viewmaker: ViewMaker,
                          engine: Engine,
                          configoptions: PcmisConfigOptions) -> None:
    """
    Modifies a viewmaker to add geography columns to views on PCMIS tables.
    For example, if you start with an address table that includes postcodes,
    and you're building a view involving it, you can link in LSOA or IMD
    information with this function.

    Args:
        columns: column names from the postcode table to include
        viewmaker: a :class:`crate_anon.common.sql.ViewMaker`, which will be
            modified. The base table is taken from ``viewmaker.basetable``.
        engine: an SQLAlchemy Engine
        configoptions: an instance of :class:`PcmisConfigOptions`
    """
    postcode_alias_1 = "_postcodetable1"
    postcode_alias_2 = "_postcodetable2"
    prev_prefix = "previous_"
    columns_lower = [c.lower() for c in columns]
    basetable = viewmaker.basetable
    ensure_columns_present(
        engine, tablename=basetable,
        column_names=[PCMIS_COL_POSTCODE, PCMIS_COL_PREV_POSTCODE])
    for gc in sorted(configoptions.geogcols, key=lambda x: x.lower()):
        if gc in columns_lower:
            raise ValueError(f"Geography column {gc!r} "
                             f"clashes with an existing column")
        viewmaker.add_select(f"{postcode_alias_1}.{gc} AS {gc}")
        viewmaker.add_select(f"{postcode_alias_2}.{gc} AS {prev_prefix}{gc}")
    # PCMIS can have either 'XX99 9XX' or 'XX999XX' format:
    viewmaker.add_from(
        f"LEFT JOIN {configoptions.postcodedb}.{ONSPD_TABLE_POSTCODE} "
        f"AS {postcode_alias_1} "
        f"ON REPLACE({basetable}.{PCMIS_COL_POSTCODE}, ' ', '') = "
        f"{postcode_alias_1}.pcd_nospace")
    # The second join is via the *previous* postcode (alias 2 supplies the
    # "previous_"-prefixed geography columns):
    viewmaker.add_from(
        f"LEFT JOIN {configoptions.postcodedb}.{ONSPD_TABLE_POSTCODE} "
        f"AS {postcode_alias_2} "
        f"ON REPLACE({basetable}.{PCMIS_COL_PREV_POSTCODE}, ' ', '') = "
        f"{postcode_alias_2}.pcd_nospace")

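# Worked example (hypothetical configuration, not from the original source):
# with configoptions.postcodedb = "onspd", geogcols = ["imd", "lsoa11"], and
# viewmaker.basetable = "PatientDetails", the loop adds SELECT elements like
#
#   _postcodetable1.imd AS imd
#   _postcodetable2.imd AS previous_imd
#
# and the two add_from() calls contribute joins of the form
#
#   LEFT JOIN onspd.postcode AS _postcodetable1
#       ON REPLACE(PatientDetails.Postcode, ' ', '') =
#          _postcodetable1.pcd_nospace
#
# i.e. one join via the current postcode and one via the previous postcode,
# matching on the space-stripped ONS "pcd_nospace" form.
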
def add_geography_to_view(basetable: str,
                          columns: List[str],
                          viewmaker: ViewMaker,  # viewmaker modified
                          engine: Engine,
                          progargs: Any) -> None:
    postcode_alias_1 = "_postcodetable1"
    postcode_alias_2 = "_postcodetable2"
    prev_prefix = "previous_"
    columns_lower = [c.lower() for c in columns]
    ensure_columns_present(
        engine, tablename=basetable,
        column_names=[PCMIS_COL_POSTCODE, PCMIS_COL_PREV_POSTCODE])
    for gc in sorted(progargs.geogcols, key=lambda x: x.lower()):
        if gc in columns_lower:
            raise ValueError("Geography column {} clashes with an existing "
                             "column".format(repr(gc)))
        viewmaker.add_select("{postcode_alias_1}.{gc} AS {gc}".format(
            postcode_alias_1=postcode_alias_1, gc=gc))
        viewmaker.add_select(
            "{postcode_alias_2}.{gc} AS {prev_prefix}{gc}".format(
                postcode_alias_2=postcode_alias_2, gc=gc,
                prev_prefix=prev_prefix))
    # PCMIS can have either 'XX99 9XX' or 'XX999XX' format:
    viewmaker.add_from(
        "LEFT JOIN {pdb}.{pcdtab} AS {postcode_alias_1} "
        "ON REPLACE({basetable}.{postcodecol}, ' ', '') = "
        "{postcode_alias_1}.pcd_nospace".format(
            pdb=progargs.postcodedb,
            pcdtab=ONSPD_TABLE_POSTCODE,
            postcode_alias_1=postcode_alias_1,
            basetable=basetable,
            postcodecol=PCMIS_COL_POSTCODE,
        ))
    # Second join via the previous postcode, for the "previous_" columns:
    viewmaker.add_from(
        "LEFT JOIN {pdb}.{pcdtab} AS {postcode_alias_2} "
        "ON REPLACE({basetable}.{prevpostcodecol}, ' ', '') = "
        "{postcode_alias_2}.pcd_nospace".format(
            pdb=progargs.postcodedb,
            pcdtab=ONSPD_TABLE_POSTCODE,
            postcode_alias_2=postcode_alias_2,
            basetable=basetable,
            prevpostcodecol=PCMIS_COL_PREV_POSTCODE,
        ))

def process_table(table: Table,
                  engine: Engine,
                  configoptions: PcmisConfigOptions) -> None:
    """
    Processes a PCMIS table by checking it has appropriate columns, perhaps
    adding a CRATE integer PK, and indexing it.

    Args:
        table: an SQLAlchemy Table to process
        engine: an SQLAlchemy Engine
        configoptions: an instance of :class:`PcmisConfigOptions`
    """
    tablename = table.name
    column_names = table.columns.keys()
    log.debug(f"TABLE: {tablename}; COLUMNS: {column_names}")

    existing_pk_cols = get_pk_colnames(table)
    assert len(existing_pk_cols) < 2, (
        f"Table {tablename} has >1 PK column; don't know what to do")
    if existing_pk_cols and not get_effective_int_pk_col(table):
        raise ValueError(f"Table {table!r} has a non-integer PK")
    adding_crate_pk = not existing_pk_cols

    required_cols = [CRATE_COL_PK] if not configoptions.print_sql_only else []

    if configoptions.drop_not_create:
        # ---------------------------------------------------------------------
        # DROP STUFF! Opposite order to creation (below)
        # ---------------------------------------------------------------------
        drop_indexes(engine, table, [CRATE_IDX_PK])
        drop_columns(engine, table, [CRATE_COL_PK])
    else:
        # ---------------------------------------------------------------------
        # CREATE STUFF!
        # ---------------------------------------------------------------------
        # SQL Server requires Table-bound columns in order to generate DDL:
        if adding_crate_pk:
            crate_pk_col = make_bigint_autoincrement_column(
                CRATE_COL_PK, engine.dialect)
            table.append_column(crate_pk_col)
            add_columns(engine, table, [crate_pk_col])
        ensure_columns_present(engine, tablename=table.name,
                               column_names=required_cols)
        add_indexes(engine, table, [{'index_name': CRATE_IDX_PK,
                                     'column': CRATE_COL_PK,
                                     'unique': True}])

def process_table(table: Table,
                  engine: Engine,
                  progargs: Any) -> None:
    tablename = table.name
    column_names = table.columns.keys()
    log.debug("TABLE: {}; COLUMNS: {}".format(tablename, column_names))

    existing_pk_cols = get_pk_colnames(table)
    assert len(existing_pk_cols) < 2, (
        "Table {} has >1 PK column; don't know what to do".format(tablename))
    if existing_pk_cols and not get_effective_int_pk_col(table):
        raise ValueError("Table {} has a non-integer PK".format(repr(table)))
    adding_crate_pk = not existing_pk_cols

    required_cols = [CRATE_COL_PK] if not progargs.print else []

    if progargs.drop_danger_drop:
        # ---------------------------------------------------------------------
        # DROP STUFF! Opposite order to creation (below)
        # ---------------------------------------------------------------------
        drop_indexes(engine, table, [CRATE_IDX_PK])
        drop_columns(engine, table, [CRATE_COL_PK])
    else:
        # ---------------------------------------------------------------------
        # CREATE STUFF!
        # ---------------------------------------------------------------------
        # SQL Server requires Table-bound columns in order to generate DDL:
        if adding_crate_pk:
            crate_pk_col = make_bigint_autoincrement_column(
                CRATE_COL_PK, engine.dialect)
            table.append_column(crate_pk_col)
            add_columns(engine, table, [crate_pk_col])
        ensure_columns_present(engine, tablename=table.name,
                               column_names=required_cols)
        add_indexes(engine, table, [{'index_name': CRATE_IDX_PK,
                                     'column': CRATE_COL_PK,
                                     'unique': True}])

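# Note on ordering in the two process_table() variants above: the drop branch
# removes the index before the column, the reverse of creation order, because
# SQL Server will not drop a column that an index still references. With
# hypothetical names, the rendered DDL would be roughly:
#
#   DROP INDEX crate_idx_pk ON SomeTable
#   ALTER TABLE SomeTable DROP COLUMN crate_pk
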
def add_postcode_geography_view(engine: Engine,
                                progargs: Any,
                                ddhint: DDHint) -> None:  # ddhint modified
    # Re-read column names, as we may have inserted some recently by hand that
    # may not be in the initial metadata.
    if progargs.rio:
        addresstable = RIO_TABLE_ADDRESS
        rio_postcodecol = RIO_COL_POSTCODE
    else:
        addresstable = RCEP_TABLE_ADDRESS
        rio_postcodecol = RCEP_COL_POSTCODE
    orig_column_names = get_column_names(engine, tablename=addresstable,
                                         sort=True)

    # Remove any original column names being overridden by new ones.
    # (Could also do this the other way around!)
    geogcols_lowercase = [x.lower() for x in progargs.geogcols]
    orig_column_names = [x for x in orig_column_names
                         if x.lower() not in geogcols_lowercase]

    orig_column_specs = [
        "{t}.{c}".format(t=addresstable, c=col)
        for col in orig_column_names
    ]
    geog_col_specs = [
        "{db}.{t}.{c}".format(db=progargs.postcodedb,
                              t=ONSPD_TABLE_POSTCODE,
                              c=col)
        for col in sorted(progargs.geogcols, key=lambda x: x.lower())
    ]
    overlap = set(orig_column_names) & set(progargs.geogcols)
    if overlap:
        raise ValueError(
            "Columns overlap: address table contains columns {}; "
            "geogcols = {}; overlap = {}".format(
                orig_column_names, progargs.geogcols, overlap))
    ensure_columns_present(engine, tablename=addresstable,
                           column_names=[rio_postcodecol])
    select_sql = """
        SELECT {origcols},
            {geogcols}
        FROM {addresstable}
        LEFT JOIN {pdb}.{pcdtab}
        ON {addresstable}.{rio_postcodecol} = {pdb}.{pcdtab}.pcds
        -- RCEP, and presumably RiO, appear to use the ONS pcds format, of
        -- 2-4 char outward code; space; 3-char inward code.
        -- If this fails, use this slower version:
        -- ON REPLACE({addresstable}.{rio_postcodecol}, ' ', '') =
        --    {pdb}.{pcdtab}.pcd_nospace
    """.format(
        addresstable=addresstable,
        origcols=",\n            ".join(orig_column_specs),
        geogcols=",\n            ".join(geog_col_specs),
        pdb=progargs.postcodedb,
        pcdtab=ONSPD_TABLE_POSTCODE,
        rio_postcodecol=rio_postcodecol,
    )
    create_view(engine, VIEW_ADDRESS_WITH_GEOGRAPHY, select_sql)
    assert_view_has_same_num_rows(engine, addresstable,
                                  VIEW_ADDRESS_WITH_GEOGRAPHY)
    ddhint.suppress_table(addresstable)

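# Illustration (hypothetical names, not from the original source): for an
# address table "ClientAddress" with postcode column "PostCode",
# postcodedb = "onspd" and geogcols = ["imd", "lsoa11"], the view SQL is
# essentially
#
#   SELECT ClientAddress.AddressLine1, ..., ClientAddress.PostCode,
#       onspd.postcode.imd, onspd.postcode.lsoa11
#   FROM ClientAddress
#   LEFT JOIN onspd.postcode
#   ON ClientAddress.PostCode = onspd.postcode.pcds
#
# The LEFT JOIN keeps rows whose postcode is absent or unmatched, which is
# why assert_view_has_same_num_rows() is expected to hold.
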
def process_clindocs_table(table: Table,
                           engine: Engine,
                           progargs: Any) -> None:
    # For RiO only, not RCEP
    crate_col_max_docver = Column(CRATE_COL_MAX_DOCVER, Integer,
                                  nullable=True)
    crate_col_last_doc = Column(CRATE_COL_LAST_DOC, Integer, nullable=True)
    table.append_column(crate_col_max_docver)
    table.append_column(crate_col_last_doc)
    add_columns(engine, table, [crate_col_max_docver, crate_col_last_doc])
    add_indexes(engine, table, [
        {
            'index_name': CRATE_IDX_RIONUM_SERIALNUM,
            'column': '{rio_number}, SerialNumber'.format(
                rio_number=CRATE_COL_RIO_NUMBER),
        },
        {
            'index_name': CRATE_IDX_MAX_DOCVER,
            'column': CRATE_COL_MAX_DOCVER,
        },
        {
            'index_name': CRATE_IDX_LAST_DOC,
            'column': CRATE_COL_LAST_DOC,
        },
    ])

    required_cols = ["SerialNumber", "RevisionID"]
    if not progargs.print:
        required_cols.extend([CRATE_COL_MAX_DOCVER, CRATE_COL_LAST_DOC,
                              CRATE_COL_RIO_NUMBER])
    ensure_columns_present(engine, tablename=table.name,
                           column_names=required_cols)

    # Find the maximum RevisionID for each document, and store it.
    # Slow query, even with index.
    log.info("Clinical documents table {}: updating {}".format(
        repr(table.name), repr(CRATE_COL_MAX_DOCVER)))
    execute(engine, """
        UPDATE p1
        SET p1.{max_docver_col} = subq.max_docver
        FROM {tablename} p1 JOIN (
            SELECT {rio_number}, SerialNumber,
                MAX(RevisionID) AS max_docver
            FROM {tablename} p2
            GROUP BY {rio_number}, SerialNumber
        ) subq
        ON subq.{rio_number} = p1.{rio_number}
        AND subq.SerialNumber = p1.SerialNumber
        WHERE p1.{max_docver_col} IS NULL
    """.format(
        max_docver_col=CRATE_COL_MAX_DOCVER,
        tablename=table.name,
        rio_number=CRATE_COL_RIO_NUMBER,
    ))

    # Set a single column accordingly
    log.info("Clinical documents table {}: updating {}".format(
        repr(table.name), repr(CRATE_COL_LAST_DOC)))
    execute(engine, """
        UPDATE {tablename} SET
            {last_doc_col} =
                CASE WHEN RevisionID = {max_docver_col} THEN 1 ELSE 0 END
        WHERE {last_doc_col} IS NULL
    """.format(
        tablename=table.name,
        last_doc_col=CRATE_COL_LAST_DOC,
        max_docver_col=CRATE_COL_MAX_DOCVER,
    ))

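# Note: the "UPDATE alias ... FROM table JOIN (subquery)" form above is T-SQL
# (SQL Server) syntax, not portable SQL. A portable, if typically slower,
# equivalent uses a correlated subquery; a sketch with hypothetical names
# ("Documents", "crate_max_docver"):
#
#   UPDATE Documents SET crate_max_docver = (
#       SELECT MAX(RevisionID) FROM Documents p2
#       WHERE p2.crate_rio_number = Documents.crate_rio_number
#       AND p2.SerialNumber = Documents.SerialNumber)
#   WHERE crate_max_docver IS NULL
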
def process_progress_notes(table: Table,
                           engine: Engine,
                           progargs: Any) -> None:
    crate_col_max_subnum = Column(CRATE_COL_MAX_SUBNUM, Integer,
                                  nullable=True)
    crate_col_last_note = Column(CRATE_COL_LAST_NOTE, Integer, nullable=True)
    table.append_column(crate_col_max_subnum)
    table.append_column(crate_col_last_note)
    add_columns(engine, table, [crate_col_max_subnum, crate_col_last_note])
    # We're always in "RiO land", not "RCEP land", for this one.
    add_indexes(engine, table, [
        {  # Joint index, for JOIN in UPDATE statement below
            'index_name': CRATE_IDX_RIONUM_NOTENUM,
            'column': '{rio_number}, NoteNum'.format(
                rio_number=CRATE_COL_RIO_NUMBER),
        },
        {  # Speeds up WHERE below. (Much, much faster for second run.)
            'index_name': CRATE_IDX_MAX_SUBNUM,
            'column': CRATE_COL_MAX_SUBNUM,
        },
        {  # Speeds up WHERE below. (Much, much faster for second run.)
            'index_name': CRATE_IDX_LAST_NOTE,
            'column': CRATE_COL_LAST_NOTE,
        },
    ])

    ensure_columns_present(engine, tablename=table.name, column_names=[
        "NoteNum", "SubNum", "EnteredInError"])
    if not progargs.print:
        ensure_columns_present(engine, tablename=table.name, column_names=[
            CRATE_COL_MAX_SUBNUM, CRATE_COL_LAST_NOTE, CRATE_COL_RIO_NUMBER])

    # Find the maximum SubNum for each note, and store it.
    # Slow query, even with index.
    log.info("Progress notes table {}: updating {}".format(
        repr(table.name), repr(CRATE_COL_MAX_SUBNUM)))
    execute(engine, """
        UPDATE p1
        SET p1.{max_subnum_col} = subq.max_subnum
        FROM {tablename} p1 JOIN (
            SELECT {rio_number}, NoteNum, MAX(SubNum) AS max_subnum
            FROM {tablename} p2
            GROUP BY {rio_number}, NoteNum
        ) subq
        ON subq.{rio_number} = p1.{rio_number}
        AND subq.NoteNum = p1.NoteNum
        WHERE p1.{max_subnum_col} IS NULL
    """.format(
        max_subnum_col=CRATE_COL_MAX_SUBNUM,
        tablename=table.name,
        rio_number=CRATE_COL_RIO_NUMBER,
    ))

    # Set a single column accordingly
    log.info("Progress notes table {}: updating {}".format(
        repr(table.name), repr(CRATE_COL_LAST_NOTE)))
    execute(engine, """
        UPDATE {tablename} SET
            {last_note_col} =
                CASE WHEN SubNum = {max_subnum_col} THEN 1 ELSE 0 END
        WHERE {last_note_col} IS NULL
    """.format(
        tablename=table.name,
        last_note_col=CRATE_COL_LAST_NOTE,
        max_subnum_col=CRATE_COL_MAX_SUBNUM,
    ))

    # Create a view, if we're on an RCEP database
    if progargs.rcep and progargs.cpft:
        select_sql = """
            SELECT *
            FROM {tablename}
            WHERE
                (EnteredInError <> 1 OR EnteredInError IS NULL)
                AND {last_note_col} = 1
        """.format(
            tablename=table.name,
            last_note_col=CRATE_COL_LAST_NOTE,
        )
        create_view(engine, VIEW_RCEP_CPFT_PROGRESS_NOTES_CURRENT, select_sql)

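# Illustration (hypothetical names): the optional CPFT/RCEP view restricts
# the table to the live version of each note, e.g.
#
#   SELECT * FROM PrgProgressNote
#   WHERE (EnteredInError <> 1 OR EnteredInError IS NULL)
#   AND crate_last_note_in_edit_chain = 1
#
# i.e. the latest revision of each note, excluding notes marked as entered
# in error.
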
def process_patient_table(table: Table,
                          engine: Engine,
                          progargs: Any) -> None:
    log.info("Preprocessing patient table: {}".format(repr(table.name)))
    rio_type = table_is_rio_type(table.name, progargs)
    if rio_type:
        pk_col = get_effective_int_pk_col(table)
        rio_pk = pk_col if pk_col != CRATE_COL_PK else None
        string_pt_id = get_rio_patient_id_col(table)
        required_cols = [string_pt_id]
    else:  # RCEP type
        rio_pk = None
        required_cols = [RCEP_COL_PATIENT_ID]
        string_pt_id = RCEP_COL_PATIENT_ID
    if not progargs.print:
        required_cols.extend([CRATE_COL_PK, CRATE_COL_RIO_NUMBER])

    # -------------------------------------------------------------------------
    # Add pk and rio_number columns, if not present
    # -------------------------------------------------------------------------
    if rio_type and rio_pk is not None:
        crate_pk_col = Column(CRATE_COL_PK, BigInteger, nullable=True)
        # ... can't do NOT NULL; need to populate it
        required_cols.append(rio_pk)
    else:  # RCEP type, or no PK in RiO
        crate_pk_col = make_bigint_autoincrement_column(CRATE_COL_PK,
                                                        engine.dialect)
        # ... autopopulates
    crate_rio_number_col = Column(CRATE_COL_RIO_NUMBER, BigInteger,
                                  nullable=True)
    # ... even if RiO numbers are INT, they come from VARCHAR(15) here, and
    # that can (and does) look numeric and overflow an INT.
    # SQL Server requires Table-bound columns in order to generate DDL:
    table.append_column(crate_pk_col)
    table.append_column(crate_rio_number_col)
    add_columns(engine, table, [crate_pk_col, crate_rio_number_col])

    # -------------------------------------------------------------------------
    # Update pk and rio_number values, where they are currently NULL
    # -------------------------------------------------------------------------
    ensure_columns_present(engine, tablename=table.name,
                           column_names=required_cols)
    cast_id_to_int = sql_fragment_cast_to_int(string_pt_id,
                                              dialect=engine.dialect)
    if rio_type and rio_pk:
        log.info("Table {}: updating columns {} and {}".format(
            repr(table.name), repr(CRATE_COL_PK), repr(CRATE_COL_RIO_NUMBER)))
        execute(engine, """
            UPDATE {tablename} SET
                {crate_pk} = {rio_pk},
                {crate_rio_number} = {cast_id_to_int}
            WHERE
                {crate_pk} IS NULL
                OR {crate_rio_number} IS NULL
        """.format(
            tablename=table.name,
            crate_pk=CRATE_COL_PK,
            rio_pk=rio_pk,
            crate_rio_number=CRATE_COL_RIO_NUMBER,
            cast_id_to_int=cast_id_to_int,
        ))
    else:
        # RCEP format, or RiO with no PK
        # crate_pk is autogenerated as a BIGINT IDENTITY field
        log.info("Table {}: updating column {}".format(
            repr(table.name), repr(CRATE_COL_RIO_NUMBER)))
        execute(engine, """
            UPDATE {tablename} SET
                {crate_rio_number} = {cast_id_to_int}
            WHERE
                {crate_rio_number} IS NULL
        """.format(  # noqa
            tablename=table.name,
            crate_rio_number=CRATE_COL_RIO_NUMBER,
            cast_id_to_int=cast_id_to_int,
        ))

    # -------------------------------------------------------------------------
    # Add indexes, if absent
    # -------------------------------------------------------------------------
    # Note that the indexes are unlikely to speed up the "WHERE ... IS NULL"
    # search above, so it doesn't matter that we add these last. Their use is
    # for the subsequent CRATE anonymisation table scans.
    add_indexes(engine, table, [
        {
            'index_name': CRATE_IDX_PK,
            'column': CRATE_COL_PK,
            'unique': True,
        },
        {
            'index_name': CRATE_IDX_RIONUM,
            'column': CRATE_COL_RIO_NUMBER,
        },
    ])

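# Note on sql_fragment_cast_to_int() (the exact output is dialect-dependent;
# this is a sketch, not necessarily what the helper emits): the VARCHAR(15)
# patient IDs can contain non-numeric values, so a naive CAST may raise an
# error. On SQL Server 2012+, one safe formulation is
#
#   TRY_CAST(ClientID AS BIGINT)  -- yields NULL, not an error, if non-numeric
#
# where "ClientID" stands for the string patient ID column.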