Example #1
def process_master_patient_table(table: Table, engine: Engine,
                                 progargs: Any) -> None:
    """
    Add an integer NHS number column (CRATE_COL_NHS_NUMBER) to the master
    patient table and populate it from the RiO or RCEP source column.
    """
    crate_col_nhs_number = Column(CRATE_COL_NHS_NUMBER,
                                  BigInteger,
                                  nullable=True)
    table.append_column(crate_col_nhs_number)
    add_columns(engine, table, [crate_col_nhs_number])
    if progargs.rcep:
        nhscol = RCEP_COL_NHS_NUMBER
    else:
        nhscol = RIO_COL_NHS_NUMBER
    log.info("Table {}: updating column {}".format(repr(table.name),
                                                   repr(nhscol)))
    ensure_columns_present(engine, tablename=table.name, column_names=[nhscol])
    if not progargs.print:
        ensure_columns_present(engine,
                               tablename=table.name,
                               column_names=[CRATE_COL_NHS_NUMBER])
    execute(
        engine, """
        UPDATE {tablename} SET
            {nhs_number_int} = CAST({nhscol} AS BIGINT)
            WHERE {nhs_number_int} IS NULL
    """.format(
            tablename=table.name,
            nhs_number_int=CRATE_COL_NHS_NUMBER,
            nhscol=nhscol,
        ))
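A minimal invocation sketch, assuming SQLAlchemy reflection and argparse-style options; the connection string and table name below are illustrative assumptions, not CRATE's actual defaults:

# Hypothetical usage; DSN and table name are assumptions.
from argparse import Namespace
from sqlalchemy import MetaData, Table, create_engine

engine = create_engine("mssql+pyodbc://@rio_dsn")  # assumed DSN
metadata = MetaData()
master = Table("ClientIndex", metadata, autoload_with=engine)  # assumed name
process_master_patient_table(master, engine,
                             Namespace(rcep=False, print=False))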
Example #2
def process_nonpatient_table(table: Table, engine: Engine,
                             progargs: Any) -> None:
    """
    Ensure a non-patient RiO table has an integer CRATE PK (CRATE_COL_PK),
    copying an existing integer PK where there is one, and index it.
    """
    if progargs.rcep:
        return
    log.info("Preprocessing non-patient table {}".format(repr(table.name)))
    pk_col = get_effective_int_pk_col(table)
    other_pk_col = pk_col if pk_col != CRATE_COL_PK else None
    if other_pk_col:  # table has a primary key already
        crate_pk_col = Column(CRATE_COL_PK, BigInteger, nullable=True)
    else:
        crate_pk_col = make_bigint_autoincrement_column(
            CRATE_COL_PK, engine.dialect)
    table.append_column(crate_pk_col)  # must be Table-bound, as above
    add_columns(engine, table, [crate_pk_col])
    if not progargs.print:
        ensure_columns_present(engine,
                               tablename=table.name,
                               column_names=[CRATE_COL_PK])
    if other_pk_col:
        execute(
            engine, """
            UPDATE {tablename} SET {crate_pk} = {rio_pk}
            WHERE {crate_pk} IS NULL
        """.format(tablename=table.name,
                   crate_pk=CRATE_COL_PK,
                   rio_pk=other_pk_col))
    add_indexes(engine, table, [{
        'index_name': CRATE_IDX_PK,
        'column': CRATE_COL_PK,
        'unique': True
    }])
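For reference, a minimal sketch of what a helper like make_bigint_autoincrement_column could return; CRATE's real implementation is dialect-aware and may differ:

# Illustrative only; not CRATE's actual helper.
from sqlalchemy import BigInteger, Column, Identity

def make_bigint_autoincrement_column_sketch(name: str) -> Column:
    # Identity() asks the dialect to render e.g. IDENTITY(1,1) on SQL
    # Server or GENERATED BY DEFAULT AS IDENTITY on PostgreSQL.
    return Column(name, BigInteger, Identity(start=1), nullable=False)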
Example #3
def add_geography_to_view(columns: List[str], viewmaker: ViewMaker,
                          engine: Engine,
                          configoptions: PcmisConfigOptions) -> None:
    """
    Modifies a viewmaker to add geography columns to views on PCMIS tables. For
    example, if you start with an address table including postcodes, and you're
    building a view involving it, then you can link in LSOA or IMD information
    with this function.

    Args:
        columns:
            column names from the postcode table to include
        viewmaker:
            a :class:`crate_anon.common.sql.ViewMaker`, which will be modified.
            The base table is taken from ``viewmaker.basetable``.
        engine:
            an SQLAlchemy Engine
        configoptions:
            an instance of :class:`PcmisConfigOptions`
    """
    postcode_alias_1 = "_postcodetable1"
    postcode_alias_2 = "_postcodetable2"
    prev_prefix = "previous_"
    columns_lower = [c.lower() for c in columns]
    basetable = viewmaker.basetable

    ensure_columns_present(
        engine,
        tablename=basetable,
        column_names=[PCMIS_COL_POSTCODE, PCMIS_COL_PREV_POSTCODE])

    for gc in sorted(configoptions.geogcols, key=lambda x: x.lower()):
        if gc in columns_lower:
            raise ValueError(f"Geography column {gc!r} "
                             f"clashes with an existing column")
        viewmaker.add_select(f"{postcode_alias_1}.{gc} AS {gc}")
        viewmaker.add_select(f"{postcode_alias_2}.{gc} AS {prev_prefix}{gc}")

    # PCMIS can have either 'XX99 9XX' or 'XX999XX' format:
    viewmaker.add_from(
        f"LEFT JOIN {configoptions.postcodedb}.{ONSPD_TABLE_POSTCODE} "
        f"AS {postcode_alias_1} "
        f"ON REPLACE({basetable}.{PCMIS_COL_POSTCODE}, ' ', '') = "
        f"{postcode_alias_1}.pcd_nospace")
    viewmaker.add_from(
        f"LEFT JOIN {configoptions.postcodedb}.{ONSPD_TABLE_POSTCODE} "
        f"AS {postcode_alias_2} "
        f"ON REPLACE({basetable}.{PCMIS_COL_POSTCODE}, ' ', '') = "
        f"{postcode_alias_2}.pcd_nospace")
Example #4
def add_geography_to_view(
        basetable: str,
        columns: List[str],
        viewmaker: ViewMaker,  # viewmaker modified
        engine: Engine,
        progargs: Any) -> None:
    """
    Adds ONS geography columns to a view on a PCMIS address table, linking
    on both the current and previous postcodes.
    """
    postcode_alias_1 = "_postcodetable1"
    postcode_alias_2 = "_postcodetable2"
    prev_prefix = "previous_"
    columns_lower = [c.lower() for c in columns]

    ensure_columns_present(
        engine,
        tablename=basetable,
        column_names=[PCMIS_COL_POSTCODE, PCMIS_COL_PREV_POSTCODE])

    for gc in sorted(progargs.geogcols, key=lambda x: x.lower()):
        if gc in columns_lower:
            raise ValueError("Geography column {} clashes with an existing "
                             "column".format(repr(gc)))
        viewmaker.add_select("{postcode_alias_1}.{gc} AS {gc}".format(
            postcode_alias_1=postcode_alias_1, gc=gc))
        viewmaker.add_select(
            "{postcode_alias_2}.{gc} AS {prev_prefix}{gc}".format(
                postcode_alias_2=postcode_alias_2,
                gc=gc,
                prev_prefix=prev_prefix))

    # PCMIS can have either 'XX99 9XX' or 'XX999XX' format:
    viewmaker.add_from(
        "LEFT JOIN {pdb}.{pcdtab} AS {postcode_alias_1} "
        "ON REPLACE({basetable}.{PCMIS_COL_POSTCODE}, ' ', '') = "
        "{postcode_alias_1}.pcd_nospace".format(
            pdb=progargs.postcodedb,
            pcdtab=ONSPD_TABLE_POSTCODE,
            postcode_alias_1=postcode_alias_1,
            basetable=basetable,
            PCMIS_COL_POSTCODE=PCMIS_COL_POSTCODE,
        ))
    viewmaker.add_from(
        "LEFT JOIN {pdb}.{pcdtab} AS {postcode_alias_2} "
        "ON REPLACE({basetable}.{PCMIS_COL_PREV_POSTCODE}, ' ', '') = "
        "{postcode_alias_2}.pcd_nospace".format(
            pdb=progargs.postcodedb,
            pcdtab=ONSPD_TABLE_POSTCODE,
            postcode_alias_2=postcode_alias_2,
            basetable=basetable,
            PCMIS_COL_PREV_POSTCODE=PCMIS_COL_PREV_POSTCODE,
        ))
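The progargs this function reads can be reproduced with argparse; the flag names below are assumptions for illustration, not necessarily CRATE's CLI:

# Hypothetical option wiring for this function's inputs.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--postcodedb", help="schema holding the ONSPD table")
parser.add_argument("--geogcols", nargs="*", default=[],
                    help="ONSPD geography columns to add to views")
progargs = parser.parse_args(
    ["--postcodedb", "onspd", "--geogcols", "lsoa11", "imd"])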
Example #5
def process_table(table: Table, engine: Engine,
                  configoptions: PcmisConfigOptions) -> None:
    """
    Processes a PCMIS table by checking it has appropriate columns, perhaps
    adding a CRATE integer PK, and indexing it.

    Args:
        table: an SQLAlchemy Table to process
        engine: an SQLAlchemy Engine
        configoptions: an instance of :class:`PcmisConfigOptions`
    """
    tablename = table.name
    column_names = table.columns.keys()
    log.debug(f"TABLE: {tablename}; COLUMNS: {column_names}")

    existing_pk_cols = get_pk_colnames(table)
    assert len(existing_pk_cols) < 2, (
        f"Table {tablename} has >1 PK column; don't know what to do")
    if existing_pk_cols and not get_effective_int_pk_col(table):
        raise ValueError(f"Table {table!r} has a non-integer PK")
    adding_crate_pk = not existing_pk_cols

    required_cols = [CRATE_COL_PK] if not configoptions.print_sql_only else []

    if configoptions.drop_not_create:
        # ---------------------------------------------------------------------
        # DROP STUFF! Opposite order to creation (below)
        # ---------------------------------------------------------------------
        drop_indexes(engine, table, [CRATE_IDX_PK])
        drop_columns(engine, table, [CRATE_COL_PK])
    else:
        # ---------------------------------------------------------------------
        # CREATE STUFF!
        # ---------------------------------------------------------------------
        # SQL Server requires Table-bound columns in order to generate DDL:
        if adding_crate_pk:
            crate_pk_col = make_bigint_autoincrement_column(
                CRATE_COL_PK, engine.dialect)
            table.append_column(crate_pk_col)
            add_columns(engine, table, [crate_pk_col])
        ensure_columns_present(engine,
                               tablename=table.name,
                               column_names=required_cols)
        add_indexes(engine, table, [{
            'index_name': CRATE_IDX_PK,
            'column': CRATE_COL_PK,
            'unique': True
        }])
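A sketch of the surrounding driver loop, assuming tables are reflected with SQLAlchemy and a PcmisConfigOptions instance is already in hand; CRATE's real entry point may differ:

# Illustrative driver; not CRATE's actual entry point.
from sqlalchemy import MetaData
from sqlalchemy.engine import Engine

def preprocess_all_tables(engine: Engine,
                          configoptions: PcmisConfigOptions) -> None:
    metadata = MetaData()
    metadata.reflect(bind=engine)  # re-read the live schema
    for table in metadata.sorted_tables:
        process_table(table, engine, configoptions)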
Example #6
def process_table(table: Table, engine: Engine, progargs: Any) -> None:
    """
    Check a table's columns, add a CRATE integer PK if it lacks one, and
    index it; or, if progargs.drop_danger_drop is set, remove those
    additions.
    """
    tablename = table.name
    column_names = table.columns.keys()
    log.debug("TABLE: {}; COLUMNS: {}".format(tablename, column_names))

    existing_pk_cols = get_pk_colnames(table)
    assert len(existing_pk_cols) < 2, (
        "Table {} has >1 PK column; don't know what to do".format(tablename))
    if existing_pk_cols and not get_effective_int_pk_col(table):
        raise ValueError("Table {} has a non-integer PK".format(repr(table)))
    adding_crate_pk = not existing_pk_cols

    required_cols = [CRATE_COL_PK] if not progargs.print else []

    if progargs.drop_danger_drop:
        # ---------------------------------------------------------------------
        # DROP STUFF! Opposite order to creation (below)
        # ---------------------------------------------------------------------
        drop_indexes(engine, table, [CRATE_IDX_PK])
        drop_columns(engine, table, [CRATE_COL_PK])
    else:
        # ---------------------------------------------------------------------
        # CREATE STUFF!
        # ---------------------------------------------------------------------
        # SQL Server requires Table-bound columns in order to generate DDL:
        if adding_crate_pk:
            crate_pk_col = make_bigint_autoincrement_column(
                CRATE_COL_PK, engine.dialect)
            table.append_column(crate_pk_col)
            add_columns(engine, table, [crate_pk_col])
        ensure_columns_present(engine,
                               tablename=table.name,
                               column_names=required_cols)
        add_indexes(engine, table, [{
            'index_name': CRATE_IDX_PK,
            'column': CRATE_COL_PK,
            'unique': True
        }])
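Because the same entry point handles both teardown and creation, a full reset of the CRATE additions is two passes; a sketch, assuming progargs is a mutable argparse Namespace:

# Illustrative reset helper; the flag name comes from the function above.
from typing import Any
from sqlalchemy import Table
from sqlalchemy.engine import Engine

def reset_crate_pk(table: Table, engine: Engine, progargs: Any) -> None:
    progargs.drop_danger_drop = True
    process_table(table, engine, progargs)   # drops CRATE index, then column
    progargs.drop_danger_drop = False
    process_table(table, engine, progargs)   # re-adds column, then index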
Example #7
def add_postcode_geography_view(engine: Engine, progargs: Any,
                                ddhint: DDHint) -> None:  # ddhint modified
    """
    Create a view joining the address table to the ONS postcode database,
    appending the chosen geography columns, then suppress the raw address
    table via ddhint.
    """
    # Re-read column names, as we may have inserted some recently by hand that
    # may not be in the initial metadata.
    if progargs.rio:
        addresstable = RIO_TABLE_ADDRESS
        rio_postcodecol = RIO_COL_POSTCODE
    else:
        addresstable = RCEP_TABLE_ADDRESS
        rio_postcodecol = RCEP_COL_POSTCODE
    orig_column_names = get_column_names(engine,
                                         tablename=addresstable,
                                         sort=True)

    # Remove any original column names being overridden by new ones.
    # (Could also do this the other way around!)
    geogcols_lowercase = [x.lower() for x in progargs.geogcols]
    orig_column_names = [
        x for x in orig_column_names if x.lower() not in geogcols_lowercase
    ]

    orig_column_specs = [
        "{t}.{c}".format(t=addresstable, c=col) for col in orig_column_names
    ]
    geog_col_specs = [
        "{db}.{t}.{c}".format(db=progargs.postcodedb,
                              t=ONSPD_TABLE_POSTCODE,
                              c=col)
        for col in sorted(progargs.geogcols, key=lambda x: x.lower())
    ]
    overlap = set(orig_column_names) & set(progargs.geogcols)
    if overlap:
        raise ValueError("Columns overlap: address table contains columns {}; "
                         "geogcols = {}; overlap = {}".format(
                             orig_column_names, progargs.geogcols, overlap))
    ensure_columns_present(engine,
                           tablename=addresstable,
                           column_names=[rio_postcodecol])
    select_sql = """
        SELECT {origcols},
            {geogcols}
        FROM {addresstable}
        LEFT JOIN {pdb}.{pcdtab}
        ON {addresstable}.{rio_postcodecol} = {pdb}.{pcdtab}.pcds
        -- RCEP, and presumably RiO, appear to use the ONS pcds format, of
        -- 2-4 char outward code; space; 3-char inward code.
        -- If this fails, use this slower version:
        -- ON REPLACE({addresstable}.{rio_postcodecol},
        --            ' ',
        --            '') = {pdb}.{pcdtab}.pcd_nospace
    """.format(
        addresstable=addresstable,
        origcols=",\n            ".join(orig_column_specs),
        geogcols=",\n            ".join(geog_col_specs),
        pdb=progargs.postcodedb,
        pcdtab=ONSPD_TABLE_POSTCODE,
        rio_postcodecol=rio_postcodecol,
    )
    create_view(engine, VIEW_ADDRESS_WITH_GEOGRAPHY, select_sql)
    assert_view_has_same_num_rows(engine, addresstable,
                                  VIEW_ADDRESS_WITH_GEOGRAPHY)
    ddhint.suppress_table(addresstable)
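The row-count assertion guards the LEFT JOIN: joining on a unique postcode key must neither duplicate nor drop address rows. A plausible shape for that check (CRATE's real helper may differ):

# Sketch only; not CRATE's actual implementation.
from sqlalchemy import text
from sqlalchemy.engine import Engine

def assert_same_num_rows_sketch(engine: Engine, basetable: str,
                                viewname: str) -> None:
    def count_rows(obj: str) -> int:
        with engine.connect() as conn:
            return conn.execute(
                text(f"SELECT COUNT(*) FROM {obj}")).scalar_one()
    assert count_rows(basetable) == count_rows(viewname), (
        f"{viewname} should have exactly as many rows as {basetable}")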
Example #8
def process_clindocs_table(table: Table, engine: Engine,
                           progargs: Any) -> None:
    """
    For RiO only (not RCEP): record the highest RevisionID per clinical
    document and flag the current version.
    """
    crate_col_max_docver = Column(CRATE_COL_MAX_DOCVER, Integer, nullable=True)
    crate_col_last_doc = Column(CRATE_COL_LAST_DOC, Integer, nullable=True)
    table.append_column(crate_col_max_docver)
    table.append_column(crate_col_last_doc)
    add_columns(engine, table, [crate_col_max_docver, crate_col_last_doc])
    add_indexes(engine, table, [
        {
            'index_name': CRATE_IDX_RIONUM_SERIALNUM,
            'column': '{rio_number}, SerialNumber'.format(
                rio_number=CRATE_COL_RIO_NUMBER),
        },
        {
            'index_name': CRATE_IDX_MAX_DOCVER,
            'column': CRATE_COL_MAX_DOCVER,
        },
        {
            'index_name': CRATE_IDX_LAST_DOC,
            'column': CRATE_COL_LAST_DOC,
        },
    ])

    required_cols = ["SerialNumber", "RevisionID"]
    if not progargs.print:
        required_cols.extend(
            [CRATE_COL_MAX_DOCVER, CRATE_COL_LAST_DOC, CRATE_COL_RIO_NUMBER])
    ensure_columns_present(engine,
                           tablename=table.name,
                           column_names=required_cols)

    # Find the maximum RevisionID for each document, and store it.
    # Slow query, even with index.
    log.info("Clinical documents table {}: updating {}".format(
        repr(table.name), repr(CRATE_COL_MAX_DOCVER)))
    execute(
        engine, """
        UPDATE p1
        SET p1.{max_docver_col} = subq.max_docver
        FROM {tablename} p1 JOIN (
            SELECT {rio_number}, SerialNumber, MAX(RevisionID) AS max_docver
            FROM {tablename} p2
            GROUP BY {rio_number}, SerialNumber
        ) subq
        ON subq.{rio_number} = p1.{rio_number}
        AND subq.SerialNumber = p1.SerialNumber
        WHERE p1.{max_docver_col} IS NULL
    """.format(
            max_docver_col=CRATE_COL_MAX_DOCVER,
            tablename=table.name,
            rio_number=CRATE_COL_RIO_NUMBER,
        ))

    # Set a single column accordingly
    log.info("Clinical documents table {}: updating {}".format(
        repr(table.name), repr(CRATE_COL_LAST_DOC)))
    execute(
        engine, """
        UPDATE {tablename} SET
            {last_doc_col} =
                CASE
                    WHEN RevisionID = {max_docver_col} THEN 1
                    ELSE 0
                END
        WHERE {last_doc_col} IS NULL
    """.format(
            tablename=table.name,
            last_doc_col=CRATE_COL_LAST_DOC,
            max_docver_col=CRATE_COL_MAX_DOCVER,
        ))
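The two UPDATEs implement a grouped maximum followed by a flag; the same logic restated in plain Python, with illustrative values:

# Pure-Python rendering of the grouped-maximum logic above (illustrative).
from collections import defaultdict

rows = [  # (crate_rio_number, SerialNumber, RevisionID)
    (1001, 1, 1), (1001, 1, 2), (1001, 1, 3), (1001, 2, 1),
]
max_docver = defaultdict(int)
for rio, serial, rev in rows:
    max_docver[rio, serial] = max(max_docver[rio, serial], rev)
# crate_last_doc is 1 only on the newest revision of each document:
last_doc = {(rio, serial, rev): int(rev == max_docver[rio, serial])
            for rio, serial, rev in rows}
assert last_doc[1001, 1, 3] == 1 and last_doc[1001, 1, 1] == 0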
Example #9
def process_progress_notes(table: Table, engine: Engine,
                           progargs: Any) -> None:
    """
    Record the highest SubNum per progress note, flag the current version,
    and (for CPFT RCEP databases) create a view of current notes.
    """
    crate_col_max_subnum = Column(CRATE_COL_MAX_SUBNUM, Integer, nullable=True)
    crate_col_last_note = Column(CRATE_COL_LAST_NOTE, Integer, nullable=True)
    table.append_column(crate_col_max_subnum)
    table.append_column(crate_col_last_note)
    add_columns(engine, table, [crate_col_max_subnum, crate_col_last_note])
    # We're always in "RiO land", not "RCEP land", for this one.
    add_indexes(engine, table, [
        {  # Joint index, for JOIN in UPDATE statement below
            'index_name': CRATE_IDX_RIONUM_NOTENUM,
            'column': '{rio_number}, NoteNum'.format(
                rio_number=CRATE_COL_RIO_NUMBER),
        },
        {  # Speeds up WHERE below. (Much, much faster for second run.)
            'index_name': CRATE_IDX_MAX_SUBNUM,
            'column': CRATE_COL_MAX_SUBNUM,
        },
        {  # Speeds up WHERE below. (Much, much faster for second run.)
            'index_name': CRATE_IDX_LAST_NOTE,
            'column': CRATE_COL_LAST_NOTE,
        },
    ])

    ensure_columns_present(
        engine,
        tablename=table.name,
        column_names=["NoteNum", "SubNum", "EnteredInError", "EnteredInError"])
    if not progargs.print:
        ensure_columns_present(engine,
                               tablename=table.name,
                               column_names=[
                                   CRATE_COL_MAX_SUBNUM, CRATE_COL_LAST_NOTE,
                                   CRATE_COL_RIO_NUMBER
                               ])

    # Find the maximum SubNum for each note, and store it.
    # Slow query, even with index.
    log.info("Progress notes table {}: updating {}".format(
        repr(table.name), repr(CRATE_COL_MAX_SUBNUM)))
    execute(
        engine, """
        UPDATE p1
        SET p1.{max_subnum_col} = subq.max_subnum
        FROM {tablename} p1 JOIN (
            SELECT {rio_number}, NoteNum, MAX(SubNum) AS max_subnum
            FROM {tablename} p2
            GROUP BY {rio_number}, NoteNum
        ) subq
        ON subq.{rio_number} = p1.{rio_number}
        AND subq.NoteNum = p1.NoteNum
        WHERE p1.{max_subnum_col} IS NULL
    """.format(
            max_subnum_col=CRATE_COL_MAX_SUBNUM,
            tablename=table.name,
            rio_number=CRATE_COL_RIO_NUMBER,
        ))

    # Set a single column accordingly
    log.info("Progress notes table {}: updating {}".format(
        repr(table.name), repr(CRATE_COL_LAST_NOTE)))
    execute(
        engine, """
        UPDATE {tablename} SET
            {last_note_col} =
                CASE
                    WHEN SubNum = {max_subnum_col} THEN 1
                    ELSE 0
                END
        WHERE {last_note_col} IS NULL
    """.format(
            tablename=table.name,
            last_note_col=CRATE_COL_LAST_NOTE,
            max_subnum_col=CRATE_COL_MAX_SUBNUM,
        ))

    # Create a view, if we're on a CPFT RCEP database
    if progargs.rcep and progargs.cpft:
        select_sql = """
            SELECT *
            FROM {tablename}
            WHERE
                (EnteredInError <> 1 OR EnteredInError IS NULL)
                AND {last_note_col} = 1
        """.format(
            tablename=table.name,
            last_note_col=CRATE_COL_LAST_NOTE,
        )
        create_view(engine, VIEW_RCEP_CPFT_PROGRESS_NOTES_CURRENT, select_sql)
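The view's WHERE clause, restated as a Python predicate to make the NULL handling explicit (illustrative only):

# Mirrors the SQL filter: (EnteredInError <> 1 OR EnteredInError IS NULL)
# AND crate_last_note = 1.  In SQL, NULL <> 1 evaluates to unknown, hence
# the explicit IS NULL arm; None is handled the same way here.
from typing import Optional

def is_current_valid_note(entered_in_error: Optional[int],
                          crate_last_note: int) -> bool:
    not_in_error = entered_in_error is None or entered_in_error != 1
    return not_in_error and crate_last_note == 1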
Example #10
def process_patient_table(table: Table, engine: Engine, progargs: Any) -> None:
    """
    Add and populate integer CRATE PK and RiO-number columns on a patient
    table, then index them.
    """
    log.info("Preprocessing patient table: {}".format(repr(table.name)))
    rio_type = table_is_rio_type(table.name, progargs)
    if rio_type:
        pk_col = get_effective_int_pk_col(table)
        rio_pk = pk_col if pk_col != CRATE_COL_PK else None
        string_pt_id = get_rio_patient_id_col(table)
        required_cols = [string_pt_id]
    else:  # RCEP type
        rio_pk = None
        required_cols = [RCEP_COL_PATIENT_ID]
        string_pt_id = RCEP_COL_PATIENT_ID
    if not progargs.print:
        required_cols.extend([CRATE_COL_PK, CRATE_COL_RIO_NUMBER])

    # -------------------------------------------------------------------------
    # Add pk and rio_number columns, if not present
    # -------------------------------------------------------------------------
    if rio_type and rio_pk is not None:
        crate_pk_col = Column(CRATE_COL_PK, BigInteger, nullable=True)
        # ... can't do NOT NULL; need to populate it
        required_cols.append(rio_pk)
    else:  # RCEP type, or no PK in RiO
        crate_pk_col = make_bigint_autoincrement_column(
            CRATE_COL_PK, engine.dialect)
        # ... autopopulates
    crate_rio_number_col = Column(CRATE_COL_RIO_NUMBER,
                                  BigInteger,
                                  nullable=True)
    # ... even if RiO numbers are INT, they come from VARCHAR(15) here, and
    # that can (and does) hold values that look numeric but overflow an INT.
    # SQL Server requires Table-bound columns in order to generate DDL:
    table.append_column(crate_pk_col)
    table.append_column(crate_rio_number_col)
    add_columns(engine, table, [crate_pk_col, crate_rio_number_col])

    # -------------------------------------------------------------------------
    # Update pk and rio_number values, if not NULL
    # -------------------------------------------------------------------------
    ensure_columns_present(engine,
                           tablename=table.name,
                           column_names=required_cols)
    cast_id_to_int = sql_fragment_cast_to_int(string_pt_id,
                                              dialect=engine.dialect)
    if rio_type and rio_pk:
        log.info("Table {}: updating columns {} and {}".format(
            repr(table.name), repr(CRATE_COL_PK), repr(CRATE_COL_RIO_NUMBER)))
        execute(
            engine, """
            UPDATE {tablename} SET
                {crate_pk} = {rio_pk},
                {crate_rio_number} = {cast_id_to_int}
            WHERE
                {crate_pk} IS NULL
                OR {crate_rio_number} IS NULL
        """.format(
                tablename=table.name,
                crate_pk=CRATE_COL_PK,
                rio_pk=rio_pk,
                crate_rio_number=CRATE_COL_RIO_NUMBER,
                cast_id_to_int=cast_id_to_int,
            ))
    else:
        # RCEP format, or RiO with no PK
        # crate_pk is autogenerated as an INT IDENTITY field
        log.info("Table {}: updating column {}".format(
            repr(table.name), repr(CRATE_COL_RIO_NUMBER)))
        execute(
            engine,
            """
            UPDATE {tablename} SET
                {crate_rio_number} = {cast_id_to_int}
            WHERE
                {crate_rio_number} IS NULL
        """.format(  # noqa
                tablename=table.name,
                crate_rio_number=CRATE_COL_RIO_NUMBER,
                cast_id_to_int=cast_id_to_int,
            ))
    # -------------------------------------------------------------------------
    # Add indexes, if absent
    # -------------------------------------------------------------------------
    # Note that the indexes are unlikely to speed up the "WHERE ... IS NULL"
    # searches above, so it doesn't matter that we add them last. They exist
    # for the subsequent CRATE anonymisation table scans.
    add_indexes(engine, table, [
        {
            'index_name': CRATE_IDX_PK,
            'column': CRATE_COL_PK,
            'unique': True,
        },
        {
            'index_name': CRATE_IDX_RIONUM,
            'column': CRATE_COL_RIO_NUMBER,
        },
    ])
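The VARCHAR(15) overflow noted above is why the cast goes through sql_fragment_cast_to_int; a plausible SQL Server fragment such a helper might emit (an assumption for illustration, not CRATE's verified output):

# Hypothetical SQL Server fragment: cast only purely numeric strings and
# yield NULL otherwise.  Not CRATE's actual sql_fragment_cast_to_int.
def cast_to_bigint_fragment_mssql(colname: str) -> str:
    # '[^0-9]' is SQL Server's LIKE character class for "any non-digit".
    return (f"CASE WHEN {colname} NOT LIKE '%[^0-9]%' "
            f"THEN CAST({colname} AS BIGINT) ELSE NULL END")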