Example #1
def refresh_materialized_view(schema_name, view_name, db: DatabaseSession):
    """
    Refresh materialized view <schema>.<view> in ID3C.
    """

    LOG.info(f"Refreshing materialized view «{schema_name}.{view_name}»")

    db.cursor().execute(
        """
        select refresh_materialized_view(%s, %s)
        """, (schema_name, view_name))

    LOG.info("Successfully refreshed materialized view")
Example #2
def ls():
    """List users."""
    session = DatabaseSession()

    with session.cursor() as cursor:
        cursor.execute("""
            select usename as name,
                   pg_catalog.shobj_description(usesysid, 'pg_authid') as description,
                   coalesce(array_agg(groname order by groname) filter (where groname is not null), '{}') as roles
              from pg_catalog.pg_user
              left join pg_catalog.pg_group on (grolist @> array[usesysid])
             where usename not in ('postgres', 'rdsadmin')
             group by name, usesysid
             order by name
            """)

        users = list(cursor)

    # Line up name + description nicely into a column
    def maxlen(attr):
        return max(map(len, filter(None, map(attrgetter(attr), users))),
                   default=0)

    template = "{:<%d}" % (maxlen("name") + 3)

    for user in users:
        click.secho(template.format(user.name), bold=True, nl=False)
        click.echo(", ".join(user.roles))
Example #3
def insert_dets(db: DatabaseSession, project: Project, offers: List[dict]):
    """
    Inserts synthetic DETs into ``receiving.redcap_det`` for the REDCap record
    *offers* made for *project*.
    """
    dets = [(Json(det(project, offer, TESTING_INSTRUMENT)), )
            for offer in offers]

    LOG.info(f"Inserting {len(dets):,} synthetic REDCap DETs for {project}")

    with db.cursor() as cursor:
        execute_values(
            cursor, """
            insert into receiving.redcap_det (document) values %s
            """, dets)
Example #4
File: fhir.py Project: seattleflu/id3c
def etl_fhir(*, db: DatabaseSession):
    LOG.debug(f"Starting the FHIR ETL routine, revision {REVISION}")

    # Fetch and iterate over FHIR documents that aren't processed
    #
    # Use a server-side cursor by providing a name, and limit fetches to one
    # record at a time, to keep the local process size down.
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same FHIR documents.
    LOG.debug("Fetching unprocessed FHIR documents")

    fhir_documents = db.cursor("fhir")
    fhir_documents.itersize = 1
    fhir_documents.execute(
        """
        select fhir_id as id, document
          from receiving.fhir
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{
            "etl": ETL_NAME,
            "revision": REVISION
        }]), ))

    for record in fhir_documents:
        with db.savepoint(f"FHIR document {record.id}"):
            LOG.info(f"Processing FHIR document {record.id}")

            assert_bundle_collection(record.document)
            bundle = Bundle(record.document)
            resources = extract_resources(bundle)

            # Loop over every Resource in the Bundle's entries, processing
            # what is needed along the way.
            try:
                assert_required_resource_types_present(resources)
                process_bundle_entries(db, bundle)

            except SkipBundleError as error:
                LOG.warning(
                    f"Skipping bundle in FHIR document «{record.id}»: {error}")
                mark_skipped(db, record.id)
                continue

            mark_processed(db, record.id, {"status": "processed"})
            LOG.info(f"Finished processing FHIR document {record.id}")
Example #5
def upsert_encounter_location(db: DatabaseSession, encounter_id: int,
                              relation: str, location_id: int) -> Any:
    """
    Upserts an encounter location by its *encounter_id* and *relation*.
    """
    LOG.debug(f"Upserting encounter {relation} location")

    with db.cursor() as cursor:
        cursor.execute(
            """
            insert into warehouse.encounter_location (encounter_id, relation, location_id)
                values (%s, %s, %s)
                on conflict (encounter_id, relation) do update
                    set location_id = excluded.location_id
            """, (encounter_id, relation, location_id))

        assert cursor.rowcount == 1, "Upsert affected no rows!"
Example #6
def mark_processed(db: DatabaseSession, det_id: int, entry = {}) -> None:
    LOG.debug(f"Appending to processing log of REDCap DET record {det_id}")

    data = {
        "det_id": det_id,
        "log_entry": Json({
            **entry,
            "timestamp": datetime.now(timezone.utc),
        }),
    }

    with db.cursor() as cursor:
        cursor.execute("""
            update receiving.redcap_det
               set processing_log = processing_log || %(log_entry)s
             where redcap_det_id = %(det_id)s
            """, data)
Example #7
def set_ls():
    """List identifier sets."""
    session = DatabaseSession()

    with session.cursor() as cursor:
        cursor.execute("""
            select name, description
              from warehouse.identifier_set
             order by lower(name)
            """)

        sets = list(cursor)

    # Line up names nicely into a column
    template = "{:<%d}" % (max(len(s.name) for s in sets) + 3)

    for set in sets:
        click.secho(template.format(set.name), bold = True, nl = False)
        click.echo(set.description)
Example #8
def labels(filename, layout: str='default'):
    """
    Make barcode labels for an existing batch of identifiers.

    A PDF of printable barcode labels is generated using the Lab Labels¹
    instance <https://backoffice.seattleflu.org/labels/>.  An alternative
    instance may be used by setting the LABEL_API environment variable to the
    instance URL.

    The batch of identifiers to make labels for is selected interactively based
    on the identifier set and time of original generation.

    If --layout is requested, the printable barcode labels will use the given
    version of the layout, if available.

    ¹ https://github.com/MullinsLab/Lab-Labels
    """
    session = DatabaseSession()

    # Fetch batches of identifiers
    with session.cursor() as cursor:
        cursor.execute("""
            select identifier_set.name as set_name,
                   to_char(generated, 'FMDD Mon YYYY') as generated_date,
                   generated,
                   count(*)
              from warehouse.identifier
              join warehouse.identifier_set using (identifier_set_id)
             group by generated, identifier_set.name
             order by generated, identifier_set.name
            """)

        batches = list(cursor)

    # Print batches for selection
    click.echo("\nThe following batches of identifiers exist:\n")

    for index, batch in enumerate(batches, 1):
        click.secho(f"{index:2d}) ", bold = True, nl = False)
        click.echo(f"{batch.generated_date:>11} — {batch.count:>5,} {batch.set_name}")

    # Which batch do we want?
    choice = click.prompt(
        "\nFor which batch would you like to make labels",
        prompt_suffix = "? ",
        type = click.IntRange(1, len(batches)),
        default = str(len(batches)))

    chosen_batch = batches[int(choice) - 1]

    # Fetch identifiers for the chosen batch
    with session.cursor() as cursor:
        cursor.execute("""
            select uuid, barcode
              from warehouse.identifier
              join warehouse.identifier_set using (identifier_set_id)
             where identifier_set.name = %s
               and generated = %s
            """, (chosen_batch.set_name, chosen_batch.generated))

        identifiers = list(cursor)

    assert len(identifiers) == chosen_batch.count

    label_layout = labelmaker.layout_identifiers(chosen_batch.set_name, identifiers, layout)
    pdf = labelmaker.generate_pdf(label_layout)
    filename.write(pdf)
Example #9
def etl_longitudinal(*, db: DatabaseSession):
    LOG.debug(f"Starting the longitudinal ETL routine, revision {REVISION}")

    # Fetch and iterate over longitudinal records that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same longitudinal records.
    LOG.debug("Fetching unprocessed longitudinal records")

    longitudinal = db.cursor("longitudinal")
    longitudinal.execute(
        """
        select longitudinal_id as id, document
          from receiving.longitudinal
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{
            "revision": REVISION
        }]), ))

    for record in longitudinal:
        with db.savepoint(f"longitudinal record {record.id}"):
            LOG.info(f"Processing longitudinal record {record.id}")

            # Check validity of barcode
            received_sample_identifier = sample_identifier(db, record.document)

            # Check sample exists in database
            if received_sample_identifier:
                sample = find_sample(db, identifier=received_sample_identifier)
            else:
                sample = None

            # Most of the time we expect to see existing sites so a
            # select-first approach makes the most sense to avoid useless
            # updates.
            site = find_or_create_site(
                db,
                identifier=site_identifier(record.document),
                details={"type": record.document['type']})

            # Most of the time we expect to see existing individuals and new
            # encounters.
            # Encounters we see more than once are presumed to be
            # corrections.
            individual = find_or_create_individual(
                db,
                identifier=record.document["individual"],
                sex=sex(record.document))

            encounter = upsert_encounter(
                db,
                identifier=record.document["identifier"],
                encountered=record.document["encountered"],
                individual_id=individual.id,
                site_id=site.id,
                age=age(record.document),
                details=encounter_details(record.document))

            if sample:
                sample = update_sample(db,
                                       sample=sample,
                                       encounter_id=encounter.id)

            # Link encounter to a Census tract, if we have it
            tract_identifier = record.document.get("census_tract")

            if tract_identifier:
                tract = find_location(db, "tract", str(tract_identifier))
                assert tract, f"Tract «{tract_identifier}» is unknown"

                upsert_encounter_location(db,
                                          encounter_id=encounter.id,
                                          relation="residence",
                                          location_id=tract.id)

            mark_processed(db, record.id, {"status": "processed"})

            LOG.info(f"Finished processing longitudinal record {record.id}")
Example #10
def etl_manifest(*, db: DatabaseSession):
    LOG.debug(f"Starting the manifest ETL routine, revision {REVISION}")

    # XXX TODO: Stop hardcoding valid identifier sets.  Instead, accept them as
    # an option or config (and validate option choices against what's actually
    # in the database).  We won't want to validate using click.option(),
    # because that would necessitate a database connection simply to run
    # bin/id3c at all.
    #   -trs, 13 May 2019
    expected_identifier_sets = {
        "samples": {"samples"},
        "collections": {
            "collections-environmental",
            "collections-fluathome.org",
            "collections-household-intervention",
            "collections-household-intervention-asymptomatic",
            "collections-household-observation",
            "collections-household-observation-asymptomatic",
            "collections-kiosks",
            "collections-kiosks-asymptomatic",
            "collections-seattleflu.org",
            "collections-swab&send",
            "collections-swab&send-asymptomatic",
            "collections-self-test",
            "collections-scan",
            "collections-haarvi",
            "samples-haarvi",
        },
        "rdt": {"collections-fluathome.org"}
    }

    # Fetch and iterate over samples that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same samples.
    LOG.debug("Fetching unprocessed manifest records")

    manifest = db.cursor("manifest")
    manifest.execute("""
        select manifest_id as id, document
          from receiving.manifest
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{ "etl": ETL_NAME, "revision": REVISION }]),))

    for manifest_record in manifest:
        with db.savepoint(f"manifest record {manifest_record.id}"):
            LOG.info(f"Processing record {manifest_record.id}")

            # Convert sample barcode to full identifier, ensuring it's
            # known and from the correct identifier set.
            sample_barcode = manifest_record.document.pop("sample")
            sample_identifier = find_identifier(db, sample_barcode)

            if not sample_identifier:
                LOG.warning(f"Skipping sample with unknown sample barcode «{sample_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            if (manifest_record.document.get("sample_type") and
                manifest_record.document["sample_type"] == "rdt"):
                assert sample_identifier.set_name in expected_identifier_sets["rdt"], \
                    (f"Sample identifier found in set «{sample_identifier.set_name}», " +
                    f"not {expected_identifier_sets['rdt']}")
            else:
                assert sample_identifier.set_name in expected_identifier_sets["samples"], \
                    (f"Sample identifier found in set «{sample_identifier.set_name}», " +
                    f"not {expected_identifier_sets['samples']}")

            # Optionally, convert the collection barcode to full
            # identifier, ensuring it's known and from the correct
            # identifier set.
            collection_barcode = manifest_record.document.pop("collection", None)
            collection_identifier = find_identifier(db, collection_barcode) if collection_barcode else None

            if collection_barcode and not collection_identifier:
                LOG.warning(f"Skipping sample with unknown collection barcode «{collection_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            assert not collection_identifier \
                or collection_identifier.set_name in expected_identifier_sets["collections"], \
                    f"Collection identifier found in set «{collection_identifier.set_name}», not {expected_identifier_sets['collections']}" # type: ignore

            # Sample collection date
            collection_date = manifest_record.document.get("date")

            # Upsert sample cooperatively with enrollments ETL routine
            #
            # The details document was intentionally modified by two pop()s
            # earlier to remove barcodes that were looked up.  The
            # rationale is that we want just one clear place in the
            # warehouse for each piece of information.
            sample, status = upsert_sample(db,
                identifier            = sample_identifier.uuid,
                collection_identifier = collection_identifier.uuid if collection_identifier else None,
                collection_date       = collection_date,
                additional_details    = manifest_record.document)

            mark_loaded(db, manifest_record.id,
                status = status,
                sample_id = sample.id)

            LOG.info(f"Finished processing manifest record {manifest_record.id}")
Example #11
def upsert_sample(db: DatabaseSession,
                  identifier: str,
                  collection_identifier: Optional[str],
                  collection_date: Optional[str],
                  additional_details: dict) -> Tuple[Any, str]:
    """
    Upsert sample by its *identifier* and/or *collection_identifier*.

    An existing sample has its *identifier*, *collection_identifier*,
    *collection_date* updated, and the provided *additional_details* are
    merged (at the top-level only) into the existing sample details, if any.

    Raises an exception if there is more than one matching sample.
    """
    data = {
        "identifier": identifier,
        "collection_identifier": collection_identifier,
        "collection_date": collection_date,
        "additional_details": Json(additional_details),
    }

    # Look for existing sample(s)
    with db.cursor() as cursor:
        cursor.execute("""
            select sample_id as id, identifier, collection_identifier, encounter_id
              from warehouse.sample
             where identifier = %(identifier)s
                or collection_identifier = %(collection_identifier)s
               for update
            """, data)

        samples = list(cursor)

    # Nothing found → create
    if not samples:
        LOG.info("Creating new sample")
        status = 'created'
        sample = db.fetch_row("""
            insert into warehouse.sample (identifier, collection_identifier, collected, details)
                values (%(identifier)s,
                        %(collection_identifier)s,
                        date_or_null(%(collection_date)s),
                        %(additional_details)s)
            returning sample_id as id, identifier, collection_identifier, encounter_id
            """, data)

    # One found → update
    elif len(samples) == 1:
        status = 'updated'
        sample = samples[0]

        LOG.info(f"Updating existing sample {sample.id}")
        sample = db.fetch_row("""
            update warehouse.sample
               set identifier = %(identifier)s,
                   collection_identifier = %(collection_identifier)s,
                   collected = date_or_null(%(collection_date)s),
                   details = coalesce(details, '{}') || %(additional_details)s

             where sample_id = %(sample_id)s

            returning sample_id as id, identifier, collection_identifier, encounter_id
            """,
            { **data, "sample_id": sample.id })

        assert sample.id, "Update affected no rows!"

    # More than one found → error
    else:
        raise Exception(f"More than one sample matching sample and/or collection barcodes: {samples}")

    return sample, status
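The `coalesce(details, '{}') || %(additional_details)s` expression in the update merges new details into the old ones at the top level only: `jsonb || jsonb` on two objects keeps the right-hand value for any duplicate key and does not recurse. A small illustrative check, with placeholder connection details and invented values:

# Sketch showing the top-level-only jsonb merge semantics used above.
import psycopg2

conn = psycopg2.connect("dbname=id3c")  # hypothetical DSN

with conn.cursor() as cursor:
    cursor.execute(
        """select '{"a": {"x": 1}, "b": 1}'::jsonb || '{"a": {"y": 2}}'::jsonb""")
    print(cursor.fetchone()[0])  # {'a': {'y': 2}, 'b': 1} — "a" replaced, not deep-merged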
Example #12
File: kit.py Project: sonali-mhihim/id3c
def kit_enrollments(*, db: DatabaseSession):
    LOG.debug(
        f"Starting the kit enrollments ETL routine, revision {ENROLLMENTS_REVISION}"
    )

    expected_barcode_types = {"ScannedSelfSwab", "ManualSelfSwab"}

    LOG.debug("Fetching unprocessed enrollments")
    enrollments = db.cursor("enrollments")
    enrollments.execute(
        """
        select enrollment_id as id, document
          from receiving.enrollment
         where not processing_log @> %s
         order by id
          for update
        """, (Json([{
            "etl": ETL_NAME,
            "revision": ENROLLMENTS_REVISION
        }]), ))

    for enrollment in enrollments:
        with db.savepoint(f"enrollment {enrollment.id}"):
            LOG.info(f"Processing enrollment {enrollment.id}")

            # Find encounter that should have been created
            # from this enrollment record through etl enrollments
            encounter = find_encounter(db, enrollment.document["id"])

            # Error out the kit etl process if no encounter found
            # The kit etl process can try again starting with this record
            # next time with the idea that the encounter will be
            # created by then.
            if not encounter:
                raise EncounterNotFoundError(
                    f"No encounter with identifier «{enrollment.document['id']}» found"
                )

            # Skip and mark the enrollment document as processed if the
            # encounter found is linked to a site that is not self-test
            if encounter.site != "self-test":
                LOG.debug(
                    f"Found encounter {encounter.id} «{encounter.identifier}»"
                    + f"linked to site «{encounter.site}», not 'self-test'")
                mark_enrollment_processed(db, enrollment.id)
                continue

            for code in enrollment.document["sampleCodes"]:
                barcode = code.get("code")

                # Kit must have a barcode
                if not barcode:
                    LOG.warning(f"No barcode found in sampleCodes {code}")
                    continue

                # Barcode must be of expected barcode type
                if code["type"] not in expected_barcode_types:
                    LOG.debug(f"Skipping barcode with type {code['type']}")
                    continue

                # Convert kit barcode to full identifier
                kit_identifier = find_identifier(db, barcode)

                if not kit_identifier:
                    LOG.warning(
                        f"Skipping kit with unknown barcode «{barcode}»")
                    continue

                if kit_identifier.set_name not in expected_identifier_sets[
                        "kits"]:
                    LOG.warning(
                        f"Skipping kit with identifier found in " +
                        f"set «{kit_identifier.set_name}» not {expected_identifier_sets['kits']}"
                    )
                    continue

                details = {"type": code["type"]}

                kit, status = upsert_kit_with_encounter(
                    db,
                    identifier=kit_identifier.uuid,
                    encounter_id=encounter.id,
                    additional_details=details)

                if status == "updated":
                    update_kit_samples(db, kit)

            mark_enrollment_processed(db, enrollment.id)

            LOG.info(f"Finished processing enrollment {enrollment.id}")
Example #13
File: kit.py Project: sonali-mhihim/id3c
def kit_manifests(*, db: DatabaseSession):
    LOG.debug(
        f"Starting the kits manifests ETL routine, revision {MANIFEST_REVISION}"
    )

    LOG.debug("Fetching unprocessed manifest records")

    manifest = db.cursor("manifest")
    manifest.execute(
        """
        select manifest_id as id, document
          from receiving.manifest
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{
            "etl": ETL_NAME,
            "revision": MANIFEST_REVISION
        }]), ))

    for manifest_record in manifest:
        with db.savepoint(f"manifest record {manifest_record.id}"):
            LOG.info(f"Processing record {manifest_record.id}")

            # Mark record as skipped
            # if it does not contain a kit related sample
            if "kit" not in manifest_record.document:
                LOG.info(
                    f"Skipping manifest record {manifest_record.id} without kit data"
                )
                mark_skipped(db, manifest_record.id)
                continue

            sample_barcode = manifest_record.document.pop("sample")
            sample_identifier = find_identifier(db, sample_barcode)

            # Mark record as skipped
            # if it has an unknown sample barcode
            if not sample_identifier:
                LOG.warning(
                    f"Skipping manifest record with unknown sample barcode «{sample_barcode}»"
                )
                mark_skipped(db, manifest_record.id)
                continue

            # Mark record as skipped if the sample identifier set is unexpected
            if sample_identifier.set_name not in expected_identifier_sets[
                    "samples"]:
                LOG.warning(
                    f"Skipping manifest record with sample identifier found in "
                    +
                    f"set «{sample_identifier.set_name}», not {expected_identifier_sets['samples']}"
                )
                mark_skipped(db, manifest_record.id)
                continue

            # Find sample that should have been created from this
            # manifest record via etl manifest
            sample = find_sample(db, sample_identifier.uuid)

            # Error out the kit etl process if no sample found
            # The kit etl process can try again starting with this record
            # next time with the idea that the sample will be
            # created by then.
            if not sample:
                raise SampleNotFoundError(
                    f"No sample with «{sample_identifier.uuid}» found")

            # Mark record as skipped if the sample does not have a
            # sample type (utm or rdt)
            if sample.type not in {"utm", "rdt"}:
                LOG.info(f"Skipping manifest record {manifest_record.id} " +
                         f"with unknown sample type {sample.type}")
                mark_skipped(db, manifest_record.id)
                continue

            kit_barcode = manifest_record.document.pop("kit")
            kit_identifier = find_identifier(db, kit_barcode)

            # Mark record as skipped if it has an unknown kit barcode
            if not kit_identifier:
                LOG.warning(
                    f"Skipping kit with unknown barcode «{kit_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            # Mark record as skipped if kit identifier set is unexpected
            if kit_identifier.set_name not in expected_identifier_sets["kits"]:
                LOG.warning(
                    f"Skipping kit with identifier found in " +
                    f"set «{kit_identifier.set_name}» not {expected_identifier_sets['kits']}"
                )
                mark_skipped(db, manifest_record.id)
                continue

            # List of extra data not needed for kit record that can
            # be removed before adding manifest document to kit details
            extra_data = [
                "collection", "sample_type", "aliquot_date", "aliquots",
                "racks"
            ]
            for key in extra_data:
                manifest_record.document.pop(key, None)

            # Try to find identifier for the test-strip barcode for rdt samples
            if sample.type == "rdt":
                update_test_strip(db, manifest_record.document)

            kit, status = upsert_kit_with_sample(
                db,
                identifier=kit_identifier.uuid,
                sample=sample,
                additional_details=manifest_record.document)

            if status == "updated":
                update_sample(db, sample, kit.encounter_id)

            mark_loaded(db, manifest_record.id, status, kit.id)
Example #14
def etl_presence_absence(*, db: DatabaseSession):
    LOG.debug(
        f"Starting the presence_absence ETL routine, revision {REVISION}")

    # Fetch and iterate over presence-absence tests that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same presence-absence tests.
    LOG.debug("Fetching unprocessed presence-absence tests")

    presence_absence = db.cursor("presence_absence")
    presence_absence.itersize = 1
    presence_absence.execute(
        """
        select presence_absence_id as id, document,
               received::date as received_date
          from receiving.presence_absence
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{
            "revision": REVISION
        }]), ))

    for group in presence_absence:
        with db.savepoint(f"presence_absence group {group.id}"):
            LOG.info(f"Processing presence_absence group {group.id}")

            # Samplify will now send documents with a top level key
            # "samples". The new format also includes a "chip" key for each
            # sample which is then included in the unique identifier for
            # each presence/absence result
            #   -Jover, 14 Nov 2019
            try:
                received_samples = group.document["samples"]
            except KeyError as error:
                # Skip documents in the old format because they do not
                # include the "chip" key which is needed for the
                # unique identifier for each result.
                #   -Jover, 14 Nov 2019
                # Also skip the old format to avoid ingesting wrong data from
                # plate swaps! Ingesting it would associate 188 samples with
                # the wrong nwgc_id.
                #   -Jover, 06 Dec 2019
                if (group.document.get("store") is not None
                        or group.document.get("Update") is not None):

                    LOG.info(
                        "Skipping presence_absence record that is in old format"
                    )
                    mark_processed(db, group.id)
                    continue

                else:
                    raise error from None

            for received_sample in received_samples:
                received_sample_barcode = received_sample.get("investigatorId")
                if not received_sample_barcode:
                    LOG.info(
                        f"Skipping sample «{received_sample['sampleId']}» without SFS barcode"
                    )
                    continue

                # Don't go any further if the sample is marked as Failed
                sample_failed = received_sample.get("sampleFailed")
                if sample_failed is True:
                    LOG.info(
                        f"Skipping sample «{received_sample_barcode}» that has been failed"
                    )
                    continue

                # Don't go any further if there are no results to import.
                test_results = received_sample["targetResults"]

                if not test_results:
                    LOG.warning(
                        f"Skipping sample «{received_sample_barcode}» without any results"
                    )
                    continue

                received_sample_id = str(received_sample["sampleId"])
                chip = received_sample.get("chip")
                extraction_date = received_sample.get("extractionDate")
                assay_name = received_sample.get("assayName")
                assay_date = received_sample.get("assayDate")
                # The assayType field will be removed after Samplify starts
                # sending us OpenArray results with target.clinicalStatus.
                #
                # kfay, 28 Dec 2020
                assay_type = received_sample.get("assayType")

                # Guard against empty chip values
                assert chip or "chip" not in received_sample, "Received bogus chip id"

                # Must be current results
                LOG.info(f"Processing sample «{received_sample_barcode}»")

                if not received_sample.get("isCurrentExpressionResult"):
                    LOG.warning(
                        f"Skipping out-of-date results for sample «{received_sample_barcode}»"
                    )
                    continue

                # Barcode must match a known identifier
                received_sample_identifier = sample_identifier(
                    db, received_sample_barcode)

                if not received_sample_identifier:
                    LOG.warning(
                        f"Skipping results for sample without a known identifier «{received_sample_barcode}»"
                    )
                    continue

                # Track Samplify's internal ids for our samples, which is
                # unfortunately necessary for linking genomic data NWGC also
                # sends.
                sample = update_sample(
                    db,
                    identifier=received_sample_identifier,
                    additional_details=sample_details(received_sample))

                # Finally, process all results.
                for test_result in test_results:
                    test_result_target_id = test_result["geneTarget"]
                    LOG.debug(
                        f"Processing target «{test_result_target_id}» "
                        f"for sample «{received_sample_barcode}»")

                    # Skip this result if it's actually a non-result
                    present = target_present(test_result)

                    if present is ...:
                        LOG.debug(
                            f"No test result for «{test_result_target_id}», skipping"
                        )
                        continue

                    # Most of the time we expect to see existing targets so a
                    # select-first approach makes the most sense to avoid useless
                    # updates.
                    target = find_or_create_target(
                        db,
                        identifier=test_result_target_id,
                        control=target_control(test_result["controlStatus"]))

                    # The unique identifier for each result.  If chip is
                    # applicable, then it's included to differentiate the same
                    # sample being run on multiple chips (uncommon, but it
                    # happens).
                    if chip:
                        identifier = f"NWGC/{received_sample_id}/{target.identifier}/{chip}"
                    else:
                        identifier = f"NWGC/{received_sample_id}/{target.identifier}"

                    # Most of the time we expect to see new samples and new
                    # presence_absence tests, so an insert-first approach makes more sense.
                    # Presence-absence tests we see more than once are presumed to be
                    # corrections.
                    upsert_presence_absence(db,
                                            identifier=identifier,
                                            sample_id=sample.id,
                                            target_id=target.id,
                                            present=present,
                                            details=presence_absence_details(
                                                test_result,
                                                group.received_date, chip,
                                                extraction_date, assay_name,
                                                assay_date, assay_type))

            mark_processed(db, group.id)

            LOG.info(f"Finished processing presence_absence group {group.id}")
Example #15
def etl_manifest(*, db: DatabaseSession):
    LOG.debug(f"Starting the manifest ETL routine, revision {REVISION}")

    # XXX TODO: Stop hardcoding valid identifier sets.  Instead, accept them as
    # an option or config (and validate option choices against what's actually
    # in the database).  We won't want to validate using click.option(),
    # because that would necessitate a database connection simply to run
    # bin/id3c at all.
    #   -trs, 13 May 2019
    expected_identifier_sets = {
        "samples": {"samples"},
        "collections": {
            "collections-environmental",
            "collections-fluathome.org",
            "collections-household-intervention",
            "collections-household-intervention-asymptomatic",
            "collections-household-observation",
            "collections-household-observation-asymptomatic",
            "collections-kiosks",
            "collections-kiosks-asymptomatic",
            "collections-seattleflu.org",
            "collections-swab&send",
            "collections-swab&send-asymptomatic",
            "collections-self-test",
            "collections-scan",
            "collections-scan-kiosks",
            "collections-haarvi",
            "samples-haarvi",
            "collections-validation",
            "collections-uw-home",
            "collections-uw-observed",
            "collections-household-general",
            "collections-childcare",
            "collections-school-testing-home",
            "collections-school-testing-observed",
            "collections-apple-respiratory",
            "collections-apple-respiratory-serial",
            "collections-adult-family-home-outbreak",
            "collections-workplace-outbreak",
        },
        "rdt": {"collections-fluathome.org"}
    }

    # Fetch and iterate over samples that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same samples.
    LOG.debug("Fetching unprocessed manifest records")

    manifest = db.cursor("manifest")
    manifest.execute(
        """
        select manifest_id as id, document
          from receiving.manifest
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{
            "etl": ETL_NAME,
            "revision": REVISION
        }]), ))

    for manifest_record in manifest:
        with db.savepoint(f"manifest record {manifest_record.id}"):
            LOG.info(f"Processing record {manifest_record.id}")

            # When updating an existing row, update the identifiers
            # only if the record has both the 'sample' and
            # 'collection' keys.
            should_update_identifiers = "sample" in manifest_record.document \
                and "collection" in manifest_record.document

            # Sample collection date
            # Don't pop this entry off the document. For backwards
            # compatibility reasons, keep it in the document so that 'date'
            # also gets written to the 'details' column in warehouse.sample.
            collected_date = manifest_record.document.get("date", None)

            # Attempt to find barcodes and their related identifiers
            sample_barcode = manifest_record.document.pop("sample", None)
            sample_identifier = find_identifier(
                db, sample_barcode) if sample_barcode else None
            collection_barcode = manifest_record.document.pop(
                "collection", None)
            collection_identifier = find_identifier(
                db, collection_barcode) if collection_barcode else None

            # Skip a record if it has no associated barcodes
            if not sample_barcode and not collection_barcode:
                LOG.warning(
                    f"Skipping record «{manifest_record.id}» because it has neither a sample "
                    "barcode nor a collection barcode")
                mark_skipped(db, manifest_record.id)
                continue

            # Skip a record if it has a sample barcode but the barcode doesn't match an identifier
            if sample_barcode and not sample_identifier:
                LOG.warning(
                    f"Skipping sample with unknown sample barcode «{sample_barcode}»"
                )
                mark_skipped(db, manifest_record.id)
                continue

            # Skip a record if it has a collection barcode but the barcode doesn't match an identifier
            if collection_barcode and not collection_identifier:
                LOG.warning(
                    f"Skipping sample with unknown collection barcode «{collection_barcode}»"
                )
                mark_skipped(db, manifest_record.id)
                continue

            # Skip a record if the collection identifier is from an unexpected set
            if collection_identifier and collection_identifier.set_name not in expected_identifier_sets[
                    "collections"]:
                LOG.warning(
                    f"Skipping sample because collection identifier found in "
                    f"set «{collection_identifier.set_name}», not "
                    f"{expected_identifier_sets['collections']}")
                mark_skipped(db, manifest_record.id)
                continue

            # Validate the sample identifier, asserting if it's from an unexpected set
            if sample_identifier:
                if (manifest_record.document.get("sample_type")
                        and manifest_record.document["sample_type"] == "rdt"):
                    assert sample_identifier.set_name in expected_identifier_sets["rdt"], \
                        (f"Sample identifier found in set «{sample_identifier.set_name}»," +
                        f"not {expected_identifier_sets['rdt']}")
                else:
                    assert sample_identifier.set_name in expected_identifier_sets["samples"], \
                        (f"Sample identifier found in set «{sample_identifier.set_name}», " +
                        f"not {expected_identifier_sets['samples']}")

            # Upsert sample cooperatively with enrollments ETL routine
            #
            # The details document was intentionally modified by two pop()s
            # earlier to remove barcodes that were looked up.
            # The rationale is that we want just one clear place in the
            # warehouse for each piece of information.
            sample, status = upsert_sample(
                db,
                update_identifiers=should_update_identifiers,
                identifier=sample_identifier.uuid
                if sample_identifier else None,
                collection_identifier=collection_identifier.uuid
                if collection_identifier else None,
                collection_date=collected_date,
                additional_details=manifest_record.document)

            mark_loaded(db,
                        manifest_record.id,
                        status=status,
                        sample_id=sample.id)

            LOG.info(
                f"Finished processing manifest record {manifest_record.id}")
Example #16
def etl_consensus_genome(*, db: DatabaseSession):
    LOG.debug(
        f"Starting the consensus genome ETL routine, revision {REVISION}")

    # Fetch and iterate over consensus genome records that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same consensus genome records.
    LOG.debug("Fetching unprocessed consensus genome records")

    consensus_genome = db.cursor("consensus genome")
    consensus_genome.execute(
        """
        select consensus_genome_id as id, document, received
          from receiving.consensus_genome
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{
            "revision": REVISION
        }]), ))

    for record in consensus_genome:
        with db.savepoint(f"consensus genome record {record.id}"):
            LOG.info(f"Processing consensus genome record {record.id}")

            # Verify sample identifier is in the database
            sample = find_sample(db,
                                 record.document["sample_identifier"],
                                 for_update=False)

            assert sample, f"No sample found with identifier «{record.document['sample_identifier']}»"

            # Most of the time we expect to see existing sequence read sets,
            # but we also want to update the details log. However, the
            # only unique constraint on the sequence_read_set table is
            # defined within a trigger function, as Postgres does not allow
            # unique constraints on array columns. Therefore, perform a
            # find-or-create followed by an update to the details column to
            # avoid conflict.
            sequence_read_set = find_or_create_sequence_read_set(
                db, record.document, sample)

            status = record.document.get("status")

            # Find the matching organism within the warehouse for the
            # reference organism
            organism_name = get_lineage(db, record.document)
            organism = find_organism(db, organism_name)

            assert organism, f"No organism found with name «{organism_name}»"

            # Only upsert genome and genomic sequences if the assembly job
            # was marked as complete.
            if status == 'complete':
                # Most of the time we expect to see new sequences, so an
                # insert-first approach makes the most sense to avoid useless
                # queries.
                genome = upsert_genome(db,
                                       sequence_read_set=sequence_read_set,
                                       organism=organism,
                                       document=record.document)

                for masked_consensus in record.document['masked_consensus']:
                    genomic_sequence = upsert_genomic_sequence(
                        db, genome=genome, masked_consensus=masked_consensus)

            update_sequence_read_set_details(db, sequence_read_set.id,
                                             organism, status)
            mark_processed(db, record.id, {"status": "processed"})

            LOG.info(
                f"Finished processing consensus genome record {record.id}")
Example #17
def notify(*, action: str):
    LOG.debug(f"Starting the reportable conditions notification routine, revision {REVISION}")

    db = DatabaseSession()

    def webhook(suffix):
        return os.environ.get("SLACK_WEBHOOK_ALERTS_TEST") \
            or os.environ[f"SLACK_WEBHOOK_REPORTING_{suffix}"]

    slack_webhooks = {
        "ncov-reporting": webhook("HCOV19"),
    }

    projects = [
        {
            "collection_sets": {"collections-childcare"},
            "slack_channel_name": "ncov-reporting-childcare",
            "slack_webhook": webhook("HCOV19_CHILDCARE"),
        },

        {
            "collection_sets": {"collections-apple-respiratory"},
            "slack_channel_name": "ncov-reporting-apple",
            "slack_webhook": webhook("HCOV19_APPLE"),
        },

        {
            "collection_sets": {"collections-school-testing-home",
              "collections-school-testing-observed",
              "collections-radxup-yakima-schools-home",
              "collections-radxup-yakima-schools-observed"},
            "slack_channel_name": "ncov-reporting-schools",
            "slack_webhook": webhook("HCOV19_SCHOOLS"),
        },

        {
            "collection_sets": {"collections-adult-family-home-outbreak"},
            "slack_channel_name": "ncov-reporting-afh",
            "slack_webhook": webhook("HCOV19_AFH"),
        },

        {
            "collection_sets": {"collections-workplace-outbreak"},
            "slack_channel_name": "ncov-reporting-workplace",
            "slack_webhook": webhook("HCOV19_WORKPLACE"),
        }
    ]

    # Fetch and iterate over reportable condition records that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same reportable condition records.
    LOG.debug("Fetching unprocessed reportable conditions records")

    reportable_conditions = db.cursor("reportable_conditions")
    reportable_conditions.execute("""
        select reportable_condition_v1.*, presence_absence_id as id
            from shipping.reportable_condition_v1
            join warehouse.presence_absence using (presence_absence_id)
        where details @> %s is not true
        order by id
            for update of presence_absence;
        """, (Json({"reporting_log":[{ "revision": REVISION }]}),))

    processed_without_error = None

    try:
        for record in reportable_conditions:
            with db.savepoint(f"reportable condition presence_absence_id {record.id}"):
                LOG.info(f"Processing reportable condition, presence_absence_id «{record.id}»")

                if not record.site:
                    LOG.info(f"No site found for presence_absence_id «{record.id}». " +
                        "Inferring site from manifest data.")

                responses = {'ncov-reporting': send_slack_post_request(record, slack_webhooks['ncov-reporting'])}

                # Also send study-specific results to their respective channels
                for project in projects:
                    if (record.collection_set_name in project['collection_sets']):
                        responses[project['slack_channel_name']] = send_slack_post_request(
                            record, project['slack_webhook'])

                # Check all POSTs to Slack were successful to mark as processed
                # This does mean that if one fails but others succeed, there
                # will be duplicate POSTs to the already succeeded channels.
                # The chance of this happening is pretty small, but we can
                # revisit this if it becomes a common problem
                #   -Jover, 21 October 2020
                if all(response.status_code == 200 for response in responses.values()):
                    mark_processed(db, record.id, {"status": "sent Slack notification"})
                    LOG.info(f"Finished processing presence_absence_id «{record.id}»")

                else:
                    for channel, response in responses.items():
                        if response.status_code != 200:
                            LOG.error(("Error: A Slack notification could not " \
                            f"be sent to the channel «{channel}» for "
                            f"presence_absence_id «{record.id}».\n" \
                            f"Slack API returned status code {response.status_code}: "\
                            f"{response.text}"))

    except Exception as error:
        processed_without_error = False

        LOG.error(f"Aborting with error")
        raise error from None

    else:
        processed_without_error = True

    finally:
        if action == "prompt":
            ask_to_commit = \
                "Commit all changes?" if processed_without_error else \
                "Commit successfully processed reportable condition records up to this point?"

            commit = click.confirm(ask_to_commit)
        else:
            commit = action == "commit"

        if commit:
            LOG.info(
                "Committing all changes" if processed_without_error else \
                "Committing successfully processed reportable condition records up to this point")
            db.commit()

        else:
            LOG.info("Rolling back all changes; the database will not be modified")
            db.rollback()
Example #18
        def decorated(*args, db: DatabaseSession, log_output: bool, **kwargs):
            LOG.debug(
                f"Starting the REDCap DET ETL routine {name}, revision {revision}"
            )

            redcap_det = db.cursor(f"redcap-det {name}")
            redcap_det.execute(
                """
                select redcap_det_id as id, document
                  from receiving.redcap_det
                 where not processing_log @> %s
                   and document::jsonb @> %s
                 order by id
                   for update
                """, (Json([etl_id]), Json(det_contains)))

            with pickled_cache(CACHE_FILE) as cache:
                for det in redcap_det:
                    with db.savepoint(f"redcap_det {det.id}"):
                        LOG.info(f"Processing REDCap DET {det.id}")

                        instrument = det.document['instrument']

                        # Only pull REDCap record if
                        # `include_incomplete` flag was not included and
                        # the current instrument is complete
                        if not include_incomplete and not is_complete(
                                instrument, det.document):
                            LOG.debug(
                                f"Skipping incomplete or unverified REDCap DET {det.id}"
                            )
                            mark_skipped(db, det.id, etl_id)
                            continue

                        redcap_record = get_redcap_record_from_det(
                            det.document, raw_coded_values)

                        if not redcap_record:
                            LOG.debug(
                                f"REDCap record is missing or invalid.  Skipping REDCap DET {det.id}"
                            )
                            mark_skipped(db, det.id, etl_id)
                            continue

                        # Only process REDCap record if all required instruments are complete
                        incomplete_instruments = {
                            instrument
                            for instrument in required_instruments
                            if not is_complete(instrument, redcap_record)
                        }

                        if incomplete_instruments:
                            LOG.debug(f"The following required instruments «{incomplete_instruments}» are not yet marked complete. " + \
                                      f"Skipping REDCap DET {det.id}")
                            mark_skipped(db, det.id, etl_id)
                            continue

                        bundle = routine(db=db,
                                         cache=cache,
                                         det=det,
                                         redcap_record=redcap_record)

                        if not bundle:
                            mark_skipped(db, det.id, etl_id)
                            continue

                        if log_output:
                            print(as_json(bundle))

                        insert_fhir_bundle(db, bundle)
                        mark_loaded(db, det.id, etl_id, bundle['id'])
Example #19
def etl_enrollments(*, db: DatabaseSession):
    LOG.debug(f"Starting the enrollment ETL routine, revision {REVISION}")

    # Fetch and iterate over enrollments that aren't processed
    #
    # Use a server-side cursor by providing a name.  This ensures we limit how
    # much data we fetch at once, to limit local process size.  Each enrollment
    # document is ~10 KB and the default fetch size (cursor.itersize) is 2,000,
    # thus we'll get ~20 MB on each fetch of 2,000 enrollments.
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same enrollments.
    LOG.debug("Fetching unprocessed enrollments")

    enrollments = db.cursor("enrollments")
    enrollments.execute(
        """
        select enrollment_id as id, document
          from receiving.enrollment
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{
            "etl": ETL_NAME,
            "revision": REVISION
        }]), ))

    for enrollment in enrollments:
        with db.savepoint(f"enrollment {enrollment.id}"):
            LOG.info(f"Processing enrollment {enrollment.id}")

            # Out of an abundance of caution, fail when the schema version
            # of the enrollment document changes.  This ensures manual
            # intervention happens on document structure changes.  After
            # seeing how versions are handled over time, this restriction
            # may be toned down a bit.
            known_versions = {"1.1.0", "1.0.0"}

            assert enrollment.document["schemaVersion"] in known_versions, \
                f"Document schema version {enrollment.document['schemaVersion']} is not in {known_versions}"

            # Most of the time we expect to see existing sites so a
            # select-first approach makes the most sense to avoid useless
            # updates.
            site = find_or_create_site(
                db,
                identifier=enrollment.document["site"]["name"],
                details=site_details(enrollment.document["site"]))

            # Most of the time we expect to see new individuals and new
            # encounters, so an insert-first approach makes more sense.
            # Encounters we see more than once are presumed to be
            # corrections.
            individual = upsert_individual(
                db,
                identifier=enrollment.document["participant"],
                sex=assigned_sex(enrollment.document))

            encounter = upsert_encounter(
                db,
                identifier=enrollment.document["id"],
                encountered=enrollment.document["startTimestamp"],
                individual_id=individual.id,
                site_id=site.id,
                age=age(enrollment.document),
                details=encounter_details(enrollment.document))

            process_samples(db, encounter.id, enrollment.document)
            process_locations(db, encounter.id, enrollment.document)

            mark_processed(db, enrollment.id)

            LOG.info(f"Finished processing enrollment {enrollment.id}")
Example #20
def etl_clinical(*, db: DatabaseSession):
    LOG.debug(f"Starting the clinical ETL routine, revision {REVISION}")

    # Fetch and iterate over clinical records that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same clinical records.
    LOG.debug("Fetching unprocessed clinical records")

    clinical = db.cursor("clinical")
    clinical.execute("""
        select clinical_id as id, document
          from receiving.clinical
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{ "revision": REVISION }]),))

    for record in clinical:
        with db.savepoint(f"clinical record {record.id}"):
            LOG.info(f"Processing clinical record {record.id}")

            # Check validity of barcode
            received_sample_identifier = sample_identifier(db,
                record.document["barcode"])

            # Skip row if no matching identifier found
            if received_sample_identifier is None:
                LOG.info("Skipping due to unknown barcode " + \
                          f"{record.document['barcode']}")
                mark_skipped(db, record.id)
                continue

            # Check sample exists in database
            sample = find_sample(db,
                identifier = received_sample_identifier)

            # Skip row if sample does not exist
            if sample is None:
                LOG.info("Skipping due to missing sample with identifier " + \
                            f"{received_sample_identifier}")
                mark_skipped(db, record.id)
                continue

            # Most of the time we expect to see existing sites so a
            # select-first approach makes the most sense to avoid useless
            # updates.
            site = find_or_create_site(db,
                identifier = site_identifier(record.document["site"]),
                details    = {"type": "retrospective"})


            # Most of the time we expect to see new individuals and new
            # encounters, so an insert-first approach makes more sense.
            # Encounters we see more than once are presumed to be
            # corrections.
            individual = upsert_individual(db,
                identifier  = record.document["individual"],
                sex         = sex(record.document["AssignedSex"]))

            encounter = upsert_encounter(db,
                identifier      = record.document["identifier"],
                encountered     = record.document["encountered"],
                individual_id   = individual.id,
                site_id         = site.id,
                age             = age(record.document),
                details         = encounter_details(record.document))

            sample = update_sample(db,
                sample = sample,
                encounter_id = encounter.id)

            # Link encounter to a Census tract, if we have it
            tract_identifier = record.document.get("census_tract")

            if tract_identifier:
                # Special-case float-like identifiers in earlier data
                tract_identifier = re.sub(r'\.0$', '', str(tract_identifier))
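                # e.g. a spreadsheet-mangled identifier like "53033005600.0"
                # becomes the plain GEOID string "53033005600".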

                tract = find_location(db, "tract", tract_identifier)
                assert tract, f"Tract «{tract_identifier}» is unknown"

                upsert_encounter_location(db,
                    encounter_id = encounter.id,
                    relation = "residence",
                    location_id = tract.id)

            mark_processed(db, record.id, {"status": "processed"})

            LOG.info(f"Finished processing clinical record {record.id}")
Example #21
        def decorated(*args,
                      db: DatabaseSession,
                      log_output: bool,
                      det_limit: int = None,
                      redcap_api_batch_size: int,
                      geocoding_cache: str = None,
                      **kwargs):
            LOG.debug(
                f"Starting the REDCap DET ETL routine {name}, revision {revision}"
            )

            project = Project(redcap_url, project_id)

            if det_limit:
                LOG.debug(f"Processing up to {det_limit:,} pending DETs")
                limit = sql.Literal(det_limit)
            else:
                LOG.debug(f"Processing all pending DETs")
                limit = sql.SQL("all")
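
            # Illustrative note (not in the original routine): psycopg2's
            # sql composition below interpolates either a literal row count
            # ("limit 1000") or the keyword form ("limit all") into the
            # query text, keeping the limit out of the parameter tuple.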

            redcap_det = db.cursor(f"redcap-det {name}")
            redcap_det.execute(
                sql.SQL("""
                select redcap_det_id as id, document
                  from receiving.redcap_det
                 where not processing_log @> %s
                   and document::jsonb @> %s
                 order by id
                 limit {}
                   for update
                """).format(limit), (Json([etl_id]), Json(det_contains)))

            # First, loop over the DETs to determine how to process each one.
            # Uses `first_complete_dets` to keep track of which DET to
            # use to process a unique REDCap record.
            # Uses `all_dets` to keep track of the status for each DET record
            # so that they can be processed in order of `redcap_det_id` later.
            #   --Jover, 21 May 2020
            first_complete_dets: Dict[str, Any] = {}
            all_dets: List[Dict[str, str]] = []
            for det in redcap_det:
                instrument = det.document['instrument']
                record_id = det.document['record']
                # Assume we are loading all DETs
                # Status will be updated to "skip" if DET does not need to be processed
                det_record = {"id": det.id, "status": "load"}

                # Only pull the REDCap record if the `include_incomplete`
                # flag was given or the current instrument is marked
                # complete.
                if not include_incomplete and not is_complete(
                        instrument, det.document):
                    det_record.update({
                        "status": "skip",
                        "reason": "incomplete/unverified DET"
                    })

                # Check if this record already has an earlier DET.
                # Skip the later DET in favor of the first one to preserve
                # the first-in-first-out semantics of our receiving tables.
                elif first_complete_dets.get(record_id):
                    det_record.update({
                        "status": "skip",
                        "reason": "repeat REDCap record"
                    })

                else:
                    first_complete_dets[record_id] = det
                    det_record["record_id"] = record_id

                all_dets.append(det_record)

            if not first_complete_dets:
                LOG.info("No new complete DETs found.")
            else:
                # Batch request records from REDCap
                LOG.info(f"Fetching REDCap project {project_id}")
                record_ids = list(first_complete_dets.keys())

                LOG.info(
                    f"Fetching {len(record_ids):,} REDCap records from project {project.id}"
                )

                # Convert list of REDCap records to a dict so that
                # records can be looked up by record id.
                # Records with repeating instruments or longitudinal
                # events will have multiple entries in the list.
                redcap_records: DefaultDict[str,
                                            List[dict]] = defaultdict(list)

                batches = list(chunked(record_ids, redcap_api_batch_size))
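                # For example, assuming `chunked` is more_itertools.chunked:
                # chunked(["1", "2", "3", "4", "5"], 2) yields the batches
                # ["1", "2"], ["3", "4"], ["5"].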

                for i, batch in enumerate(batches, 1):
                    LOG.info(
                        f"Fetching REDCap record batch {i:,}/{len(batches):,} of size {len(batch):,}"
                    )

                    for record in project.records(ids=batch,
                                                  raw=raw_coded_values):
                        redcap_records[record.id].append(record)

            # Process all DETs in order of redcap_det_id
            with pickled_cache(geocoding_cache) as cache:
                for det in all_dets:
                    with db.savepoint(f"redcap_det {det['id']}"):
                        LOG.info(f"Processing REDCap DET {det['id']}")

                        if det["status"] == "skip":
                            LOG.debug(
                                f"Skipping REDCap DET {det['id']} due to {det['reason']}"
                            )
                            mark_skipped(db, det["id"], etl_id, det["reason"])
                            continue

                        received_det = first_complete_dets.pop(
                            det["record_id"])
                        redcap_record_instances = redcap_records.get(
                            received_det.document["record"])

                        if not redcap_record_instances:
                            LOG.debug(
                                f"REDCap record is missing or invalid.  Skipping REDCap DET {received_det.id}"
                            )
                            mark_skipped(db, received_det.id, etl_id,
                                         "invalid REDCap record")
                            continue

                        bundle = routine(
                            db=db,
                            cache=cache,
                            det=received_det,
                            redcap_record_instances=redcap_record_instances)

                        if not bundle:
                            LOG.debug(
                                f"Skipping REDCap DET {received_det.id} due to insufficient data in REDCap record."
                            )
                            mark_skipped(db, received_det.id, etl_id,
                                         "insufficient data in record")
                            continue

                        if log_output:
                            print(as_json(bundle))

                        insert_fhir_bundle(db, bundle)
                        mark_loaded(db, received_det.id, etl_id, bundle['id'])
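
The mark_skipped and mark_loaded calls above follow the same bookkeeping pattern as the other ETL routines: append a status entry containing etl_id to the DET's processing_log, so that the `not processing_log @> %s` filter excludes the row on later runs. Below is a rough sketch of the skip case; the status and reason field names are assumptions rather than the real id3c helper.

def mark_skipped(db, det_id: int, etl_id: dict, reason: str = None) -> None:
    LOG.debug(f"Marking REDCap DET record {det_id} as skipped")

    # The appended entry must contain *etl_id* (the {"etl": ..., "revision": ...}
    # mapping) so the selection query's containment check matches it next run.
    entry = {**etl_id, "status": "skipped", "skip_reason": reason}

    with db.cursor() as cursor:
        cursor.execute("""
            update receiving.redcap_det
               set processing_log = processing_log || %s
             where redcap_det_id = %s
            """, (Json([entry]), det_id))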