Exemplo n.º 1
0
def main(argv=None):
    """Build the Sepsis ICU encounter feature matrix from extracted data files.

    Reads ``patientEpisodes.tab`` plus the flowsheet, lab result, IV fluid and
    clinical item extracts from the current working directory, adds the
    feature columns derived from each of them, and writes the combined matrix
    to ``featureMatrix.SepsisICU.encounters.tab.gz``. The elapsed time is
    reported on stderr.

    Args:
        argv: Unused; accepted so the function can act as a script entry point.
    """
    timer = time.time()

    extractor = DataExtractor()

    # Clinical item extracts, in the order their columns are appended.
    # Each name maps to an input file "<name>.tab" and is also the feature label.
    clinicalItemNames = (
        "IVAntibiotic",
        "BloodCulture",
        "RespViralPanel",
        "AnyICULifeSupport",
        "AnyDNR",
        "AnyVasoactive",
        "AnyCRRT",
        "AnyVentilator",
        "ComfortCare",
        "PalliativeConsult",
        "Death",
        "Birth",
        "Male",
        "Female",
        "RaceWhiteNonHispanicLatino",
        "RaceAsian",
        "RaceWhiteHispanicLatino",
        "RaceHispanicLatino",
        "RaceUnknown",
        "RaceOther",
        "RaceBlack",
        "RacePacificIslander",
        "RaceNativeAmerican",
    )

    # Output file
    featureMatrixFile = stdOpen("featureMatrix.SepsisICU.encounters.tab.gz",
                                "w")
    try:
        # Final columns to output to patient matrix
        colNames = list()

        patientEpisodes = extractor.parsePatientEpisodeFile(
            stdOpen("patientEpisodes.tab"), colNames)

        log.info("Expand to index dates based start and end dates")
        # But only want one entry per patient
        patientByIndexTimeById = extractor.generateDateRangeIndexTimes(
            "edAdmitTime",
            "dischargeTime",
            patientEpisodes,
            colNames,
            timeInterval=None)

        log.info("Populate flowsheet summary statistics")
        flowsheetByNameByPatientId = extractor.parseFlowsheetFile(
            stdOpen("Flowsheet.tab.gz"))
        extractor.addFlowsheetFeatures(patientByIndexTimeById,
                                       flowsheetByNameByPatientId,
                                       FLOWSHEET_NAMES,
                                       FLOWSHEET_PRE_TIME_DELTA,
                                       FLOWSHEET_POST_TIME_DELTA, colNames)

        log.info("Populate laboratory result summary statistics")
        labsByBaseNameByPatientId = extractor.parseLabResultsFile(
            stdOpen("LabResults.tab.gz"))
        extractor.addLabFeatures(patientByIndexTimeById,
                                 labsByBaseNameByPatientId, LAB_BASE_NAMES,
                                 LAB_PRE_TIME_DELTA, LAB_POST_TIME_DELTA,
                                 colNames)

        log.info("Populate IV Fluid accumulation")
        ivFluidsByPatientId = extractor.parseIVFluidFile(
            stdOpen("IsotonicIVFluids.tab.gz"))
        extractor.addIVFluidFeatures(patientByIndexTimeById,
                                     ivFluidsByPatientId,
                                     IVF_THRESHOLD_VOLUMES,
                                     IVF_CHECKPOINT_TIMES, colNames)

        log.info(
            "Record presence of items in terms of relative time to each item from index time"
        )
        for itemName in clinicalItemNames:
            extractor.addClinicalItemFeatures(
                extractor.parseClinicalItemFile(stdOpen(itemName + ".tab")),
                patientByIndexTimeById, colNames, itemName)

        log.info(
            "Systemically Scan for Charlson comorbidities and Treatment Team categories"
        )
        # Sorted so the resulting column order is reproducible; os.listdir
        # returns entries in arbitrary order.
        for filename in sorted(os.listdir(".")):
            for prefix in (CHARLSON_PREFIX, TREATMENT_TEAM_PREFIX):
                if not filename.startswith(prefix):
                    continue
                # Feature label is the file name without its ".tab" extension
                featureName = filename
                if filename.endswith(".tab"):
                    featureName = filename[:-len(".tab")]
                extractor.addClinicalItemFeatures(
                    extractor.parseClinicalItemFile(stdOpen(filename)),
                    patientByIndexTimeById, colNames, featureName)

        log.info("Output feature matrix file with row per patient day")
        formatter = TextResultsFormatter(featureMatrixFile)
        formatter.formatTuple(colNames)  # Header row, written once
        # .values() rather than the Python 2 only .iteritems(); the patient ID
        # key is not needed for output.
        for patientByIndexTime in patientByIndexTimeById.values():
            patientResults = list(patientByIndexTime.values())
            formatter.formatResultDicts(patientResults, colNames)
    finally:
        # Close explicitly so buffered (compressed) output is fully written.
        featureMatrixFile.close()

    timer = time.time() - timer
    print("%.3f seconds to complete" % timer, file=sys.stderr)
Exemplo n.º 2
0
def main(argv):
    """Query the source data for the Sepsis ICU cohort and write it to tab files.

    Queries the patient episodes first, then uses the resulting patient IDs to
    extract flowsheet values, lab results, isotonic IV fluids and a series of
    clinical items, each written to its own ``.tab`` / ``.tab.gz`` file in the
    current working directory. The elapsed time is reported on stderr.

    Args:
        argv: Unused; accepted so the function can act as a script entry point.
    """
    timer = time.time()

    extractor = DataExtractor()

    patientEpisodes = queryPatientEpisodes(stdOpen("patientEpisodes.tab", "w"),
                                           extractor)
    # Maybe just do this first time, then comment out and load from file with line below
    #patientEpisodes = extractor.parsePatientEpisodeFile(stdOpen("patientEpisodes.tab"), list()); # Read from prior file if main query already done to avoid expensive query
    patientIds = set(columnFromModelList(patientEpisodes, "patient_id"))

    extractor.queryFlowsheet(FLOWSHEET_NAMES, patientIds,
                             stdOpen("Flowsheet.tab.gz", "w"))
    extractor.queryLabResults(LAB_BASE_NAMES, patientIds,
                              stdOpen("LabResults.tab.gz", "w"))

    # Look for specific IV fluid medication subset
    ivfMedIds = set()
    for row in extractor.loadMapData("Medication.IVFluids"):
        if row["group"] == "isotonic":
            ivfMedIds.add(row["medication_id"])
    extractor.queryIVFluids(ivfMedIds, patientIds,
                            stdOpen("IsotonicIVFluids.tab.gz", "w"))

    # Clinical items selected by pre-loaded item ID lists
    extractor.queryClinicalItems(loadIVAntibioticItemIds(extractor),
                                 patientIds, stdOpen("IVAntibiotic.tab", "w"))
    extractor.queryClinicalItems(loadBloodCultureItemIds(extractor),
                                 patientIds, stdOpen("BloodCulture.tab", "w"))
    extractor.queryClinicalItems(loadRespiratoryViralPanelItemIds(extractor),
                                 patientIds, stdOpen("RespViralPanel.tab",
                                                     "w"))

    # Clinical items selected by exact name; output file is "<name>.tab"
    for itemName in ("AnyICULifeSupport", "AnyDNR", "AnyVasoactive",
                     "AnyCRRT", "AnyVentilator"):
        extractor.queryClinicalItemsByName((itemName, ), patientIds,
                                           stdOpen(itemName + ".tab", "w"))

    # Clinical items selected by pattern match ("~*" operator) on description
    extractor.queryClinicalItemsByName(("^Comfort Care", ),
                                       patientIds,
                                       stdOpen("ComfortCare.tab", "w"),
                                       col="description",
                                       operator="~*")
    extractor.queryClinicalItemsByName(('consult.*palliative', ),
                                       patientIds,
                                       stdOpen("PalliativeConsult.tab", "w"),
                                       col="description",
                                       operator="~*")

    # Demographic items selected by exact name; output file is "<name>.tab"
    for itemName in ("Death", "Birth", "Male", "Female",
                     "RaceWhiteNonHispanicLatino", "RaceAsian",
                     "RaceWhiteHispanicLatino", "RaceHispanicLatino",
                     "RaceUnknown", "RaceOther", "RaceBlack",
                     "RacePacificIslander", "RaceNativeAmerican"):
        extractor.queryClinicalItemsByName((itemName, ), patientIds,
                                           stdOpen(itemName + ".tab", "w"))

    # Extract out lists of ICD9 prefixes per disease category
    icd9prefixesByDisease = dict()
    for row in extractor.loadMapData("CharlsonComorbidity-ICD9CM"):
        (disease, icd9prefix) = (row["charlson"], row["icd9cm"])
        icd9prefixesByDisease.setdefault(disease,
                                         list()).append("^ICD9." + icd9prefix)
    for disease, icd9prefixes in icd9prefixesByDisease.items():
        # Strip off punctuation so the name is usable in a file name.
        # A character filter is used instead of str.translate(None, chars):
        # that two-argument form only exists on Python 2 and raises
        # TypeError on Python 3.
        diseaseLabel = "".join(ch for ch in disease if ch not in " ()-/")
        extractor.queryClinicalItemsByName(icd9prefixes,
                                           patientIds,
                                           stdOpen(
                                               "Charlson." + diseaseLabel +
                                               ".tab", "w"),
                                           operator="~*")

    # Extract out lists of treatment team names per care category
    teamNameByCategory = dict()
    for row in extractor.loadMapData("TreatmentTeamGroups"):
        (category, teamName) = (row["team_category"], row["treatment_team"])
        teamNameByCategory.setdefault(category, list()).append(teamName)
    for category, teamNames in teamNameByCategory.items():
        extractor.queryClinicalItemsByName(teamNames,
                                           patientIds,
                                           stdOpen("TT." + category + ".tab",
                                                   "w"),
                                           col="description")

    timer = time.time() - timer
    print("%.3f seconds to complete" % timer, file=sys.stderr)
Exemplo n.º 3
0
    def test_performance(self):
        """
        Test performance against DataExtractor.

        Builds the same set of features twice and times each run: once through
        FeatureMatrixFactory, and once through per-patient DataExtractor
        queries written to a temporary output file. The test asserts that the
        DataExtractor run takes longer than the FeatureMatrixFactory run, then
        removes the files both runs produced.
        """
        clinicalItemNames = ["PerfItem300", "PerfItem400", "PerfItem500"]
        labBaseNames = ["Foo", "Bar", "Baz", "Qux"]
        extractorFileName = "extractor.feature_matrix.tab.gz"

        # Look for lab data 90 days before each episode, but never after.
        preTimeDelta = datetime.timedelta(-90)
        postTimeDelta = datetime.timedelta(0)

        def buildPatientEpisodeQuery():
            """Build the SQL query listing patient episodes (shared by both runs)."""
            query = SQLQuery()
            query.addSelect("CAST(pat_id AS bigint)")
            query.addSelect("sop.order_proc_id AS order_proc_id")
            query.addSelect("proc_code")
            query.addSelect("order_time")
            query.addSelect(
                "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
            )
            query.addFrom("stride_order_proc AS sop")
            query.addFrom("stride_order_results AS sor")
            query.addWhere("sop.order_proc_id = sor.order_proc_id")
            query.addWhereIn("proc_code", labBaseNames)
            query.addGroupBy(
                "pat_id, sop.order_proc_id, proc_code, order_time")
            query.addOrderBy(
                "pat_id, sop.order_proc_id, proc_code, order_time")
            return query

        # Initialize DB cursor.
        cursor = self.connection.cursor()

        # ---- FeatureMatrixFactory run ----
        factoryStart = time.time()
        self.factory = FeatureMatrixFactory()

        patientEpisodeQuery = buildPatientEpisodeQuery()
        cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

        # Set and process patientEpisodeInput.
        self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time")
        self.factory.processPatientEpisodeInput()

        # Add clinical item, lab result and flowsheet features one at a time.
        for itemName in clinicalItemNames:
            self.factory.addClinicalItemFeatures([itemName])
        for labName in labBaseNames:
            self.factory.addLabResultFeatures([labName], False, preTimeDelta,
                                              postTimeDelta)
        self.factory.addFlowsheetFeatures(["Perflow"], preTimeDelta,
                                          postTimeDelta)

        # Build matrix.
        self.factory.buildFeatureMatrix()
        factoryStop = time.time()

        # ---- DataExtractor run ----
        extractorStart = time.time()
        extractor = DataExtractor()
        extractor.dataCache = dict()

        baseColNames = [
            "patient_id", "order_proc_id", "proc_code", "order_time",
            "result_normal_count"
        ]

        def addPatientFeatures(patientId, episodeByIndexTime):
            """Add all features for one patient; return the feature column names."""
            featureColNames = list()
            for itemName in clinicalItemNames:
                eventTimes = extractor.parseClinicalItemData_singlePatient(
                    modelListFromTable(
                        extractor.queryClinicalItemsByName((itemName, ),
                                                           [patientId])))
                # Label each feature with its own item name (the 400/500
                # items were previously all labelled "PerfItem300").
                featureColNames.extend(
                    extractor.addClinicalItemFeatures_singlePatient(
                        eventTimes, episodeByIndexTime, itemName,
                        daysBins=[]))
            for labName in labBaseNames:
                labResultTable = extractor.queryLabResults([labName],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(
                    modelListFromTable(labResultTable))
                featureColNames.extend(
                    extractor.addLabFeatures_singlePatient(
                        episodeByIndexTime, labsByBaseName, [labName],
                        preTimeDelta, postTimeDelta))
            # NOTE: Flowsheet ("Perflow") features are not extracted on this
            # side of the comparison; that step was disabled in the original
            # test as well.
            return featureColNames

        def writePatient(patientId, episodeByIndexTime, colNames):
            """Populate and write one patient's episodes; return the column names in use."""
            featureColNames = addPatientFeatures(patientId, episodeByIndexTime)
            if colNames is None:
                # First patient written: fix the column list and print the
                # header row. A fresh list is built so later patients cannot
                # append duplicate columns to it.
                colNames = baseColNames + featureColNames
                formatter.formatTuple(colNames)
            # Print out patient (episode) data (one row per episode)
            formatter.formatResultDicts(list(episodeByIndexTime.values()),
                                        colNames)
            outFile.flush()
            return colNames

        # Initialize output file.
        outFile = open(extractorFileName, "w")
        try:
            formatter = TextResultsFormatter(outFile)

            patientEpisodeQuery = buildPatientEpisodeQuery()
            cursor.execute(str(patientEpisodeQuery),
                           patientEpisodeQuery.params)

            # Process patient episodes.
            patientEpisodes = list()
            for row in iter(cursor.fetchone, None):
                (pat_id, order_proc_id, proc_code, order_time,
                 normal_results) = row
                patientEpisodes.append(
                    RowItemModel({
                        "patient_id": pat_id,
                        "order_proc_id": order_proc_id,
                        "proc_code": proc_code,
                        "order_time": order_time,
                        "result_normal_count": normal_results
                    }))

            # Aggregate consecutive episodes per patient (the query is ordered
            # by pat_id) and write each patient out once the next one starts.
            lastPatientId = None
            colNames = None
            patientEpisodeByIndexTime = None
            for patientEpisode in patientEpisodes:
                patientId = patientEpisode["patient_id"]

                if lastPatientId is None or lastPatientId != patientId:
                    if lastPatientId is not None:
                        # The accumulated episodes belong to the previous
                        # patient, so query that patient's data, not the new
                        # patient's.
                        colNames = writePatient(lastPatientId,
                                                patientEpisodeByIndexTime,
                                                colNames)
                    # Prepare to aggregate patient episode record per patient
                    patientEpisodeByIndexTime = dict()

                patientEpisodeByIndexTime[
                    patientEpisode["order_time"]] = patientEpisode
                lastPatientId = patientId

            # Last patient; skipped entirely when the query returned no rows.
            if patientEpisodeByIndexTime:
                colNames = writePatient(lastPatientId,
                                        patientEpisodeByIndexTime, colNames)
        finally:
            # Close file.
            outFile.close()

        # Stop timer.
        extractorStop = time.time()

        factoryTime = factoryStop - factoryStart
        extractorTime = extractorStop - extractorStart

        # Clean up feature matrix files before asserting, so a failed
        # comparison does not leave them behind.
        for matrixFileName in (extractorFileName,
                               self.factory.getMatrixFileName()):
            try:
                os.remove(matrixFileName)
            except OSError:
                pass

        # Compare results.
        self.assertTrue(extractorTime > factoryTime)
Exemplo n.º 4
0
def main(argv=None):
    """Build the ICU DNR feature matrix from previously extracted data files.

    Reads ``patients.tab`` plus the flowsheet, lab result and clinical item
    extracts from the current working directory, adds the feature columns
    derived from each of them, and writes the combined matrix to
    ``featureMatrix.ICUDNR.tab.gz``. The elapsed time is reported on stderr.

    Args:
        argv: Unused; accepted so the function can act as a script entry point.
    """
    timer = time.time()

    extractor = DataExtractor()

    # Clinical item extracts, in the order their columns are appended.
    # Each name maps to an input file "<name>.tab" and is also the feature label.
    clinicalItemNames = (
        "AnyICULifeSupport",
        "AnyDNR",
        "AnyVasoactive",
        "AnyCRRT",
        "AnyVentilator",
        "ComfortCare",
        "PalliativeConsult",
        "Death",
        "Birth",
        "Male",
        "Female",
        "RaceWhiteNonHispanicLatino",
        "RaceAsian",
        "RaceWhiteHispanicLatino",
        "RaceHispanicLatino",
        "RaceUnknown",
        "RaceOther",
        "RaceBlack",
        "RacePacificIslander",
        "RaceNativeAmerican",
    )

    # Final columns to output to patient matrix
    colNames = list()

    patientById = extractor.parsePatientFile(stdOpen("patients.tab"), colNames)

    log.info("Expand to index dates based start and end dates")
    patientByIndexTimeById = extractor.generateDateRangeIndexTimes(
        "firstLifeSupportDate", "lastContiguousDate",
        list(patientById.values()), colNames)

    log.info("Populate flowsheet summary statistics")
    flowsheetByNameByPatientId = extractor.parseFlowsheetFile(
        stdOpen("Flowsheet.tab.gz"))
    extractor.addFlowsheetFeatures(patientByIndexTimeById,
                                   flowsheetByNameByPatientId, FLOWSHEET_NAMES,
                                   FLOWSHEET_PRE_TIME_DELTA,
                                   FLOWSHEET_POST_TIME_DELTA, colNames)

    log.info("Populate laboratory result summary statistics")
    labsByBaseNameByPatientId = extractor.parseLabResultsFile(
        stdOpen("LabResults.tab.gz"))
    extractor.addLabFeatures(patientByIndexTimeById, labsByBaseNameByPatientId,
                             LAB_BASE_NAMES, LAB_PRE_TIME_DELTA,
                             LAB_POST_TIME_DELTA, colNames)

    log.info(
        "Record presence of items in terms of relative time to each item from index time"
    )
    for itemName in clinicalItemNames:
        extractor.addClinicalItemFeatures(
            extractor.parseClinicalItemFile(stdOpen(itemName + ".tab")),
            patientByIndexTimeById, colNames, itemName)

    log.info(
        "Systemically Scan for Charlson comorbidities and Treatment Team categories"
    )
    # Sorted so the resulting column order is reproducible; os.listdir returns
    # entries in arbitrary order.
    for filename in sorted(os.listdir(".")):
        for prefix in (CHARLSON_PREFIX, TREATMENT_TEAM_PREFIX):
            if not filename.startswith(prefix):
                continue
            # Feature label is the file name without its ".tab" extension
            featureName = filename
            if filename.endswith(".tab"):
                featureName = filename[:-len(".tab")]
            extractor.addClinicalItemFeatures(
                extractor.parseClinicalItemFile(stdOpen(filename)),
                patientByIndexTimeById, colNames, featureName)

    log.info("Output feature matrix file with row per patient day")
    featureMatrixFile = stdOpen("featureMatrix.ICUDNR.tab.gz", "w")
    try:
        formatter = TextResultsFormatter(featureMatrixFile)
        # Write the header once up front. Requesting addHeaderRow=True on
        # every per-patient call would presumably repeat the header before
        # each patient's rows.
        formatter.formatTuple(colNames)
        for patientByIndexTime in patientByIndexTimeById.values():
            patientResults = list(patientByIndexTime.values())
            formatter.formatResultDicts(patientResults, colNames)
    finally:
        # Close explicitly so buffered (compressed) output is fully written.
        featureMatrixFile.close()

    timer = time.time() - timer
    print("%.3f seconds to complete" % timer, file=sys.stderr)
Exemplo n.º 5
0
def main(argv):
    """Run the extraction queries for the patients listed in ``patients.tab``.

    Loads the patient collection from ``patients.tab`` and then asks a
    ``DataExtractor`` to query flowsheets, lab results, named clinical items,
    Charlson comorbidity ICD9 groups and treatment team categories. Each
    query is handed its own output file, opened for writing via ``stdOpen``
    with a relative file name. The elapsed wall-clock time is printed to
    stderr when all queries are done.

    Args:
        argv: Command-line arguments. Accepted to match the script
            entry-point signature; not read by this function.
    """
    timer = time.time()

    extractor = DataExtractor()

    #patientById = queryPatients(stdOpen("patients.tab","w"));
    # Read from prior file if main query already done to avoid expensive query
    patientById = extractor.parsePatientFile(stdOpen("patients.tab"), list())

    # NOTE(review): none of the output handles opened below are closed here;
    # presumably the extractor or interpreter exit flushes them - confirm.
    extractor.queryFlowsheet(FLOWSHEET_NAMES, patientById, stdOpen("Flowsheet.tab.gz", "w"))

    extractor.queryLabResults(LAB_BASE_NAMES, patientById, stdOpen("LabResults.tab.gz", "w"))

    # Items looked up by exact name; the output file is "<itemName>.tab".
    interventionItemNames = (
        "AnyICULifeSupport",
        "AnyDNR",
        "AnyVasoactive",
        "AnyCRRT",
        "AnyVentilator",
    )
    for itemName in interventionItemNames:
        extractor.queryClinicalItemsByName((itemName,), patientById, stdOpen(itemName + ".tab", "w"))

    # Items matched against the description column with the "~*" operator
    # (presumably a case-insensitive regex match in the backing database - confirm).
    extractor.queryClinicalItemsByName(("^Comfort Care",), patientById, stdOpen("ComfortCare.tab", "w"), col="description", operator="~*")
    extractor.queryClinicalItemsByName(('consult.*palliative',), patientById, stdOpen("PalliativeConsult.tab", "w"), col="description", operator="~*")

    # Remaining exact-name items, again one "<itemName>.tab" file each.
    demographicItemNames = (
        "Death",
        "Birth",
        "Male",
        "Female",
        "RaceWhiteNonHispanicLatino",
        "RaceAsian",
        "RaceWhiteHispanicLatino",
        "RaceHispanicLatino",
        "RaceUnknown",
        "RaceOther",
        "RaceBlack",
        "RacePacificIslander",
        "RaceNativeAmerican",
    )
    for itemName in demographicItemNames:
        extractor.queryClinicalItemsByName((itemName,), patientById, stdOpen(itemName + ".tab", "w"))

    # Extract out lists of ICD9 prefixes per disease category.
    # Each line of the map text is "<disease>\t<icd9prefix>".
    icd9prefixesByDisease = dict()
    for line in StringIO(CHARLSON_ICD9_MAP_TEXT):
        (disease, icd9prefix) = line.strip().split("\t")
        icd9prefixesByDisease.setdefault(disease, list()).append("^ICD9." + icd9prefix)

    # Characters removed from disease names before they are used in file names.
    strippedChars = " ()-/"
    for disease, icd9prefixes in icd9prefixesByDisease.items():
        # Strip off punctuation. The former translate(None, chars) call is the
        # Python 2 byte-string signature and raises TypeError on Python 3 str;
        # filtering characters explicitly works on either version.
        diseaseToken = "".join(ch for ch in disease if ch not in strippedChars)
        extractor.queryClinicalItemsByName(icd9prefixes, patientById, stdOpen("Charlson." + diseaseToken + ".tab", "w"), operator="~*")

    # Extract out lists of treatment team names per care category.
    # Each line of the map text is "<category>\t<teamName>".
    teamNameByCategory = dict()
    for line in StringIO(TREATMENT_TEAM_MAP_TEXT):
        (category, teamName) = line.strip().split("\t")
        teamNameByCategory.setdefault(category, list()).append(teamName)
    for category, teamNames in teamNameByCategory.items():
        extractor.queryClinicalItemsByName(teamNames, patientById, stdOpen("TT." + category + ".tab", "w"), col="description")

    timer = time.time() - timer
    print("%.3f seconds to complete" % timer, file=sys.stderr)