Example #1
    def queryItems(self, options, outputFile):
        """Query for all clinical item records that fulfill the options criteria
        and then send the results as tab-delimited output to the outputFile.
        """
        pauseSeconds = float(options.pauseSeconds)

        query = SQLQuery()
        query.addSelect("count(order_med_id_coded) as nOrders")
        query.addSelect("om.med_route, om.medication_id, om.med_description")
        query.addFrom("starr_datalake2018.order_med as om")
        if options.descriptionPrefix:
            query.addWhereOp("om.med_description", "like",
                             options.descriptionPrefix + "%%")
            # Add wildcard to enable prefix search
        if options.medRoutes:
            query.addWhereIn("om.med_route", options.medRoutes.split(","))
        query.addGroupBy("om.medication_id, om.med_description, om.med_route")
        query.addOrderBy("nOrders desc, om.med_description")

        formatter = TextResultsFormatter(outputFile)

        prog = ProgressDots()
        for row in DBUtil.execute(query,
                                  includeColumnNames=True,
                                  connFactory=self.connFactory):
            formatter.formatTuple(row)
            time.sleep(pauseSeconds)
            prog.update()
        prog.printStatus()
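
For context, a minimal sketch of how a query method like the one above might be driven. Only the option attribute names (pauseSeconds, descriptionPrefix, medRoutes) are taken from the method itself; the SimpleNamespace stand-in, the app instance, and the output path are hypothetical.

# Hypothetical driver for queryItems(); only the option attribute names are
# taken from the method above, everything else is illustrative.
from types import SimpleNamespace

options = SimpleNamespace(
    pauseSeconds="0.0",            # parsed with float() inside queryItems
    descriptionPrefix="FERROUS",   # LIKE prefix match on om.med_description
    medRoutes="Oral,Intravenous")  # comma-separated, split on ","
with open("medOrderCounts.tab", "w") as outputFile:
    app.queryItems(options, outputFile)  # assuming "app" is an instance of the class above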
Example #2
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog <inputFile> <outputFile>\n"+\
                    "   <inputFile>     Tab-delimited input file taken from schedule Excel file. Example data format as seen in test case examples. See support/extractExcelSheets.py for help on pulling out Excel sheets into tab-delimited data files.\n"+\
                    "   <outputFile>    File to output results to.  Designate '-' for stdout.";
        parser = OptionParser(usage=usageStr)
        parser.add_option("-i", "--providerIdFilename",  dest="providerIdFilename", help="Name of provider ID CSV file. If provided, then add column for prov_id based on resident first_name and last_name, match within first "+DEFAULT_INDEX_PREFIX_LENGTH+" characters, or generate ID value if no match found");
        parser.add_option("-y", "--baseYear",  dest="baseYear", help="Year expect dates to start in.");
        parser.add_option("-t", "--changeTime",  dest="changeTime", default=CHANGE_TIME, help="Hour of day that count as delimiter between rotations. Likely should NOT be midnight = 0, because night shifts span midnight. Default to 7 = 7am.");
        (options, args) = parser.parse_args(argv[1:])

        if len(args) >= 2 and options.baseYear:
            log.info("Starting: "+str.join(" ", argv))
            timer = time.time();

            baseYear = int(options.baseYear);

            if options.providerIdFilename is not None:
                providerReader = csv.DictReader(open(options.providerIdFilename));
                self.loadProviderModels( providerReader );

            inFile = stdOpen(args[0]);
            scheduleItems = self.parseScheduleItems(inFile, baseYear);

            outFile = stdOpen(args[1],"w");
            formatter = TextResultsFormatter(outFile);
            formatter.formatResultDicts(scheduleItems);
        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer;
        log.info("%.3f seconds to complete",timer);
Example #3
    def queryItems(self, options, outputFile):
        """Query for all clinical item records that fulfill the options criteria
        and then send the results as tab-delimited output to the outputFile.
        """
        pauseSeconds = float(options.pauseSeconds)

        query = SQLQuery()
        query.addSelect(
            "cic.description, ci.clinical_item_id, ci.name, ci.description")
        query.addFrom("clinical_item_category as cic")
        query.addFrom("clinical_item as ci")
        query.addWhere(
            "cic.clinical_item_category_id = ci.clinical_item_category_id")
        if options.itemPrefix:
            query.addWhereOp("ci.description", "like",
                             options.itemPrefix + "%%")
            # Add wildcard to enable prefix search
        if options.categoryNames:
            query.addWhereIn("cic.description",
                             options.categoryNames.split(","))
        query.addOrderBy(
            "cic.description, ci.name, ci.description, ci.clinical_item_id")

        formatter = TextResultsFormatter(outputFile)

        prog = ProgressDots()
        for row in DBUtil.execute(query,
                                  includeColumnNames=True,
                                  connFactory=self.connFactory):
            formatter.formatTuple(row)
            time.sleep(pauseSeconds)
            prog.update()
        prog.printStatus()
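
SQLQuery instances like the one above are rendered to a SQL string plus a bound-parameter list when executed (Example #18 below calls cursor.execute(str(query), query.params) explicitly). A sketch of the approximate SQL this query would render, assuming %s-style parameter slots:

# Approximate SQL rendered by the query built above (an assumption about the
# SQLQuery string form, with %s parameter slots for the bound values):
#
#   SELECT cic.description, ci.clinical_item_id, ci.name, ci.description
#   FROM clinical_item_category AS cic, clinical_item AS ci
#   WHERE cic.clinical_item_category_id = ci.clinical_item_category_id
#     AND ci.description LIKE %s
#     AND cic.description IN (%s, %s)
#   ORDER BY cic.description, ci.name, ci.description, ci.clinical_item_id
print(str(query))    # rendered SQL string
print(query.params)  # bound parameter values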
Example #4
File: formatData.py Project: xxxx3/CDSS
def main(argv=None):
    timer = time.time()

    # Final columns to output to patient matrix
    colNames = list()

    patientById = parsePatientFile(stdOpen("patients.tab"), colNames)

    labsByBaseNameByPatientId = parseLabResultsFile(stdOpen("labs.tab"))
    addLabFeatures(labsByBaseNameByPatientId, patientById, colNames,
                   INDEX_ITEM_BASE_NAME, LAB_BASE_NAMES, LAB_PRE_TIME,
                   LAB_POST_TIME)

    log.info(
        "Record presence of items in terms of relative time to each item from index time"
    )
    itemTimesByPatientId = parseClinicalItemFile(stdOpen("admitDx.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ICD9.208-AdmitDx")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("problemListDx.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ICD9.208-ProblemListDx")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("feSO4Rx.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ironSO4")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("allEnteralIron.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ironEnteral")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("ironIV.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ironIV")

    itemTimesByPatientId = parseClinicalItemFile(
        stdOpen("outpatientIronRx.tab"),
        patientIdCol="pat_id",
        timeCol="ordering_date")
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ironOutpatient")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("transfusions.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "RBCTransfusion")

    patientResults = filterPatients(patientById)

    log.info("Output feature matrix file with row per patient")
    featureMatrixFile = stdOpen("featureMatrix.lab14to1day.tab", "w")
    formatter = TextResultsFormatter(featureMatrixFile)
    formatter.formatResultDicts(patientResults, colNames, addHeaderRow=True)

    timer = time.time() - timer
    print("%.3f seconds to complete" % timer, file=sys.stderr)
Example #5
    def action_default(self):
        # Read checkboxes by presence or absence of field
        # Checkboxes are not passed if unchecked, so take an extra step to ensure an uncheck is persisted
        self.requestData["incCols"] = ""
        incCols = False
        if "incCols" in self.mForm:
            self.requestData["incCols"] = self.mForm["incCols"].value
            incCols = True

        # Point to the specified database
        connFactory = self.connectionFactory()

        timer = time.time()
        # Just execute a normal query, possibly with a result set
        results = DBUtil.execute(self.mForm["input"].value,
                                 includeColumnNames=incCols,
                                 connFactory=connFactory)
        if type(results) == list:  # Result set, format as table
            formatter = TextResultsFormatter(StringIO())
            formatter.formatResultSet(results)
            self.requestData["resultsText"] = formatter.getOutFile().getvalue()

            headerRowFormat = None
            if incCols: headerRowFormat = "th"

            formatter = HtmlResultsFormatter(StringIO(), headerRowFormat)
            formatter.formatResultSet(results)
            self.requestData["resultsHtml"] = formatter.getOutFile().getvalue()

            self.requestData["resultsInfo"] = "(%d rows) " % len(results)
        else:
            self.requestData["resultsText"] = "%d rows affected (or other return code)" % results
            self.requestData["resultsInfo"] = ""  # Ensure key exists before the timing info is appended below
        timer = time.time() - timer
        self.requestData["resultsInfo"] += "(%1.3f seconds)" % timer
Example #6
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <trainFile> <testFile> [<outputFile>]\n"+\
                    "   <trainFile> Tab-delimited file, queryItemIdsJSON expected to be parseable into lists of query items as well as an outcome.X column\n"+\
                    "   <testFile> Same structure as trainFile, but with test cases to assess prediction scoring\n"+\
                    "   <outputFile>    Tab-delimited that can be used for ROC analysis with columns for outcome and predicted score\n"+\
                    ""
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-o",
            "--outcomeItemId",
            dest="outcomeItemId",
            help="Outcome item IDs to assess get prediction scores for")

        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) >= 2:  # trainFile and testFile required; outputFile optional
            trainFile = stdOpen(args[0])
            testFile = stdOpen(args[1])

            outcomeId = int(options.outcomeItemId)

            # Run the actual analysis
            (featureMatrix, outcomeMatrix, queryIds,
             rowModels) = self.fileToMatrixes(trainFile, outcomeId)
            model = self.train(featureMatrix, outcomeMatrix)
            analysisResults = self.predict(testFile, model, queryIds,
                                           outcomeId)

            # Format the results for output
            outputFilename = None
            if len(args) > 2:
                outputFilename = args[2]
            outputFile = stdOpen(outputFilename, "w")

            # Print comment line with arguments to allow for deconstruction later as well as extra results
            print(COMMENT_TAG, json.dumps({"argv": argv}), file=outputFile)

            colNames = self.analysisHeaders(outcomeId)
            analysisResults.insert(0, RowItemModel(colNames, colNames))
            # Insert a mock record to get a header / label row

            formatter = TextResultsFormatter(outputFile)
            formatter.formatResultDicts(analysisResults, colNames)

        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example #7
def queryPatients(outputFile):
    log.info("Select patients with any result for a ferritin test")
    patientById = dict()
    query = \
        """select distinct pat_id
        from 
          stride_order_results as sor,
          stride_order_proc as sop
        where 
          sor.order_proc_id = sop.order_proc_id and
          base_name = 'ferritin'
        """
    results = DBUtil.execute(query)
    for (patientId, ) in results:
        patientId = int(patientId)
        patientById[patientId] = RowItemModel({"patient_id": patientId})

    log.info("Patients with admit or diet orders for surgery")
    # Not perfectly accurate for isolating surgical patients
    for patient in patientById.values():
        patient["surgery"] = 0  # Default to 0 / false
    query = \
        """select distinct patient_id
        from patient_item
        where clinical_item_id in (3614,4177,4220)
        """
    results = DBUtil.execute(query)
    for (patientId, ) in results:
        if patientId in patientById:
            patientById[patientId]["surgery"] = 1

    log.info("Patients with an order for dialysis")
    # (Does not differentiate acute vs. chronic.  Includes peritoneal)
    for patient in patientById.values():
        patient["dialysis"] = 0  # Default to 0 / false
    query = \
        """select distinct patient_id
        from patient_item
        where clinical_item_id in (1815,3783,4322)
        """
    results = DBUtil.execute(query)
    for (patientId, ) in results:
        if patientId in patientById:
            patientById[patientId]["dialysis"] = 1

    # Drop results as tab-delimited text output
    formatter = TextResultsFormatter(outputFile)
    formatter.formatResultDicts(patientById.values(), addHeaderRow=True)

    return patientById
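
The default-to-0 / set-to-1 pattern repeats for each cohort flag above; a hypothetical helper that factors it out (not part of the original module) could look like:

# Hypothetical helper factoring out the repeated flag-assignment pattern above.
def flagPatients(patientById, flagName, query):
    """Default flagName to 0 for every patient, then set it to 1 for query hits."""
    for patient in patientById.values():
        patient[flagName] = 0
    for (patientId,) in DBUtil.execute(query):
        if patientId in patientById:
            patientById[patientId][flagName] = 1

# flagPatients(patientById, "surgery", surgeryQuery)
# flagPatients(patientById, "dialysis", dialysisQuery)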
Example #8
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <inputFile1> <inputFile2> ... <inputFileN>\n"+\
                    "   <inputFileX>    Tab-delimited file of data.  Initial comment lines will be scanned for list of argv parameters to add as data columns.\n"+\
                    "                   If only a single input is given, interpret this as an index file which lists the names of the other files to concatenate (e.g., obtained with dir * /b or ls).\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-o",
            "--outputFile",
            dest="outputFile",
            help=
            "Tab-delimited file matching concatenated contents of input files.  Specify \"-\" to send to stdout."
        )
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) > 0:
            inputFiles = list()
            if len(args) > 1:
                for inputFilename in args:
                    inputFiles.append(stdOpen(inputFilename))
            else:  # len(args) == 1: a single index file listing the other files to concatenate, rather than all files on the command line
                indexFile = stdOpen(args[0])
                for line in indexFile:
                    inputFilename = line.strip()
                    inputFiles.append(stdOpen(inputFilename))

            # Format the results for output
            outputFile = stdOpen(options.outputFile, "w")

            # Print comment line with arguments to allow for deconstruction later as well as extra results
            summaryData = {
                "argv": argv
            }
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

            # Tab-delimited output formatting
            formatter = TextResultsFormatter(outputFile)

            # Begin the file parsing so can at least get the total list of column headers
            rowGenerator = self(inputFiles)
            firstRow = next(rowGenerator)

            # Insert a mock record to get a header / label row
            colNames = self.resultHeaders()
            formatter.formatTuple(colNames)

            # Stream the concatenated data rows to the output to avoid storing all in memory
            formatter.formatResultDict(firstRow, colNames)
            for outputDict in rowGenerator:
                formatter.formatResultDict(outputDict, colNames)

        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example #9
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <inputFile> [<outputFile>]\n"+\
                    "   <inputFile> Tab-delimited file, first two labeled columns expected to represent labeled outcome (0 and non-zero) and score/probability of outcome\n"+\
                    "   <outputFile>    Tab-delimited table specifying score histogram bin widths, total cases, predicted events, actual events\n"+\
                    "                       Leave blank or specify \"-\" to send to stdout.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option("-b", "--bins",  dest="nBins",  default=10,    help="Number of bins to separate scores into, defaults to deciles (10)");
        parser.add_option("-f", "--figure",  dest="figure",  help="If set, will also try to auto-generate an example figure and store to a file here");

        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: "+str.join(" ", argv))
        timer = time.time();
        if len(args) >= 1:  # inputFile required; outputFile optional
            inputFilename = args[0];
            inputFile = stdOpen(inputFilename);
            
            # Run the actual analysis
            analysisResults = self(inputFile, int(options.nBins));
            
            (hlStat, degFreedom, hlP) = self.calculateHosmerLemeshow(analysisResults);
            
            # Generate plot figure
            if options.figure is not None:
                self.generateFigure(analysisResults, options.figure);

            # Format the results for output
            outputFilename = None;
            if len(args) > 1:
                outputFilename = args[1];
            outputFile = stdOpen(outputFilename,"w");
            
            # Print comment line with arguments to allow for deconstruction later as well as extra results
            print(COMMENT_TAG, json.dumps({"argv":argv, "P-HosmerLemeshow": hlP}), file=outputFile);

            colNames = self.analysisHeaders();
            analysisResults.insert(0, RowItemModel(colNames,colNames) );    # Insert a mock record to get a header / label row
            
            formatter = TextResultsFormatter( outputFile );
            formatter.formatResultDicts( analysisResults, colNames );

        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer;
        log.info("%.3f seconds to complete",timer);
Example #10
def queryOutpatientIronRx(outputFile, patientById):
    log.info("Query outpatient Iron prescriptions")

    # Medication IDs derived by mapping through Iron as an ingredient
    poIronIngredientMedicationIds = (3065, 3066, 3067, 3071, 3074, 3077, 3986,
                                     7292, 11050, 25006, 26797, 34528, 39676,
                                     78552, 79674, 83568, 84170, 85151, 96118,
                                     112120, 112395, 113213, 126035, 198511,
                                     200455, 201994, 201995, 203679, 207059,
                                     207404, 208037, 208072)
    # Medication IDs directly from prescriptions, formulations that did not map through RxNorm
    poIronDirectMedicationIds = (111354, 540526, 205010, 121171, 111320, 82791,
                                 93962, 201795, 206722, 201068, 116045, 208725,
                                 111341, 206637, 112400, 210256, 77529, 20844,
                                 83798, 205523, 112428, 125474, 111343)
    allEnteralIronMedicationIds = set(poIronIngredientMedicationIds).union(
        poIronDirectMedicationIds)

    formatter = TextResultsFormatter(outputFile)

    colNames = ["pat_id", "ordering_date"]

    query = SQLQuery()
    for col in colNames:
        query.addSelect(col)
    query.addFrom("stride_order_med")
    query.addWhereIn("medication_id", allEnteralIronMedicationIds)
    query.addWhereIn("pat_id", patientById.viewkeys())
    query.addOrderBy("pat_id")
    query.addOrderBy("ordering_date")

    DBUtil.execute(query, includeColumnNames=True, formatter=formatter)
Example #11
def queryLabResults(outputFile, patientById):
    log.info("Query out lab results, takes a while")
    labBaseNames = \
    (   'ferritin','fe','trfrn','trfsat','ystfrr',
        'wbc','hgb','hct','mcv','rdw','plt',
        'retic','reticab','ldh','hapto','tbil','ibil','dbil',
        'cr','esr','crp'
    )

    formatter = TextResultsFormatter(outputFile)

    # Query is fast when filtering by lab result type, limited to X records.
    # Filtering by patient ID slows it down substantially unless the table has been
    # preloaded (e.g., by first running a count on the stride_order_results table)?
    colNames = [
        "pat_id", "base_name", "common_name", "ord_num_value",
        "reference_unit", "result_flag", "sor.result_time"
    ]

    query = SQLQuery()
    for col in colNames:
        query.addSelect(col)
    query.addFrom("stride_order_results as sor, stride_order_proc as sop")
    query.addWhere("sor.order_proc_id = sop.order_proc_id")
    query.addWhereIn("base_name", labBaseNames)
    query.addWhereIn("pat_id", patientById.viewkeys())
    query.addOrderBy("pat_id")
    query.addOrderBy("sor.result_time")

    DBUtil.execute(query, includeColumnNames=True, formatter=formatter)
Example #12
def main_formatMergedTTests(argv):
    ifs = stdOpen(BASE_RESULT_DIR+CONCATENATE_FILENAME);
    ofs = stdOpen(BASE_RESULT_DIR+FILTERED_FILENAME, "w");

    summaryData = {"argv": argv};
    print(COMMENT_TAG, json.dumps(summaryData), file=ofs);

    outputCols = ["SortType","TopicCount","VerifyTime","Group1.precision.mean","Group1.recall.mean","Group1.normalprecision.mean","Group1.weightrecall.mean","Group1.roc_auc.mean","ttest_rel.precision","ttest_rel.recall","ttest_rel.weightrecall","ttest_rel.roc_auc","Group1.numqueryitems.mean","Group1.numverifyitems.mean","Group1.numrecommendeditems.mean","Group1.tp.mean"];
    formatter = TextResultsFormatter(ofs);
    formatter.formatTuple(outputCols);  # Output header row

    reader = TabDictReader(ifs);
    for row in reader:
        row["SortType"] = row["Group1._s"];

        # Extract out numerical data from filename text parameters
        row["TopicCount"] = None;
        if row["Group1._m"] != 'None':
            # Expecting model name strings of the form: "models/topicModel.first24hourItems.2013.1234567890.filter.bow.gz.64Topic.model"
            topicChunk = row["Group1._m"].split(".")[-2];   # Expect second to last period-delimited chunk to contain topic count
            topicChunk = topicChunk[:topicChunk.find("Topic")]; # Remove trailing Topic text
            row["TopicCount"] = int(topicChunk);

        # Expecting result file name argument of the form: "results/byOrderSets/01minutes/filteredResults.tab.gz"
        timeChunk = row["args[0]"].split("/")[-2];
        timeChunk = timeChunk[:timeChunk.find("minutes")];
        row["VerifyTime"] = int(timeChunk);

        formatter.formatResultDict(row, outputCols);

    ifs.close();
    ofs.close();
Example #13
    def test_numRecsByOrderSet(self):
        # Designate number of recommendations indirectly via linked order set id 

        DBUtil.execute("update clinical_item set default_recommend = 0 where clinical_item_id = -8");   # Disable default recommend on one item to shift results

        colNames = ["patient_id", "TP", "FN", "FP",  "recall", "precision", "F1-score", "weightRecall","weightPrecision", "ROC-AUC"];
        expectedResults = [ RowItemModel([-11111, 2, 0, 3, 1.0, 0.4, 0.571,  1.0, 0.3178, 0.4167], colNames ) ];

        # Do through fabricated prepared file intermediary
        sys.stdout = StringIO();    
        argv = ["PreparePatientItems.py","-q","2","-v","3",'0,-11111',"-"];
        self.preparer.main(argv);
        preparedDataFile = StringIO(sys.stdout.getvalue());
        
        # Artificially add a key order set ID for the fabricated data
        modFile = StringIO();
        formatter = TextResultsFormatter(modFile);
        dataCols = None;
        for i, dataRow in enumerate(TabDictReader(preparedDataFile)):
            dataRow["order_set_id"] = TEST_ORDERSET_ID;
            if i <= 0:
                dataCols = list(dataRow.keys());
                formatter.formatTuple(dataCols);    # Insert a mock record to get a header / label row
            formatter.formatResultDict(dataRow, dataCols);
        preparedDataFile = StringIO(modFile.getvalue());

        sys.stdin = preparedDataFile;   # Read prepared data file from redirected stdin
        sys.stdout = StringIO();
        #argv = ["RecommendationClassificationAnalysis.py","-P","-r","5","-m","0","-R","ItemAssociationRecommender",'-',"-"];
        argv = ["RecommendationClassificationAnalysis.py","-P","--numRecsByOrderSet","-m","0","-R","ItemAssociationRecommender",'-',"-"];
        self.analyzer.main(argv);
        textOutput = StringIO(sys.stdout.getvalue());
        self.assertEqualStatResultsTextOutput(expectedResults, textOutput, colNames);
Example #14
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <inputFile> [<outputFile>]\n"+\
                    "   <inputFile>    Validation file in prepared result file format use generated LDA models to predict items and compare against verify sets similar to RecommendationClassficationAnalysis. \n"+\
                    "   <outputFile>   Validation result stat summaries.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option("-M", "--modelFile",  dest="modelFile", help="Name of the file to load an LDA or HDP model and topic word document counts from.");
        parser.add_option("-X", "--excludeCategoryIds",  dest="excludeCategoryIds", help="For recommendation, exclude / skip any items who fall under one of these comma-separated category Ids.");
        parser.add_option("-i", "--itemsPerCluster",  dest="itemsPerCluster", default=DEFAULT_TOPIC_ITEM_COUNT, help="Specify number of top topic items to consider when scoring recommendations.");
        parser.add_option("-m", "--minClusterWeight",  dest="minClusterWeight", default=DEFAULT_MIN_TOPIC_WEIGHT, help="When scoring recommendations, skip any topics with less than this relation weight (effectively scores as zero, but can avoid a lot of low yield calculations).");
        parser.add_option("-s", "--sortField",  dest="sortField", default=DEFAULT_SORT_FIELD, help="Score field to sort top recommendations by.  Default to posterior probabilty 'totelItemWeight', but can also select 'lift' = 'tfidf' = 'interest' for TF*IDF style score weighting.");
        parser.add_option("-r", "--numRecs",   dest="numRecs",  default=DEFAULT_RECOMMENDED_ITEM_COUNT, help="Number of orders / items to recommend for comparison against the verification set. Alternative set option numRecsByOrderSet to look for key order set usage and size.");
        parser.add_option("-O", "--numRecsByOrderSet",   dest="numRecsByOrderSet", action="store_true", help="If set, then look for an order_set_id column to find the key order set that triggered the evaluation time point to determine number of recommendations to consider.");
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: "+str.join(" ", argv))
        timer = time.time();
        if len(args) >= 1:
            query = AnalysisQuery();
            query.preparedPatientItemFile = stdOpen(args[0]);
            query.recommender = TopicModelRecommender(options.modelFile);
            query.baseRecQuery = RecommenderQuery();
            if options.excludeCategoryIds is not None:
                query.baseRecQuery.excludeCategoryIds = set();
                for categoryIdStr in options.executeCategoryIds.split(","):
                    query.baseRecQuery.excludeCategoryIds.add(int(categoryIdStr));
            else:   # Default exclusions if none specified
                query.baseRecQuery.excludeCategoryIds = query.recommender.defaultExcludedClinicalItemCategoryIds();
                query.baseRecQuery.excludeItemIds = query.recommender.defaultExcludedClinicalItemIds();
            query.baseRecQuery.itemsPerCluster = int(options.itemsPerCluster);
            query.baseRecQuery.minClusterWeight = float(options.minClusterWeight);

            query.baseRecQuery.sortField = options.sortField;
            query.numRecommendations = int(options.numRecs);
            query.numRecsByOrderSet = options.numRecsByOrderSet;

            # Run the actual analysis
            analysisResults = self(query);

            # Format the results for output
            outputFilename = None;
            if len(args) > 1:
                outputFilename = args[1];
            outputFile = stdOpen(outputFilename,"w");

            # Print comment line with analysis arguments to allow for deconstruction later
            summaryData = {"argv": argv};
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile);

            formatter = TextResultsFormatter( outputFile );
            colNames = self.resultHeaders(query);
            formatter.formatTuple( colNames );  # Insert a mock record to get a header / label row
            formatter.formatResultDicts( analysisResults, colNames );
        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer;
        log.info("%.3f seconds to complete",timer);
Example #15
    def outputSummaryRecords(self, summaryRecords, outputFile):
        # Field names with or without using data control links instead of just raw values
        headers = list(self.summaryHeaders)
        controlHeaders = list(self.summaryHeaders)
        for questionModule in self.questionModules:
            headers.append(questionModule.getName())
            controlHeaders.append(questionModule.getName() + ".link")

        print('''<br>
            <table class="dataTable" cellspacing=0 cellpadding=4 style="width: 100%">
            <tr><th class="subheading" colspan=100>Summary Table</th></tr>''',
              file=outputFile)

        textAreaRows = 50
        if not self.skipDetail:
            # HTML table form with links back to records
            formatter = HtmlResultsFormatter(
                outputFile,
                headerRowFormat='th class="labelCell"',
                align="center")
            formatter.formatTuple(headers)  # Header row
            for summaryRecord in summaryRecords:
                for questionModule in self.questionModules:
                    questionName = questionModule.getName()
                    linkFieldName = questionName + ".link"
                    summaryRecord[linkFieldName] = (
                        '<a href="javascript:setQuestionsByName(\'' +
                        questionName + '\', %(iRecord)s)">%(' + questionName +
                        ')s</a>') % summaryRecord
                formatter.formatResultDict(summaryRecord, controlHeaders)

            textAreaRows = 5  # If showing detail records, pay less attention to the raw text area

        # Raw result content for copy-paste to spreadsheet
        print(
            '''<tr><td class="labelCell" style="color: 808080" colspan=100>Raw Table (Select All and Copy-Paste to Spreadsheet)</td></tr>''',
            file=outputFile)
        print(
            '''<tr><td colspan=100><textarea style="width: 100%%;" disabled rows=%d>'''
            % textAreaRows,
            file=outputFile)
        formatter = TextResultsFormatter(outputFile)
        formatter.formatTuple(headers)
        for summaryRecord in summaryRecords:
            formatter.formatResultDict(summaryRecord, headers)
        print('''</textarea></td></tr>''', file=outputFile)

        print('''</table>''', file=outputFile)
        print("%d Records Processed" % len(summaryRecords), file=outputFile)
예제 #16
0
def queryClinicalItems(outputFile, clinicalItemIds, patientById):
    log.info("Query Clinical Items: %s" % str(clinicalItemIds))
    formatter = TextResultsFormatter(outputFile)

    colNames = ["patient_id", "item_date"]

    query = SQLQuery()
    for col in colNames:
        query.addSelect(col)
    query.addFrom("patient_item")
    query.addWhereIn("clinical_item_id", clinicalItemIds)
    query.addWhereIn("patient_id", patientById.viewkeys())
    query.addOrderBy("patient_id")
    query.addOrderBy("item_date")

    DBUtil.execute(query, includeColumnNames=True, formatter=formatter)
Example #17
def main_formatResults(argv):
    ifs = stdOpen(BASE_RESULT_DIR + FILTERED_FILENAME)
    ofs = stdOpen(BASE_RESULT_DIR + FORMATTED_FILENAME, "w")

    summaryData = {
        "argv": argv
    }
    print(COMMENT_TAG, json.dumps(summaryData), file=ofs)

    outputCols = [
        "SortType", "TopicCount", "TrainTime", "VerifyTime", "precision",
        "recall", "normalprecision", "weightrecall", "roc_auc"
    ]
    formatter = TextResultsFormatter(ofs)
    formatter.formatTuple(outputCols)  # Output header row

    reader = TabDictReader(ifs)
    for row in reader:
        row["SortType"] = row["_s"]

        # Extract out numerical data from filename text parameters
        row["TopicCount"] = None
        row["TrainTime"] = None
        if row["_m"] != 'None':
            # Expecting model name strings of the form: "models/topicModel.firstItems.q14400.v14400.2013.1234567890.filter.bow.gz.16Topic.model"
            chunks = row["_m"].split(".")
            topicChunk = chunks[-2]  # Expect the second-to-last period-delimited chunk to contain the topic count
            topicChunk = topicChunk[:topicChunk.find("Topic")]  # Remove trailing "Topic" text
            row["TopicCount"] = int(topicChunk)

            for chunk in chunks:
                if chunk[0] == "q" and chunk[-1].isdigit():  # This should be the query time in seconds
                    queryTimeSeconds = int(chunk[1:])
                    queryTimeMinutes = queryTimeSeconds // 60  # Integer minutes
                    row["TrainTime"] = queryTimeMinutes

        # Expecting training file name argument of the form: "sourceData/first24hourOrderSets.2013.q86400.v14400.-12345.tab.gz"
        row["VerifyTime"] = None
        for chunk in row["args_0_"].split("."):
            if chunk[0] == "v" and chunk[-1].isdigit(
            ):  # This should be the verify time in seconds
                verifyTimeSeconds = int(chunk[1:])
                verifyTimeMinutes = verifyTimeSeconds / 60
                row["VerifyTime"] = verifyTimeMinutes

        formatter.formatResultDict(row, outputCols)

    ifs.close()
    ofs.close()
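
The filename-chunk parsing above can be exercised standalone; a small sketch using the documented model-name form:

# Standalone sketch of the filename parsing above; "q"/"v" chunks carry
# query/verify times in seconds, the trailing "NTopic" chunk the topic count.
modelName = "models/topicModel.firstItems.q14400.v14400.2013.1234567890.filter.bow.gz.16Topic.model"
chunks = modelName.split(".")

topicChunk = chunks[-2]                                  # "16Topic"
topicCount = int(topicChunk[:topicChunk.find("Topic")])  # 16

trainTimeMinutes = None
for chunk in chunks:
    if chunk[0] == "q" and chunk[-1].isdigit():  # query time chunk
        trainTimeMinutes = int(chunk[1:]) // 60  # 14400 s -> 240 min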
Example #18
    def test_performance(self):
        """
        Test performance against DataExtractor.
        """
        # Initialize DB cursor.
        cursor = self.connection.cursor()

        # Initialize FeatureMatrixFactory.
        factoryStart = time.time()
        self.factory = FeatureMatrixFactory()

        # Build SQL query for list of patient episodes.
        patientEpisodeQuery = SQLQuery()
        patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
        patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
        patientEpisodeQuery.addSelect("proc_code")
        patientEpisodeQuery.addSelect("order_time")
        patientEpisodeQuery.addSelect(
            "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
        )
        patientEpisodeQuery.addFrom("stride_order_proc AS sop")
        patientEpisodeQuery.addFrom("stride_order_results AS sor")
        patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
        patientEpisodeQuery.addWhereIn("proc_code",
                                       ["Foo", "Bar", "Baz", "Qux"])
        patientEpisodeQuery.addGroupBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        patientEpisodeQuery.addOrderBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

        # Set and process patientEpisodeInput.
        self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time")
        self.factory.processPatientEpisodeInput()

        # Look for lab data 90 days before each episode, but never after.
        preTimeDelta = datetime.timedelta(-90)
        postTimeDelta = datetime.timedelta(0)

        # Add clinical item features.
        self.factory.addClinicalItemFeatures(["PerfItem300"])
        self.factory.addClinicalItemFeatures(["PerfItem400"])
        self.factory.addClinicalItemFeatures(["PerfItem500"])

        # Add lab result features.
        self.factory.addLabResultFeatures(["Foo"], False, preTimeDelta,
                                          postTimeDelta)
        self.factory.addLabResultFeatures(["Bar"], False, preTimeDelta,
                                          postTimeDelta)
        self.factory.addLabResultFeatures(["Baz"], False, preTimeDelta,
                                          postTimeDelta)
        self.factory.addLabResultFeatures(["Qux"], False, preTimeDelta,
                                          postTimeDelta)

        # Add flowsheet features.
        self.factory.addFlowsheetFeatures(["Perflow"], preTimeDelta,
                                          postTimeDelta)

        # Build matrix.
        self.factory.buildFeatureMatrix()

        # Stop timer.
        factoryStop = time.time()

        # Initialize DataExtractor.
        extractorStart = time.time()
        extractor = DataExtractor()
        extractor.dataCache = dict()

        # Initialize output file.
        outFile = open("extractor.feature_matrix.tab.gz", "w")
        formatter = TextResultsFormatter(outFile)

        # Build SQL query for list of patient episodes.
        patientEpisodeQuery = SQLQuery()
        patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
        patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
        patientEpisodeQuery.addSelect("proc_code")
        patientEpisodeQuery.addSelect("order_time")
        patientEpisodeQuery.addSelect(
            "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
        )
        patientEpisodeQuery.addFrom("stride_order_proc AS sop")
        patientEpisodeQuery.addFrom("stride_order_results AS sor")
        patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
        patientEpisodeQuery.addWhereIn("proc_code",
                                       ["Foo", "Bar", "Baz", "Qux"])
        patientEpisodeQuery.addGroupBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        patientEpisodeQuery.addOrderBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

        # Process patient episodes.
        patientEpisodes = list()
        row = cursor.fetchone()

        while row is not None:
            (pat_id, order_proc_id, proc_code, order_time,
             normal_results) = row
            patientEpisode = \
                RowItemModel \
                (
                    {
                        "patient_id": pat_id,
                        "order_proc_id": order_proc_id,
                        "proc_code": proc_code,
                        "order_time": order_time,
                        "result_normal_count": normal_results
                    }
                )
            patientEpisodes.append(patientEpisode)
            row = cursor.fetchone()

        # Initialize patient data.
        lastPatientId = None
        colNames = None
        patientEpisodeByIndexTime = None

        # Look for lab data 90 days before each episode, but never after.
        preTimeDelta = datetime.timedelta(-90)
        postTimeDelta = datetime.timedelta(0)

        # Populate patient data.
        tempColNames = \
            ["patient_id", "order_proc_id", "proc_code", "order_time",
                "result_normal_count"]
        for patientEpisode in patientEpisodes:
            patientId = patientEpisode["patient_id"]

            if lastPatientId is not None and lastPatientId != patientId:
                # New patient ID so start querying for patient specific data and
                # populating patient episode data.

                # Clinical Item (PerfItem300)
                eventTimes = extractor.parseClinicalItemData_singlePatient(\
                    modelListFromTable(extractor.queryClinicalItemsByName(\
                        ("PerfItem300",), [patientId])))
                tempColNames.extend(\
                    extractor.addClinicalItemFeatures_singlePatient(\
                    eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
                    daysBins=[]))

                # Clinical Item (PerfItem400)
                eventTimes = extractor.parseClinicalItemData_singlePatient(\
                    modelListFromTable(extractor.queryClinicalItemsByName(\
                        ("PerfItem400",), [patientId])))
                tempColNames.extend(\
                    extractor.addClinicalItemFeatures_singlePatient(\
                    eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
                    daysBins=[]))

                # Clinical Item (PerfItem500)
                eventTimes = extractor.parseClinicalItemData_singlePatient(\
                    modelListFromTable(extractor.queryClinicalItemsByName(\
                        ("PerfItem500",), [patientId])))
                tempColNames.extend(\
                    extractor.addClinicalItemFeatures_singlePatient(\
                    eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
                    daysBins=[]))

                # Lab Result (Foo)
                labResultTable = extractor.queryLabResults(["Foo"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Foo"], \
                    preTimeDelta, postTimeDelta))

                # Lab Result (Bar)
                labResultTable = extractor.queryLabResults(["Bar"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Bar"], \
                    preTimeDelta, postTimeDelta))

                # Lab Result (Baz)
                labResultTable = extractor.queryLabResults(["Baz"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Baz"], \
                    preTimeDelta, postTimeDelta))

                # Lab Result (Qux)
                labResultTable = extractor.queryLabResults(["Qux"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Qux"], \
                    preTimeDelta, postTimeDelta))

                # Flowsheet (Perflow)
                # tempFile = StringIO()
                # labResultTable = extractor.queryFlowsheet(["Perflow"], [patientId], tempFile)
                # flowsheetByNameByPatientId = extractor.parseFlowsheetFile(\
                #     StringIO(tempFile.getvalue()))
                # tempColNames.extend(extractor.addFlowsheetFeatures_singlePatient(\
                #     patientEpisodeByIndexTime, flowsheetByNameByPatientId[patientId], \
                #     ["Perflow"], preTimeDelta, postTimeDelta, tempColNames))

                if colNames is None:
                    # First row, print header row
                    colNames = tempColNames
                    formatter.formatTuple(colNames)

                # Print out patient (episode) data (one row per episode)
                formatter.formatResultDicts(patientEpisodeByIndexTime.values(),
                                            colNames)

            if lastPatientId is None or lastPatientId != patientId:
                # Prepare to aggregate patient episode record per patient
                patientEpisodeByIndexTime = dict()

            patientEpisodeByIndexTime[patientEpisode["order_time"]] = patientEpisode
            lastPatientId = patientId
            outFile.flush()

        # Last Iteration
        patientId = lastPatientId
        # Clinical Item (PerfItem300)
        eventTimes = extractor.parseClinicalItemData_singlePatient(\
            modelListFromTable(extractor.queryClinicalItemsByName(\
                ("PerfItem300",), [patientId])))
        tempColNames.extend(\
            extractor.addClinicalItemFeatures_singlePatient(\
            eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
            daysBins=[]))

        # Clinical Item (PerfItem400)
        eventTimes = extractor.parseClinicalItemData_singlePatient(\
            modelListFromTable(extractor.queryClinicalItemsByName(\
                ("PerfItem400",), [patientId])))
        tempColNames.extend(\
            extractor.addClinicalItemFeatures_singlePatient(\
            eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
            daysBins=[]))

        # Clinical Item (PerfItem500)
        eventTimes = extractor.parseClinicalItemData_singlePatient(\
            modelListFromTable(extractor.queryClinicalItemsByName(\
                ("PerfItem500",), [patientId])))
        tempColNames.extend(\
            extractor.addClinicalItemFeatures_singlePatient(\
            eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
            daysBins=[]))

        # Lab Result (Foo)
        labResultTable = extractor.queryLabResults(["Foo"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Foo"], \
            preTimeDelta, postTimeDelta))

        # Lab Result (Bar)
        labResultTable = extractor.queryLabResults(["Bar"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Bar"], \
            preTimeDelta, postTimeDelta))

        # Lab Result (Baz)
        labResultTable = extractor.queryLabResults(["Baz"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Baz"], \
            preTimeDelta, postTimeDelta))

        # Lab Result (Qux)
        labResultTable = extractor.queryLabResults(["Qux"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Qux"], \
            preTimeDelta, postTimeDelta))

        formatter.formatResultDicts(patientEpisodeByIndexTime.values(),
                                    colNames)

        # Close file.
        outFile.close()

        # Stop timer.
        extractorStop = time.time()

        # Compare results.
        factoryTime = factoryStop - factoryStart
        extractorTime = extractorStop - extractorStart
        self.assertTrue(extractorTime > factoryTime)

        # Clean up feature matrix files.
        try:
            os.remove("extractor.feature_matrix.tab.gz")
        except OSError:
            pass
        try:
            os.remove(self.factory.getMatrixFileName())
        except OSError:
            pass
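
The explicit start/stop timing pairs used throughout these examples could be wrapped in a small context manager; a hypothetical sketch, not part of the test suite:

# Hypothetical timing helper equivalent to the explicit start/stop pairs above.
import time
from contextlib import contextmanager

@contextmanager
def stopwatch(label, results):
    """Record elapsed wall-clock seconds for the with-block under results[label]."""
    start = time.time()
    try:
        yield
    finally:
        results[label] = time.time() - start

# times = {}
# with stopwatch("factory", times):
#     factory.buildFeatureMatrix()
# with stopwatch("extractor", times):
#     runExtractorPipeline()   # hypothetical wrapper for the extractor loop above
# assert times["extractor"] > times["factory"]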
Example #19
def queryPatientEpisodes(outputFile, extractor):
    log.info(
        "Select patient admissions with provider category of Tt Pamf Med (Primary) or Tt Med Univ (Primary)"
    )

    conn = DBUtil.connection()
    cursor = conn.cursor()
    try:
        # # Clinical item category for admission diagnoses
        # # ADMIT_DX_CATEGORY_ID = 2;
        # admitDxCategoryId = DBUtil.execute("select clinical_item_category_id from clinical_item_category where description like '%%ADMIT_DX%%'", conn=conn)[0][0];

        # # Look for items indicating suspected infection / sepsis
        # ivAntibioticItemIds = loadIVAntibioticItemIds(extractor);
        # bloodCultureItemIds = loadBloodCultureItemIds(extractor);
        # respiratoryViralPanelItemIds = loadRespiratoryViralPanelItemIds(extractor);

        # # Merge IV antibiotics and blood cultures, respiratory panels as items that suggest sepsis is suspected
        # suspectSepsisItemIds = ivAntibioticItemIds.union(bloodCultureItemIds.union(respiratoryViralPanelItemIds));
        # suspectSepsisItemIdsStr = str.join(',', [str(itemId) for itemId in suspectSepsisItemIds]);   # Convert to comma-separated string via a str.join function on list contracture

        # # Look for primary surgery teams to exclude
        # excludeTeamCategory = "SurgerySpecialty";
        # excludeTreatmentTeams = list();
        # for row in extractor.loadMapData("TreatmentTeamGroups"):
        #     if row["team_category"] == excludeTeamCategory:
        #         excludeTreatmentTeams.append(row["treatment_team"]);
        # query = SQLQuery();
        # query.addSelect("clinical_item_id");
        # query.addFrom("clinical_item");
        # query.addWhereIn("description", excludeTreatmentTeams );
        # excludeTeamItemIds = set();
        # for row in DBUtil.execute(query, conn=conn):
        #     excludeTeamItemIds.add(row[0]);
        # excludeTeamItemIdsStr = str.join(',', [str(itemId) for itemId in excludeTeamItemIds]);   # Convert to comma-separated string via a str.join function on list contracture

        # First pass query to get the list of patients and emergency department presentation times
        cohortQuery = \
        """
        select adt1.pat_anon_id, adt1.pat_enc_csn_anon_id, adt1.shifted_transf_in_dt_tm as edAdmitTime, adt2.shifted_transf_out_dt_tm as dischargeTime
        from stride_adt as adt1, stride_adt as adt2
        where 
            adt1.pat_anon_id in
            (select patient_id from patient_item inner join clinical_item on patient_item.clinical_item_id = clinical_item.clinical_item_id where clinical_item.clinical_item_category_id = 161 AND clinical_item.description = '%s') 
        and adt1.pat_enc_csn_anon_id = adt2.pat_enc_csn_anon_id
        """ % ("Tt Pamf Med (Primary)")

        print(cohortQuery, file=sys.stderr)
        cursor.execute(cohortQuery)

        patientEpisodes = list()
        patientEpisodeById = dict()

        # Build basic records of patient ID, ED presentation time, and discharge date/time
        prog = ProgressDots()
        row = cursor.fetchone()
        while row is not None:
            (patientId, encounterId, edAdmitTime, dischargeTime) = row
            #patientId = int(patientId);
            patientEpisode = \
                RowItemModel \
                (   {   "patient_id":patientId,
                        "edAdmitTime":edAdmitTime,
                        "dischargeTime":dischargeTime,
                        "encounter_id":encounterId,
                        "payorTitle": None, # Default encounter data to null in case can't find it later
                        "bpSystolic": None,
                        "bpDiastolic": None,
                        "temperature": None,
                        "pulse": None,
                        "respirations": None,
                    }
                )
            patientEpisodes.append(patientEpisode)
            if patientEpisode["encounter_id"] not in patientEpisodeById:
                patientEpisodeById[patientEpisode["encounter_id"]] = patientEpisode

            prog.update()
            row = cursor.fetchone()
        prog.printStatus()

        # Second query phase to link to encounter information (e.g., insurance, admitting vital signs)
        encounterIds = columnFromModelList(patientEpisodes, "encounter_id")
        query = SQLQuery()
        query.addSelect("pat_id")
        query.addSelect("pat_enc_csn_id")
        query.addSelect("title")
        query.addSelect("bp_systolic")
        query.addSelect("bp_diastolic")
        query.addSelect("temperature")
        query.addSelect("pulse")
        query.addSelect("respirations")
        query.addFrom("stride_patient_encounter")
        query.addWhereIn("pat_enc_csn_id", encounterIds)
        cursor.execute(str(query), query.params)
        row = cursor.fetchone()
        while row is not None:
            (patientId, encounterId, payorTitle, bpSystolic, bpDiastolic,
             temperature, pulse, respirations) = row
            if encounterId in patientEpisodeById:
                patientEpisode = patientEpisodeById[encounterId]
                if patientEpisode["payorTitle"] is None:
                    patientEpisode["payorTitle"] = set()
                    # Single encounters may have multiple payors to track
                patientEpisode["payorTitle"].add(payorTitle)
                patientEpisode["bpSystolic"] = bpSystolic
                patientEpisode["bpDiastolic"] = bpDiastolic
                patientEpisode["temperature"] = temperature
                patientEpisode["pulse"] = pulse
                patientEpisode["respirations"] = respirations
            row = cursor.fetchone()

        # Drop results as tab-delimited text output
        formatter = TextResultsFormatter(outputFile)
        formatter.formatResultDicts(patientEpisodes, addHeaderRow=True)

        return patientEpisodes
    finally:
        cursor.close()
        conn.close()
Example #20
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <inputFile> <outputFile>\n"+\
                    "   <inputFile>    Tab-delimited file of data\n"+\
                    "   <ouputFile>    Tab-delimited file with relational table of t-test p-values for each sub-group pair.  Specify \"-\" to send to stdout.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-l",
            "--labelCols",
            dest="labelCols",
            help=
            "Comma-separated list of the column headers to label data rows as belonging to different subgroups"
        )
        parser.add_option(
            "-v",
            "--valueCols",
            dest="valueCols",
            help=
            "Comma-separated list of the column headers for data values want to calculate statistics for"
        )
        parser.add_option(
            "-m",
            "--matchCols",
            dest="matchCols",
            help=
            "Comma-separated list of the column headers to match groups on, like row identifiers.  If not exists, then do independent t-tests rather than paired."
        )
        parser.add_option(
            "-b",
            "--baseLabels",
            dest="baseLabels",
            help=
            "Comma-separated list of values that the labelCols should have to represent which base method to compare all other methods to as a reference (otherwise do a full n^2 cartesian product of all combinations)."
        )
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) > 1:
            inputFile = stdOpen(args[0])
            outputFile = stdOpen(args[1], "w")

            labelCols = options.labelCols.split(",")
            valueCols = options.valueCols.split(",")
            matchCols = None
            if options.matchCols:
                matchCols = options.matchCols.split(",")
            baseLabels = None
            if options.baseLabels:
                baseLabels = options.baseLabels.split(",")

            # Print comment line with arguments to allow for deconstruction later as well as extra results
            summaryData = {
                "argv": argv
            }
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

            # Tab-delimited output formatting
            formatter = TextResultsFormatter(outputFile)

            # Prep generator first, so will be able to extract out relevant header columns
            rowGenerator = self(inputFile, labelCols, valueCols, matchCols,
                                baseLabels)

            # Insert a mock record to get a header / label row
            colNames = self.resultHeaders(labelCols, valueCols, matchCols)
            formatter.formatResultDict(RowItemModel(colNames, colNames),
                                       colNames)

            # Stream the concatenated data rows to the output to avoid storing all in memory
            for outputDict in rowGenerator:
                formatter.formatResultDict(outputDict, colNames)

        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example #21
0
def queryPatientEpisodes(outputFile, extractor):
    log.info("Select patient admissions with possible/probable sepsis within 24 hours of admission (long query >60 min?)...");

    conn = DBUtil.connection();
    cursor = conn.cursor();
    try:
        # Clinical item category for admission diagnoses
        # ADMIT_DX_CATEGORY_ID = 2;
        admitDxCategoryId = DBUtil.execute("select clinical_item_category_id from clinical_item_category where description like '%%ADMIT_DX%%'", conn=conn)[0][0];

        # Look for items indicating suspected infection / sepsis
        ivAntibioticItemIds = loadIVAntibioticItemIds(extractor);
        bloodCultureItemIds = loadBloodCultureItemIds(extractor);
        respiratoryViralPanelItemIds = loadRespiratoryViralPanelItemIds(extractor);

        # Merge IV antibiotics and blood cultures, respiratory panels as items that suggest sepsis is suspected
        suspectSepsisItemIds = ivAntibioticItemIds.union(bloodCultureItemIds.union(respiratoryViralPanelItemIds));
        suspectSepsisItemIdsStr = str.join(',', [str(itemId) for itemId in suspectSepsisItemIds]);   # Convert to comma-separated string via str.join on a list comprehension

        # Look for primary surgery teams to exclude
        excludeTeamCategory = "SurgerySpecialty";
        excludeTreatmentTeams = list();
        for row in extractor.loadMapData("TreatmentTeamGroups"):
            if row["team_category"] == excludeTeamCategory:
                excludeTreatmentTeams.append(row["treatment_team"]);
        query = SQLQuery();
        query.addSelect("clinical_item_id");
        query.addFrom("clinical_item");
        query.addWhereIn("description", excludeTreatmentTeams );
        excludeTeamItemIds = set();
        for row in DBUtil.execute(query, conn=conn):
            excludeTeamItemIds.add(row[0]);
        excludeTeamItemIdsStr = str.join(',', [str(itemId) for itemId in excludeTeamItemIds]);   # Convert to comma-separated string via str.join on a list comprehension

        # First pass query to get the list of patients and emergency department presentation times
        cohortQuery = \
        """
        --  Pick out date(s) when admitted through emergency department and matching discharge time
        select adt1.pat_anon_id, adt1.pat_enc_csn_anon_id, adt1.shifted_transf_in_dt_tm as edAdmitTime, adt2.shifted_transf_out_dt_tm as dischargeTime
        from stride_adt as adt1, stride_adt as adt2
        where 
            -- Admission event
            adt1.department_in = 'EMERGENCY DEPARTMENT' and
            adt1.event_in = 'Admission' and
            adt1.pat_anon_id in
            (    -- Select any patient with any suspected sepsis related order (i.e., IV antibiotics or blood cultures)
                select patient_id
                from patient_item as pi
                where pi.clinical_item_id in (%s)
                except
                -- Exclude any patient who has been on a primary surgery team
                select patient_id
                from patient_item
                where clinical_item_id in (%s)
                -- -12434586418575,-12432455207729,-12428492282572,-12428492282572,-12424048595257,-12414081679705
            ) and
            
            adt1.pat_enc_csn_anon_id = adt2.pat_enc_csn_anon_id and
            
            -- Discharge event
            adt2.event_out = 'Discharge'
            
        order by adt1.shifted_transf_in_dt_tm
        """ % (suspectSepsisItemIdsStr, excludeTeamItemIdsStr);
        print >> sys.stderr, cohortQuery;
        cursor.execute(cohortQuery);

        patientEpisodes = list();
        patientEpisodeById = dict();

        # Build basic patient records with ID,
        #   ED presentation date, and discharge date/time
        prog = ProgressDots();
        row = cursor.fetchone();
        while row is not None:
            (patientId, encounterId, edAdmitTime, dischargeTime) = row;
            #patientId = int(patientId);
            patientEpisode = \
                RowItemModel \
                (   {   "patient_id":patientId, 
                        "edAdmitTime":edAdmitTime, 
                        "dischargeTime":dischargeTime, 
                        "encounter_id":encounterId,
                        "payorTitle": None, # Default encounter data to null in case can't find it later
                        "bpSystolic": None,
                        "bpDiastolic": None,
                        "temperature": None,
                        "pulse": None,
                        "respirations": None,
                    }
                );
            patientEpisodes.append(patientEpisode);
            if patientEpisode["encounter_id"] not in patientEpisodeById:
                patientEpisodeById[patientEpisode["encounter_id"]] = patientEpisode;

            prog.update();
            row = cursor.fetchone();
        prog.printStatus();

        # Second query phase to link to encounter information (e.g., insurance, admitting vital signs)
        encounterIds = columnFromModelList(patientEpisodes, "encounter_id");
        query = SQLQuery();
        query.addSelect("pat_id");
        query.addSelect("pat_enc_csn_id");
        query.addSelect("title");
        query.addSelect("bp_systolic");
        query.addSelect("bp_diastolic");
        query.addSelect("temperature");
        query.addSelect("pulse");
        query.addSelect("respirations");
        query.addFrom("stride_patient_encounter");
        query.addWhereIn("pat_enc_csn_id", encounterIds);
        cursor.execute(str(query), query.params);
        row = cursor.fetchone();
        while row is not None:
            (patientId, encounterId, payorTitle, bpSystolic, bpDiastolic, temperature, pulse, respirations) = row;
            if encounterId in patientEpisodeById:
                patientEpisode = patientEpisodeById[encounterId];
                if patientEpisode["payorTitle"] is None:
                    patientEpisode["payorTitle"] = set();   # Single encounters may have multiple payors to track
                patientEpisode["payorTitle"].add(payorTitle);
                patientEpisode["bpSystolic"] = bpSystolic;
                patientEpisode["bpDiastolic"] = bpDiastolic;
                patientEpisode["temperature"] = temperature;
                patientEpisode["pulse"] = pulse;
                patientEpisode["respirations"] = respirations;
            row = cursor.fetchone();
        
        # Drop results as tab-delimited text output
        formatter = TextResultsFormatter(outputFile);
        formatter.formatResultDicts(patientEpisodes, addHeaderRow=True);

        return patientEpisodes;
    finally:
        cursor.close();
        conn.close();
Example #22
0
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <recommenderName> <patientIds> [<outputFile>]\n"+\
                    "   <patientIds/dataFile>    Name of file with patient ids.  If not found, then interpret as comma-separated list of test Patient IDs to prepare analysis data for.  Alternatively, provide preparedPatientItemFile generated from PreparePatientItems as input.\n"+\
                    "   <outputFile>    If query yields a result set, then that will be output\n"+\
                    "                       to the named file.  Leave blank or specify \"-\" to send to stdout.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-c",
            "--baseCategoryId",
            dest="baseCategoryId",
            help=
            "ID of clinical item category to look for initial items from (probably the ADMIT Dx item)."
        )
        parser.add_option(
            "-Q",
            "--queryTimeSpan",
            dest="queryTimeSpan",
            help=
            "Time frame specified in seconds over which to look for initial query items (e.g., 24hrs = 86400) after the base item found from the category above.  Start the time counting from the first item time occuring after the category item above since the ADMIT Dx items are often keyed to dates only without times (defaulting to midnight of the date specified)."
        )
        parser.add_option(
            "-o",
            "--outcomeItemIds",
            dest="outcomeItemIds",
            help=
            "Comma separated list of outcome item IDs to get prediction / recommendation scores for, as well as to label whether they actually appeared for the given patients.  Can specify virtual items representing the end of item triples (e.g., 5-Readmission being the end of any item followed by 3591-Discharge then 3671-Admit), by adding the component items in expected sequence.  For example, '5=3591:3671'"
        )
        parser.add_option(
            "-t",
            "--timeDeltaMax",
            dest="timeDeltaMax",
            help=
            "Time delta in seconds maximum by which recommendations should be based on.  Defaults to recommending items that occur at ANY time after the key orders.  If provided, will apply limits to only orders placed within 0 seconds, 1 hour (3600), 1 day (86400), or 1 week (604800) of the key orders / items.  If set, will also only count presence of labeled target items if occurs within the given time delta of the first query item."
        )

        parser.add_option(
            "-P",
            "--preparedPatientItemFile",
            dest="preparedPatientItemFile",
            action="store_true",
            help=
            "If set, will expect primary argument to instead be name of file to read input data from, instead of using above parameters to query from database."
        )

        parser.add_option(
            "-R",
            "--recommender",
            dest="recommender",
            help=
            "Name of the recommender to run the analysis against.  Options: %s"
            % list(RECOMMENDER_CLASS_BY_NAME.keys()))
        parser.add_option(
            "-S",
            "--scoreField",
            dest="scoreField",
            help=
            "Name of (derived) field to score items by.  For example, 'conditionalFreq.'"
        )
        parser.add_option(
            "-p",
            "--countPrefix",
            dest="countPrefix",
            help=
            "Which counting method to use for item associations.  Defaults to counting item occurrences, allowing for duplicates.  Additional options include: %s."
            % list(COUNT_PREFIX_OPTIONS))
        parser.add_option(
            "-a",
            "--aggregationMethod",
            dest="aggregationMethod",
            help=
            "Aggregation method to use for recommendations based off multiple query items.  Options: %s."
            % list(AGGREGATOR_OPTIONS))
        parser.add_option(
            "-s",
            "--skipIfOutcomeInQuery",
            dest="skipIfOutcomeInQuery",
            action="store_true",
            help=
            "If set, will skip patients where the outcome item occurs during the query period since that would defy the point of predicting the outcome."
        )
        parser.add_option(
            "-m",
            "--maxRecommendedId",
            dest="maxRecommendedId",
            help=
            "Specify a maximum ID value to accept for recommended items.  More used to limit output in test cases"
        )
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) > 0:
            # Parse out the query parameters
            query = AnalysisQuery()
            query.recommender = RECOMMENDER_CLASS_BY_NAME[
                options.recommender]()
            query.recommender.dataManager.dataCache = dict()
            # Use local cache to speed up repeat queries

            query.baseRecQuery = RecommenderQuery()
            if options.preparedPatientItemFile:
                # Don't reconstruct validation data through database, just read off validation file
                query.preparedPatientItemFile = stdOpen(args[0])
            else:

                patientIdsParam = args[0]
                try:
                    # Try to open patient IDs as a file
                    patientIdFile = stdOpen(patientIdsParam)
                    query.patientIds = set(patientIdFile.read().split())
                except IOError:
                    # Unable to open as a filename, then interpret as simple comma-separated list
                    query.patientIds = set(patientIdsParam.split(","))

                query.baseCategoryId = int(options.baseCategoryId)
                # Category to look for clinical item to start accruing query items from
                query.queryTimeSpan = timedelta(0, int(options.queryTimeSpan))

                query.baseRecQuery.targetItemIds = set()

                outcomeIdStrList = options.outcomeItemIds.split(",")
                for outcomeIdStr in outcomeIdStrList:
                    outcomeIdComponents = outcomeIdStr.split("=")
                    outcomeId = int(outcomeIdComponents[0])
                    query.baseRecQuery.targetItemIds.add(outcomeId)
                    if len(outcomeIdComponents) > 1:
                        sequenceIds = [
                            int(seqIdStr)
                            for seqIdStr in outcomeIdComponents[1].split(":")
                        ]
                        query.sequenceItemIdsByVirtualItemId[
                            outcomeId] = tuple(sequenceIds)

            if options.timeDeltaMax is not None:
                query.baseRecQuery.timeDeltaMax = timedelta(
                    0, int(options.timeDeltaMax))
            if options.scoreField is not None:
                query.baseRecQuery.sortField = options.scoreField
            if options.countPrefix is not None:
                query.baseRecQuery.countPrefix = options.countPrefix
            if options.aggregationMethod is not None:
                query.baseRecQuery.aggregationMethod = options.aggregationMethod
            if options.maxRecommendedId is not None:
                query.baseRecQuery.maxRecommendedId = int(
                    options.maxRecommendedId)

            if options.skipIfOutcomeInQuery is not None:
                query.skipIfOutcomeInQuery = options.skipIfOutcomeInQuery

            # Run the actual analysis
            analysisResults = self(query)

            # Format the results for output
            outputFilename = None
            if len(args) > 1:
                outputFilename = args[1]
            outputFile = stdOpen(outputFilename, "w")

            # Print comment line with analysis arguments to allow for deconstruction later
            print(COMMENT_TAG, json.dumps({"argv": argv}), file=outputFile)

            colNames = self.analysisHeaders(query)
            analysisResults.insert(0, RowItemModel(colNames, colNames))
            # Insert a mock record to get a header / label row

            formatter = TextResultsFormatter(outputFile)
            formatter.formatResultDicts(analysisResults, colNames)

        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example #23
0
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <patientIds/dataFile> [<outputFile>]\n"+\
                    "   <patientIds/dataFile>    Name of file with patient ids.  If not found, then interpret as comma-separated list of test Patient IDs to prepare analysis data for.  Alternatively, provide preparedPatientItemFile generated from PreparePatientItems as input.\n"+\
                    "   <outputFile>    If query yields a result set, then that will be output\n"+\
                    "                       to the named file.  Leave blank or specify \"-\" to send to stdout.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-q",
            "--numQuery",
            dest="numQuery",
            help=
            "Number of orders / items from each patient to use as query items to prime the recommendations.  If set to a float number in (0,1), then treat as a percentage of the patient's total orders / items"
        )
        parser.add_option(
            "-v",
            "--numVerify",
            dest="numVerify",
            help=
            "Number of orders / items from each patient after the query items to use to validate recommendations.  If set to a float number in (0,1), then treat as a percentage of the patient's total orders / items.  If left unset, then just use all remaining orders / items for that patient"
        )
        parser.add_option(
            "-c",
            "--baseCategoryId",
            dest="baseCategoryId",
            help=
            "Instead of specifying first nQ query items, specify ID of clinical item category to look for initial items from (probably the ADMIT Dx item)."
        )
        parser.add_option(
            "-b",
            "--baseItemId",
            dest="baseItemId",
            help=
            "Instead of specifying first nQ query items, specify ID of the specific clinical item to look for initial items from."
        )
        parser.add_option(
            "-S",
            "--startDate",
            dest="startDate",
            help="Only look for test data occuring on or after this start date."
        )
        parser.add_option(
            "-E",
            "--endDate",
            dest="endDate",
            help="Only look for test data occuring before this end date.")
        parser.add_option(
            "-Q",
            "--queryTimeSpan",
            dest="queryTimeSpan",
            help=
            "Time frame specified in seconds over which to look for initial query items (e.g., 24hrs = 86400) after the base item found from the category above.  Start the time counting from the first item time occuring after the category item above since the ADMIT Dx items are often keyed to dates only without times (defaulting to midnight of the date specified)."
        )
        parser.add_option(
            "-V",
            "--verifyTimeSpan",
            dest="verifyTimeSpan",
            help=
            "Time frame specified in seconds over which to look for verify items after initial query item time.  Will ignore the query items that occur within the queryTimeSpan."
        )

        parser.add_option(
            "-P",
            "--preparedPatientItemFile",
            dest="preparedPatientItemFile",
            action="store_true",
            help=
            "If set, will expect primary argument to instead be name of file to read input data from, instead of using above parameters to query from database."
        )

        parser.add_option(
            "-R",
            "--recommender",
            dest="recommender",
            help=
            "Name of the recommender to run the analysis against.  Options: %s"
            % list(RECOMMENDER_CLASS_BY_NAME.keys()))
        parser.add_option(
            "-r",
            "--numRecs",
            dest="numRecs",
            help=
            "Number of orders / items to recommend for comparison against the verification set. Alternative set option numRecsByOrderSet to look for key order set usage and size."
        )
        parser.add_option(
            "-O",
            "--numRecsByOrderSet",
            dest="numRecsByOrderSet",
            action="store_true",
            help=
            "If set, then look for an order_set_id column to find the key order set that triggered the evaluation time point to determine number of recommendations to consider."
        )
        parser.add_option(
            "-s",
            "--sortField",
            dest="sortField",
            help=
            "Allow overriding of default sort field when returning ranked results"
        )
        parser.add_option(
            "-f",
            "--fieldFilters",
            dest="fieldFilters",
            help=
            "Filters to exclude results.  Comma-separated separated list of field-op:value exclusions where op is either < or > like, conditionalFreq<:0.1,frqeRatio<:1"
        )
        parser.add_option(
            "-t",
            "--timeDeltaMax",
            dest="timeDeltaMax",
            help=
            "If set, represents a time delta in seconds maximum by which recommendations should be based on.  Defaults to recommending items that occur at ANY time after the key orders.  If provided, will apply limits to only orders placed within 0 seconds, 1 hour (3600), 1 day (86400), or 1 week (604800) of the key orders / items."
        )
        parser.add_option(
            "-a",
            "--aggregationMethod",
            dest="aggregationMethod",
            help=
            "Aggregation method to use for recommendations based off multiple query items.  Options: %s."
            % list(AGGREGATOR_OPTIONS))
        parser.add_option(
            "-p",
            "--countPrefix",
            dest="countPrefix",
            help=
            "Prefix for how to do counts.  Blank for default item counting allowing repeats, otherwise ignore repeats for patient_ or encounter_"
        )
        parser.add_option(
            "-m",
            "--maxRecommendedId",
            dest="maxRecommendedId",
            help=
            "Specify a maximum ID value to accept for recommended items.  More used to limit output in test cases"
        )

        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) >= 1:
            # Parse out the query parameters
            query = AnalysisQuery()
            query.recommender = RECOMMENDER_CLASS_BY_NAME[
                options.recommender]()
            query.recommender.dataManager.dataCache = dict()
            # Use a dataCache to facilitate repeat queries

            if options.preparedPatientItemFile:
                # Don't reconstruct validation data through database, just read off validation file
                query.preparedPatientItemFile = stdOpen(args[0])
            else:
                patientIdsParam = args[0]
                try:
                    # Try to open patient IDs as a file
                    patientIdFile = stdOpen(patientIdsParam)
                    query.patientIds = set(patientIdFile.read().split())
                except IOError:
                    # Unable to open as a filename, then interpret as simple comma-separated list
                    query.patientIds = set(patientIdsParam.split(","))

                if options.numQuery is not None:
                    query.numQueryItems = int(options.numQuery)
                    query.numVerifyItems = int(options.numVerify)
                else:
                    # Alternative to specify query time span starting from a key category
                    query.queryTimeSpan = timedelta(0,
                                                    int(options.queryTimeSpan))
                    query.verifyTimeSpan = timedelta(
                        0, int(options.verifyTimeSpan))

                if options.baseCategoryId is not None or options.baseItemId is not None:
                    if options.baseCategoryId is not None:
                        query.baseCategoryId = int(options.baseCategoryId)
                        # Category to look for clinical item to start accruing query items from
                    if options.baseItemId is not None:
                        query.baseItemId = int(options.baseItemId)

                if options.startDate is not None:
                    query.startDate = DBUtil.parseDateValue(options.startDate)
                if options.endDate is not None:
                    query.endDate = DBUtil.parseDateValue(options.endDate)

            query.baseRecQuery = RecommenderQuery()
            query.baseRecQuery.excludeCategoryIds = query.recommender.defaultExcludedClinicalItemCategoryIds(
            )
            query.baseRecQuery.excludeItemIds = query.recommender.defaultExcludedClinicalItemIds(
            )
            if options.timeDeltaMax is not None and len(
                    options.timeDeltaMax) > 0:
                query.baseRecQuery.timeDeltaMax = timedelta(
                    0, int(options.timeDeltaMax))
            if options.aggregationMethod is not None:
                query.baseRecQuery.aggregationMethod = options.aggregationMethod
            if options.countPrefix is not None:
                query.baseRecQuery.countPrefix = options.countPrefix
            if options.maxRecommendedId is not None:
                query.baseRecQuery.maxRecommendedId = int(
                    options.maxRecommendedId)
            if options.sortField is not None:
                query.baseRecQuery.sortField = options.sortField
            if options.fieldFilters is not None:
                for fieldFilterStr in options.fieldFilters.split(","):
                    (fieldOp, valueStr) = fieldFilterStr.split(":")
                    query.baseRecQuery.fieldFilters[fieldOp] = float(valueStr)

            if options.numRecs is not None:
                query.numRecommendations = int(options.numRecs)
            else:
                # No recommendation count specified, then just use the same as the verify number
                query.numRecommendations = query.numVerifyItems
            query.numRecsByOrderSet = options.numRecsByOrderSet

            # Run the actual analysis
            analysisResults = self(query)

            # Format the results for output
            outputFilename = None
            if len(args) > 1:
                outputFilename = args[1]
            outputFile = stdOpen(outputFilename, "w")

            # Print comment line with analysis arguments to allow for deconstruction later
            summaryData = {
                "argv": argv
            }
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

            formatter = TextResultsFormatter(outputFile)
            colNames = self.resultHeaders(query)
            formatter.formatTuple(colNames)
            # Insert a mock record to get a header / label row
            formatter.formatResultDicts(analysisResults, colNames)

        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example #24
0
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <inputFile> [<outputFile>]\n"+\
                    "   <inputFile>    Validation file in prepared result file format.  Predict items and compare against verify sets similar to RecommendationClassficationAnalysis. \n"+\
                    "   <outputFile>   Validation result stat summaries.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-X",
            "--excludeCategoryIds",
            dest="excludeCategoryIds",
            help=
            "For recommendation, exclude / skip any items who fall under one of these comma-separated category Ids."
        )
        parser.add_option(
            "-s",
            "--sortField",
            dest="sortField",
            default=DEFAULT_SORT_FIELD,
            help=
            "Score field to sort top recommendations by.  Default to posterior probabilty / positive predictive value 'P(B|A)', but can also select 'lift' = 'tfidf' = 'interest' for TF*IDF style score weighting."
        )
        parser.add_option(
            "-r",
            "--numRecs",
            dest="numRecs",
            default=DEFAULT_RECOMMENDED_ITEM_COUNT,
            help=
            "Number of orders / items to recommend for comparison against the verification set."
        )
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) >= 1:
            query = AnalysisQuery()
            query.preparedPatientItemFile = stdOpen(args[0])
            query.recommender = OrderSetRecommender()
            query.baseRecQuery = RecommenderQuery()
            if options.excludeCategoryIds is not None:
                query.baseRecQuery.excludeCategoryIds = set()
                for categoryIdStr in options.executeCategoryIds.split(","):
                    query.baseRecQuery.excludeCategoryIds.add(
                        int(categoryIdStr))
            else:  # Default exclusions if none specified
                query.baseRecQuery.excludeCategoryIds = query.recommender.defaultExcludedClinicalItemCategoryIds(
                )
                query.baseRecQuery.excludeItemIds = query.recommender.defaultExcludedClinicalItemIds(
                )

            query.baseRecQuery.sortField = options.sortField
            query.numRecommendations = int(options.numRecs)

            # Run the actual analysis
            analysisResults = self(query)

            # Format the results for output
            outputFilename = None
            if len(args) > 1:
                outputFilename = args[1]
            outputFile = stdOpen(outputFilename, "w")

            # Print comment line with analysis arguments to allow for deconstruction later
            summaryData = {
                "argv": argv
            }
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

            formatter = TextResultsFormatter(outputFile)
            colNames = self.resultHeaders(query)
            formatter.formatTuple(colNames)
            # Insert a mock record to get a header / label row
            formatter.formatResultDicts(analysisResults, colNames)
        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example #25
0
    "Charlson.HemiplegiaParaplegia.pre", "Charlson.LiverMild.pre",
    "Charlson.LiverModSevere.pre", "Charlson.Malignancy.pre",
    "Charlson.MalignancyMetastatic.pre", "Charlson.MI.pre",
    "Charlson.PepticUlcer.pre", "Charlson.PeripheralVascular.pre",
    "Charlson.Renal.pre", "Charlson.Rheumatic.pre", "self_pay", "PO2A.last",
    "Pulse.last", "NA.last", "CR.last", "HCT.last", "WBC.last", "BUN.last",
    "TBIL.last", "K.last", "Resp.last", "Temp.last", "Urine.last",
    "BP_Low_Diastolic.last", "BP_High_Systolic.last",
    "Glasgow.Coma.Scale.Score.last", "TT.Cardiology.pre", "TT.CCU.HF.pre",
    "TT.CCU.pre", "TT.HemeOnc.pre", "TT.Medicine.pre", "TT.MICU.pre",
    "TT.Neurology.pre", "TT.SICU.pre", "TT.SurgerySpecialty.pre",
    "TT.Transplant.pre", "TT.Trauma.pre", "self_pay"
]

ofs = stdOpen("simulatedData.ICUDNR.tab", "w")
formatter = TextResultsFormatter(ofs)
formatter.formatTuple(colNames)
# Header row

random.seed(987654321)
# Consistent seed for reproducibility
nPatients = 10000

# Random generator parameters
ageRange = [30, 80]
incomeRange = [20000, 200000]
incomeStep = 1000
femaleRate = 0.5

# Ranges on uniform distribution to assign race labels. Leave ~50% empty for default White race
raceRangesByLabel = \
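
The raceRangesByLabel assignment is cut off in the source. As an illustration of the technique the comment above describes (mapping sub-ranges of a uniform [0,1) draw to category labels, with the unassigned remainder defaulting to White), here is a sketch with made-up ranges:

import random

exampleRaceRangesByLabel = {  # Illustrative ranges only, not the study's actual values
    "RaceAsian": (0.50, 0.65),
    "RaceBlack": (0.65, 0.75),
    "RaceHispanicLatino": (0.75, 0.90),
    "RaceOther": (0.90, 1.00),
}

def sampleRaceLabel(rangesByLabel, defaultLabel="RaceWhiteNonHispanicLatino"):
    draw = random.random()  # Uniform [0,1)
    for label, (low, high) in rangesByLabel.items():
        if low <= draw < high:
            return label
    return defaultLabel  # Draws below 0.5 (~50%) fall outside all ranges and default to White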
Example #26
0
#import mechanize
#import cookielib
from BeautifulSoup import BeautifulSoup
import urllib

from medinfo.common.Util import ProgressDots, stdOpen
from medinfo.db.ResultsFormatter import TextResultsFormatter

BASE_FILENAME = 'buprenorphinePhysicians.%s.htm'
N_PAGES = 641
#N_PAGES = 5

OUTPUT_FILENAME = 'prescribers.suboxone.tab'

ofs = stdOpen(OUTPUT_FILENAME, "w")
formatter = TextResultsFormatter(ofs)

colNames = list()
allColsSeen = False

progress = ProgressDots(big=100, small=2)
for iPage in xrange(N_PAGES):
    localFilename = BASE_FILENAME % (iPage)
    localFile = open(localFilename)
    html = localFile.read()
    localFile.close()

    soup = BeautifulSoup(html)
    cells = soup("td")

    currRow = list()
Example #27
0
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <inputFile> [<outputFile>]\n"+\
                    "   <inputFile>    Validation file in prepared result file format.  Predict items and compare against verify sets similar to RecommendationClassficationAnalysis. \n"+\
                    "   <outputFile>   Validation result stat summaries.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-r",
            "--numRecs",
            dest="numRecs",
            default=DEFAULT_RECOMMENDED_ITEM_COUNT,
            help=
            "Number of orders / items to recommend for comparison against the verification set, sorted in prevalence order.  If skip or set <1, then will use all order set items found."
        )
        parser.add_option(
            "-O",
            "--numRecsByOrderSet",
            dest="numRecsByOrderSet",
            action="store_true",
            help=
            "If set, then look for an order_set_id column to find the key order set that triggered the evaluation time point to determine number of recommendations to consider."
        )
        parser.add_option(
            "-s",
            "--sortField",
            dest="sortField",
            default=DEFAULT_SORT_FIELD,
            help=
            "Allow overriding of default sort field when returning ranked results (patient_count, name, description, etc.)"
        )
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) >= 1:
            query = AnalysisQuery()
            query.preparedPatientItemFile = stdOpen(args[0])
            query.recommender = OrderSetRecommender()
            query.baseRecQuery = RecommenderQuery()
            # Default exclusions if none specified
            query.baseRecQuery.excludeCategoryIds = query.recommender.defaultExcludedClinicalItemCategoryIds(
            )
            query.baseRecQuery.excludeItemIds = query.recommender.defaultExcludedClinicalItemIds(
            )
            query.baseRecQuery.sortField = options.sortField
            query.numRecommendations = int(options.numRecs)
            query.numRecsByOrderSet = options.numRecsByOrderSet

            # Run the actual analysis
            analysisResults = self(query)

            # Format the results for output
            outputFilename = None
            if len(args) > 1:
                outputFilename = args[1]
            outputFile = stdOpen(outputFilename, "w")

            # Print comment line with analysis arguments to allow for deconstruction later
            summaryData = {
                "argv": argv
            }
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

            formatter = TextResultsFormatter(outputFile)
            colNames = self.resultHeaders(query)
            formatter.formatTuple(colNames)
            # Insert a mock record to get a header / label row
            formatter.formatResultDicts(analysisResults, colNames)
        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example #28
0
"""Given 2D Table of values, spit out "melted" long-relational form to feed into antibiogramData.js"""

import sys, os
from medinfo.common.Const import NULL_STRING
from medinfo.common.Util import stdOpen
from medinfo.db.ResultsFormatter import TabDictReader, TextResultsFormatter

ifs = stdOpen(sys.argv[1])
# Input tab delimited file
ofs = stdOpen(sys.argv[2], "w")
# "-" for stdout

reader = TabDictReader(ifs)
formatter = TextResultsFormatter(ofs)
for row in reader:
    bug = row["Bug"]
    for key in reader.fieldnames:
        value = row[key]
        if key != "Bug" and value and value != NULL_STRING:
            formatter.formatTuple([value, bug, key])
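
For concreteness, here is a worked illustration of the melt with made-up numbers (NULL_STRING handling omitted): each non-empty cell of a wide susceptibility row becomes one (value, bug, antibiotic) output row.

row = {"Bug": "E. coli", "Ampicillin": "52", "Ceftriaxone": "96", "Vancomycin": ""}
fieldnames = ["Bug", "Ampicillin", "Ceftriaxone", "Vancomycin"]
for key in fieldnames:
    value = row[key]
    if key != "Bug" and value:
        print("\t".join([value, row["Bug"], key]))
# Output:
# 52    E. coli    Ampicillin
# 96    E. coli    Ceftriaxone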
Example #29
0
def main(argv=None):
    timer = time.time()

    extractor = DataExtractor()

    # Output file
    featureMatrixFile = stdOpen("featureMatrix.SepsisICU.encounters.tab.gz",
                                "w")

    # Final columns to output to patient matrix
    colNames = list()

    patientEpisodes = extractor.parsePatientEpisodeFile(
        stdOpen("patientEpisodes.tab"), colNames)
    #patientIds = set(columnFromModelList(patientEpisodes, "patient_id"));

    log.info("Expand to index dates based start and end dates")
    # But only want one entry per patient
    patientByIndexTimeById = extractor.generateDateRangeIndexTimes(
        "edAdmitTime",
        "dischargeTime",
        patientEpisodes,
        colNames,
        timeInterval=None)

    log.info("Populate flowsheet summary statistics")
    flowsheetByNameByPatientId = extractor.parseFlowsheetFile(
        stdOpen("Flowsheet.tab.gz"))
    extractor.addFlowsheetFeatures(patientByIndexTimeById,
                                   flowsheetByNameByPatientId, FLOWSHEET_NAMES,
                                   FLOWSHEET_PRE_TIME_DELTA,
                                   FLOWSHEET_POST_TIME_DELTA, colNames)

    log.info("Populate laboratory result summary statistics")
    labsByBaseNameByPatientId = extractor.parseLabResultsFile(
        stdOpen("LabResults.tab.gz"))
    extractor.addLabFeatures(patientByIndexTimeById, labsByBaseNameByPatientId,
                             LAB_BASE_NAMES, LAB_PRE_TIME_DELTA,
                             LAB_POST_TIME_DELTA, colNames)

    log.info("Populate IV Fluid accumulation")
    ivFluidsByPatientId = extractor.parseIVFluidFile(
        stdOpen("IsotonicIVFluids.tab.gz"))
    extractor.addIVFluidFeatures(patientByIndexTimeById, ivFluidsByPatientId,
                                 IVF_THRESHOLD_VOLUMES, IVF_CHECKPOINT_TIMES,
                                 colNames)

    log.info(
        "Record presence of items in terms of relative time to each item from index time"
    )
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("IVAntibiotic.tab")),
        patientByIndexTimeById, colNames, "IVAntibiotic")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("BloodCulture.tab")),
        patientByIndexTimeById, colNames, "BloodCulture")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RespViralPanel.tab")),
        patientByIndexTimeById, colNames, "RespViralPanel")

    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyICULifeSupport.tab")),
        patientByIndexTimeById, colNames, "AnyICULifeSupport")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyDNR.tab")),
        patientByIndexTimeById, colNames, "AnyDNR")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyVasoactive.tab")),
        patientByIndexTimeById, colNames, "AnyVasoactive")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyCRRT.tab")),
        patientByIndexTimeById, colNames, "AnyCRRT")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyVentilator.tab")),
        patientByIndexTimeById, colNames, "AnyVentilator")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("ComfortCare.tab")),
        patientByIndexTimeById, colNames, "ComfortCare")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("PalliativeConsult.tab")),
        patientByIndexTimeById, colNames, "PalliativeConsult")

    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("Death.tab")),
        patientByIndexTimeById, colNames, "Death")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("Birth.tab")),
        patientByIndexTimeById, colNames, "Birth")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("Male.tab")),
        patientByIndexTimeById, colNames, "Male")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("Female.tab")),
        patientByIndexTimeById, colNames, "Female")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(
            stdOpen("RaceWhiteNonHispanicLatino.tab")), patientByIndexTimeById,
        colNames, "RaceWhiteNonHispanicLatino")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceAsian.tab")),
        patientByIndexTimeById, colNames, "RaceAsian")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(
            stdOpen("RaceWhiteHispanicLatino.tab")), patientByIndexTimeById,
        colNames, "RaceWhiteHispanicLatino")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceHispanicLatino.tab")),
        patientByIndexTimeById, colNames, "RaceHispanicLatino")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceUnknown.tab")),
        patientByIndexTimeById, colNames, "RaceUnknown")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceOther.tab")),
        patientByIndexTimeById, colNames, "RaceOther")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceBlack.tab")),
        patientByIndexTimeById, colNames, "RaceBlack")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RacePacificIslander.tab")),
        patientByIndexTimeById, colNames, "RacePacificIslander")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceNativeAmerican.tab")),
        patientByIndexTimeById, colNames, "RaceNativeAmerican")

    log.info(
        "Systemically Scan for Charlson comorbidities and Treatment Team categories"
    )
    for filename in os.listdir("."):
        if filename.startswith(CHARLSON_PREFIX):
            diseaseName = filename
            if filename.endswith(".tab"):
                diseaseName = filename[:-len(".tab")]
            extractor.addClinicalItemFeatures(
                extractor.parseClinicalItemFile(stdOpen(filename)),
                patientByIndexTimeById, colNames, diseaseName)

        if filename.startswith(TREATMENT_TEAM_PREFIX):
            teamName = filename
            if filename.endswith(".tab"):
                teamName = filename[:-len(".tab")]
            extractor.addClinicalItemFeatures(
                extractor.parseClinicalItemFile(stdOpen(filename)),
                patientByIndexTimeById, colNames, teamName)

    log.info("Output feature matrix file with row per patient day")
    formatter = TextResultsFormatter(featureMatrixFile)
    formatter.formatTuple(colNames)
    for patientId, patientByIndexTime in patientByIndexTimeById.iteritems():
        patientResults = patientByIndexTime.values()
        formatter.formatResultDicts(patientResults, colNames)

    timer = time.time() - timer
    print >> sys.stderr, "%.3f seconds to complete" % timer
Example #30
0
File: formatData.py  Project: xxxx3/CDSS
def main(argv=None):
    timer = time.time()

    extractor = DataExtractor()

    # Final columns to output to patient matrix
    colNames = list()

    patientById = extractor.parsePatientFile(stdOpen("patients.tab"), colNames)

    log.info("Expand to index dates based start and end dates")
    patientByIndexTimeById = extractor.generateDateRangeIndexTimes(
        "firstLifeSupportDate", "lastContiguousDate",
        list(patientById.values()), colNames)

    log.info("Populate flowsheet summary statistics")
    flowsheetByNameByPatientId = extractor.parseFlowsheetFile(
        stdOpen("Flowsheet.tab.gz"))
    extractor.addFlowsheetFeatures(patientByIndexTimeById,
                                   flowsheetByNameByPatientId, FLOWSHEET_NAMES,
                                   FLOWSHEET_PRE_TIME_DELTA,
                                   FLOWSHEET_POST_TIME_DELTA, colNames)

    log.info("Populate laboratory result summary statistics")
    labsByBaseNameByPatientId = extractor.parseLabResultsFile(
        stdOpen("LabResults.tab.gz"))
    extractor.addLabFeatures(patientByIndexTimeById, labsByBaseNameByPatientId,
                             LAB_BASE_NAMES, LAB_PRE_TIME_DELTA,
                             LAB_POST_TIME_DELTA, colNames)

    log.info(
        "Record presence of items in terms of relative time to each item from index time"
    )
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyICULifeSupport.tab")),
        patientByIndexTimeById, colNames, "AnyICULifeSupport")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyDNR.tab")),
        patientByIndexTimeById, colNames, "AnyDNR")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyVasoactive.tab")),
        patientByIndexTimeById, colNames, "AnyVasoactive")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyCRRT.tab")),
        patientByIndexTimeById, colNames, "AnyCRRT")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("AnyVentilator.tab")),
        patientByIndexTimeById, colNames, "AnyVentilator")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("ComfortCare.tab")),
        patientByIndexTimeById, colNames, "ComfortCare")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("PalliativeConsult.tab")),
        patientByIndexTimeById, colNames, "PalliativeConsult")

    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("Death.tab")),
        patientByIndexTimeById, colNames, "Death")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("Birth.tab")),
        patientByIndexTimeById, colNames, "Birth")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("Male.tab")),
        patientByIndexTimeById, colNames, "Male")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("Female.tab")),
        patientByIndexTimeById, colNames, "Female")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(
            stdOpen("RaceWhiteNonHispanicLatino.tab")), patientByIndexTimeById,
        colNames, "RaceWhiteNonHispanicLatino")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceAsian.tab")),
        patientByIndexTimeById, colNames, "RaceAsian")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(
            stdOpen("RaceWhiteHispanicLatino.tab")), patientByIndexTimeById,
        colNames, "RaceWhiteHispanicLatino")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceHispanicLatino.tab")),
        patientByIndexTimeById, colNames, "RaceHispanicLatino")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceUnknown.tab")),
        patientByIndexTimeById, colNames, "RaceUnknown")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceOther.tab")),
        patientByIndexTimeById, colNames, "RaceOther")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceBlack.tab")),
        patientByIndexTimeById, colNames, "RaceBlack")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RacePacificIslander.tab")),
        patientByIndexTimeById, colNames, "RacePacificIslander")
    extractor.addClinicalItemFeatures(
        extractor.parseClinicalItemFile(stdOpen("RaceNativeAmerican.tab")),
        patientByIndexTimeById, colNames, "RaceNativeAmerican")

    log.info(
        "Systemically Scan for Charlson comorbidities and Treatment Team categories"
    )
    for filename in os.listdir("."):
        if filename.startswith(CHARLSON_PREFIX):
            diseaseName = filename
            if filename.endswith(".tab"):
                diseaseName = filename[:-len(".tab")]
            extractor.addClinicalItemFeatures(
                extractor.parseClinicalItemFile(stdOpen(filename)),
                patientByIndexTimeById, colNames, diseaseName)

        if filename.startswith(TREATMENT_TEAM_PREFIX):
            teamName = filename
            if filename.endswith(".tab"):
                teamName = filename[:-len(".tab")]
            extractor.addClinicalItemFeatures(
                extractor.parseClinicalItemFile(stdOpen(filename)),
                patientByIndexTimeById, colNames, teamName)

    log.info("Output feature matrix file with row per patient day")
    featureMatrixFile = stdOpen("featureMatrix.ICUDNR.tab.gz", "w")
    formatter = TextResultsFormatter(featureMatrixFile)
    formatter.formatTuple(colNames)  # Write the header row once, rather than repeating it per patient
    for patientId, patientByIndexTime in patientByIndexTimeById.items():
        patientResults = list(patientByIndexTime.values())
        formatter.formatResultDicts(patientResults, colNames)

    timer = time.time() - timer
    print("%.3f seconds to complete" % timer, file=sys.stderr)