Example #1
def queryLabResults(outputFile, patientById):
    log.info("Query out lab results, takes a while")
    labBaseNames = (
        'ferritin', 'fe', 'trfrn', 'trfsat', 'ystfrr',
        'wbc', 'hgb', 'hct', 'mcv', 'rdw', 'plt',
        'retic', 'reticab', 'ldh', 'hapto', 'tbil', 'ibil', 'dbil',
        'cr', 'esr', 'crp'
    )

    formatter = TextResultsFormatter(outputFile)

    # Query is fast when filtering by lab result type, limited to X records.
    # Filtering by patient ID drags performance down substantially unless the
    # table is preloaded (e.g., by doing a count on the SOR table?).
    colNames = [
        "pat_id", "base_name", "common_name", "ord_num_value",
        "reference_unit", "result_flag", "sor.result_time"
    ]

    query = SQLQuery()
    for col in colNames:
        query.addSelect(col)
    query.addFrom("stride_order_results as sor, stride_order_proc as sop")
    query.addWhere("sor.order_proc_id = sop.order_proc_id")
    query.addWhereIn("base_name", labBaseNames)
    query.addWhereIn("pat_id", patientById.viewkeys())
    query.addOrderBy("pat_id")
    query.addOrderBy("sor.result_time")

    DBUtil.execute(query, includeColumnNames=True, formatter=formatter)
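For reference, the builder calls above should assemble SQL roughly like the sketch below; the exact placeholder and parameter-binding syntax depends on the SQLQuery/DBUtil internals, and the variable name is made up for illustration.

# Illustrative sketch only; not the literal output of SQLQuery.
EXPECTED_SQL_SKETCH = """
SELECT pat_id, base_name, common_name, ord_num_value,
       reference_unit, result_flag, sor.result_time
FROM stride_order_results as sor, stride_order_proc as sop
WHERE sor.order_proc_id = sop.order_proc_id
  AND base_name IN (...)   -- bound from labBaseNames
  AND pat_id IN (...)      -- bound from patientById.viewkeys()
ORDER BY pat_id, sor.result_time
"""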
Example #2
    def _analyze_predictors_on_holdout(self):
        fm_io = FeatureMatrixIO()

        algorithms_to_test = list()
        algorithms_to_test.extend(SupervisedClassifier.SUPPORTED_ALGORITHMS)

        pipeline_file_name = inspect.getfile(inspect.currentframe())
        data_dir = SupervisedLearningPipeline._fetch_data_dir_path(
            self, pipeline_file_name)
        # for algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:
        #     algorithms_to_test.append('bifurcated-%s' % algorithm)
        log.debug('algorithms_to_test: %s' % algorithms_to_test)
        for algorithm in algorithms_to_test:
            log.info('analyzing %s...' % algorithm)
            # If report_dir does not exist, make it.
            report_dir = '/'.join([data_dir, algorithm])

            pipeline_prefix = '%s-normality-prediction-%s' % (self._var,
                                                              algorithm)

            predictor_path = self._build_model_dump_path(algorithm)

            if os.path.exists(
                    predictor_path) and 'bifurcated' not in algorithm:
                log.debug('Loading model from disk...')
                # TODO(sbala): Fix joblib.load so that it works for bifurcated
                # supervised classifiers.
                self._predictor = joblib.load(predictor_path)
                # self._features = self._X_train.columns
                status = SupervisedClassifier.TRAINED

            SupervisedLearningPipeline._analyze_predictor_holdoutset(
                self, report_dir, pipeline_prefix)
Example #3
def prepare_database(raw_data_files, raw_data_folderpath, db_name, fold_enlarge_data=1, USE_CACHED_DB=False):

    if os.path.exists(os.path.join(raw_data_folderpath, db_name)):
        if USE_CACHED_DB:
            log.info(db_name + " already exists!")
            return
        else:
            os.remove(os.path.join(raw_data_folderpath, db_name))

    if fold_enlarge_data != 1:
        large_data_folderpath = raw_data_folderpath + '/' + 'enlarged_data_by_%s_fold'%str(fold_enlarge_data)

        if not os.path.exists(large_data_folderpath):
            os.mkdir(large_data_folderpath)

        large_data_files = [x.replace('sample','large') for x in raw_data_files]
        # Same file names, different folders
        utils_UMich.create_large_files(raw_data_files, raw_data_folderpath,
                                       large_data_files, large_data_folderpath,
                                       num_repeats=fold_enlarge_data, USE_CACHED_DB=USE_CACHED_DB)
        data_files = large_data_files
        data_folderpath = large_data_folderpath
    else:
        data_files = raw_data_files
        data_folderpath = raw_data_folderpath

    for data_file in data_files:
        if 'encounters' in data_file:
            all_included_order_proc_ids = utils_UMich.raw2db(data_file, data_folderpath, db_path=raw_data_folderpath, db_name=db_name, build_index_patid=True)
        elif 'labs' in data_file:
            utils_UMich.raw2db(data_file, data_folderpath, db_path=raw_data_folderpath, db_name=db_name,
                               build_index_patid=True, collected_included_order_proc_ids=all_included_order_proc_ids)
        else:
            utils_UMich.raw2db(data_file, data_folderpath, db_path=raw_data_folderpath, db_name=db_name,
                               build_index_patid=True)
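A hypothetical invocation follows (file names and paths are placeholders); the encounters file is listed before the labs file so that all_included_order_proc_ids is already populated when the labs file is loaded in the loop above.

raw_data_files = [
    'encounters.sample.txt',    # populates all_included_order_proc_ids
    'labs.sample.txt',          # filtered by the order_proc_ids collected above
    'demographics.sample.txt',
]
prepare_database(raw_data_files,
                 raw_data_folderpath='data/raw',
                 db_name='umich_sample.db',
                 fold_enlarge_data=1,    # use the raw files directly, no enlargement
                 USE_CACHED_DB=True)     # reuse the database if it already exists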
Example #4
def parseLabResultsFile(labFile):
    log.info("Parse lab results file");
    prog = ProgressDots();
    labsByBaseNameByPatientId = dict(); # Dictionary of dictionaries of lists of result items
    for labResult in TabDictReader(labFile):
        if labResult["ord_num_value"] is not None and labResult["ord_num_value"] != NULL_STRING:
            patientId = int(labResult["pat_id"]);
            labBaseName = labResult["base_name"];
            resultValue = float(labResult["ord_num_value"]);
            resultTime = DBUtil.parseDateValue(labResult["result_time"]);

            if resultValue < LAB_SENTINEL_VALUE:    # Skip apparent placeholder values
                labResult["pat_id"] = labResult["patient_id"] = patientId;
                labResult["ord_num_value"] = resultValue;
                labResult["result_time"] = resultTime;

                if patientId not in labsByBaseNameByPatientId:
                    labsByBaseNameByPatientId[patientId] = dict();
                if labBaseName not in labsByBaseNameByPatientId[patientId]:
                    labsByBaseNameByPatientId[patientId][labBaseName] = list();
                labsByBaseNameByPatientId[patientId][labBaseName].append( labResult );

        prog.update();
    prog.printStatus();
    return labsByBaseNameByPatientId;
Example #5
def queryOutpatientIronRx(outputFile, patientById):
    log.info("Query outpatient Iron prescriptions")

    # Medication IDs derived by mapping through Iron as an ingredient
    poIronIngredientMedicationIds = (3065, 3066, 3067, 3071, 3074, 3077, 3986,
                                     7292, 11050, 25006, 26797, 34528, 39676,
                                     78552, 79674, 83568, 84170, 85151, 96118,
                                     112120, 112395, 113213, 126035, 198511,
                                     200455, 201994, 201995, 203679, 207059,
                                     207404, 208037, 208072)
    # Medication IDs directly from prescriptions, formulations that did not map through RxNorm
    poIronDirectMedicationIds = (111354, 540526, 205010, 121171, 111320, 82791,
                                 93962, 201795, 206722, 201068, 116045, 208725,
                                 111341, 206637, 112400, 210256, 77529, 20844,
                                 83798, 205523, 112428, 125474, 111343)
    allEnteralIronMedicationIds = set(poIronIngredientMedicationIds).union(
        poIronDirectMedicationIds)

    formatter = TextResultsFormatter(outputFile)

    colNames = ["pat_id", "ordering_date"]

    query = SQLQuery()
    for col in colNames:
        query.addSelect(col)
    query.addFrom("stride_order_med")
    query.addWhereIn("medication_id", allEnteralIronMedicationIds)
    query.addWhereIn("pat_id", patientById.viewkeys())
    query.addOrderBy("pat_id")
    query.addOrderBy("ordering_date")

    DBUtil.execute(query, includeColumnNames=True, formatter=formatter)
Example #6
    def _get_random_patient_list(self):
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        # Get average number of results for this lab test per patient.
        avg_orders_per_patient = self._get_average_orders_per_patient()
        log.info('avg_orders_per_patient: %s' % avg_orders_per_patient)
        # Based on average # of results, figure out how many patients we'd
        # need to get for a feature matrix of requested size.
        self._num_patients = int(numpy.max([self._num_requested_episodes / \
            avg_orders_per_patient, 1]))

        # Get numPatientsToQuery random patients who have gotten test.
        # TODO(sbala): Have option to feed in a seed for the randomness.
        query = SQLQuery()
        query.addSelect('pat_id')
        query.addFrom('stride_order_proc AS sop')
        query.addWhereIn('proc_code', [self._lab_panel])
        query.addOrderBy('RANDOM()')
        query.setLimit(self._num_patients)
        log.debug('Querying random patient list...')
        results = DBUtil.execute(query)

        # Get patient list.
        random_patient_list = [ row[0] for row in results ]

        return random_patient_list
Example #7
def create_large_files(raw_data_files, raw_data_folderpath,
                       large_data_files, large_data_folderpath, num_repeats=100, USE_CACHED_DB=True):
    import os
    if os.path.exists(large_data_folderpath + '/' + large_data_files[0]):
        if USE_CACHED_DB:
            log.info("Large files exist!")
            return
        else:
            for large_data_file in large_data_files:
                os.remove(large_data_folderpath + '/' + large_data_file)

    if 'labs' not in raw_data_files[0]:
        quit("Please place labs file as the beginning of raw_data_files!")  # TODO: quit(int)?

    # Query all_pat_ids from the raw data. These are presumably small files, so this should not be a problem.
    with open(raw_data_folderpath + '/' + raw_data_files[0]) as f:
        lines_lab = f.readlines()
        f.close()
    all_pat_ids = set(
        [line_str2list(line)[1] for line in lines_lab[1:]])  # set([line.split('|')[1] for line in lines[1:]])

    # Each time, perturb pat_ids in a specific random way, and modify all tables accordingly...
    for _ in range(num_repeats):
        # Create a different perturbation rule each time
        my_dict = {}
        for pat_id in all_pat_ids:
            my_dict[pat_id] = perturb_str(pat_id)

        # For each perturbation, perturb all tables
        for ind in range(len(raw_data_files)):
            raw_file_path = raw_data_folderpath + '/' + raw_data_files[ind]
            target_file_path = large_data_folderpath + '/' + large_data_files[ind]
            perturb_a_file(raw_file_path, target_file_path, col_patid=1, my_dict=my_dict)
Example #8
    def _add_features(self):
        # Add lab panel order features.
        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
            self._factory.addClinicalItemFeatures([self._lab_var],
                                                  features="pre",
                                                  isLabPanel=self._isLabPanel)
        else:
            # TODO: naming
            self._factory.addClinicalItemFeatures_UMich(
                [self._lab_var],
                features="pre",
                clinicalItemType=self._varTypeInTable,
                clinicalItemTime='order_time',
                tableName='labs')  #sx

        # Add lab component result features, for a variety of time deltas.
        LAB_PRE_TIME_DELTAS = [datetime.timedelta(-14)]
        LAB_POST_TIME_DELTA = datetime.timedelta(0)
        log.info('Adding lab component features...')
        for pre_time_delta in LAB_PRE_TIME_DELTAS:
            log.info('\t%s' % pre_time_delta)
            self._factory.addLabResultFeatures(self._lab_components, False,
                                               pre_time_delta,
                                               LAB_POST_TIME_DELTA)

        FeatureMatrix._add_features(self, index_time_col='order_time')
Example #9
File: extractData.py  Project: xxxx3/CDSS
def queryPatients(period, locations, rxCount):
    log.info(
        "Select patients fitting criteria in designated time period: (%s,%s)" %
        period)

    query = SQLQuery()
    query.addSelect("med.pat_id")
    query.addSelect("count(order_med_id)")
    query.addFrom("stride_mapped_meds as map")
    query.addFrom("stride_order_med as med")
    query.addFrom("stride_patient as pat")
    query.addWhere("analysis_status = 1")
    query.addWhere("map.medication_id = med.medication_id")
    query.addWhere("med.pat_id = pat.pat_id")
    query.addWhere("possible_oncology = 0")
    query.addWhereIn("patient_location", locations)
    query.addWhereOp("ordering_datetime", ">", period[0])
    query.addWhereOp("ordering_datetime", "<", period[-1])
    query.addGroupBy("med.pat_id")
    query.addHaving("count(order_med_id) >2")

    results = DBUtil.execute(query)
    cols = ["patientId", "nOpioidRx"]
    patientDF = pd.DataFrame(results, columns=cols)
    #patientDF.set_index("patientId",drop=False,inplace=True);

    patientDF["periodStart"] = period[0]
    # Identify this group of patient records

    return patientDF
Example #10
    def test_train_and_predict(self):
        # Load data set.
        X = DataFrame(RANDOM_CLASSIFICATION_TEST_CASE['X'],
                      columns=['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10'])
        y = DataFrame(RANDOM_CLASSIFICATION_TEST_CASE['y'])
        random_state = RANDOM_CLASSIFICATION_TEST_CASE['random_state']
        expected_y_pred_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['y_predicted']
        expected_str_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['str']
        expected_hyperparams_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['hyperparams']
        expected_params_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['params']
        expected_descriptions_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['description']

        # Generate train/test split.
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

        # Iterate through SUPPORTED_ALGORITHMS.
        for algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:
            log.info('Testing %s classifier...' % algorithm)
            # Train model.
            hyperparams = {'algorithm': algorithm, 'random_state': random_state}
            # Default to stochastic search for expensive algorithms.
            if algorithm in [SupervisedClassifier.RANDOM_FOREST]:
                hyperparams['hyperparam_strategy'] = SupervisedClassifier.STOCHASTIC_SEARCH
                # Test ability to force hyperparam values.
                hyperparams['max_depth'] = 2
                hyperparams['n_estimators'] = 5
                hyperparams['min_samples_leaf'] = 1
                hyperparams['min_samples_split'] = 0.2
            else:
                hyperparams['hyperparam_strategy'] = SupervisedClassifier.EXHAUSTIVE_SEARCH
            classifier = SupervisedClassifier([0, 1], hyperparams)
            classifier.train(X_train, y_train)

            # Test str().
            expected_str = expected_str_by_algorithm[algorithm]
            actual_str = str(classifier)
            self.assertEqual(expected_str, actual_str)

            # Test hyperparameters.
            expected_hyperparams = expected_hyperparams_by_algorithm[algorithm]
            actual_hyperparams = classifier.hyperparams()
            self._assert_equal_hyperparams(expected_hyperparams, actual_hyperparams)

            # Test model parameters.
            expected_params = expected_params_by_algorithm[algorithm]
            actual_params = classifier.params()
            self.assertEqualDict(expected_params, actual_params)

            # Test model description.
            expected_description = expected_descriptions_by_algorithm[algorithm]
            actual_description = classifier.description()
            self.assertEqual(expected_description, actual_description)

            # Test prediction values.
            expected_y_pred = expected_y_pred_by_algorithm[algorithm]
            log.debug('expected_y_pred: %s' % expected_y_pred)
            actual_y_pred = classifier.predict(X_test)
            log.debug('actual_y_pred: %s' % actual_y_pred)
            self.assertEqualList(expected_y_pred, actual_y_pred)
Example #11
def addLabFeatures(labsByBaseNameByPatientId, patientById, colNames, indexItemBaseName, labBaseNames, labPreTime, labPostTime):
    log.info("Sort lab results by result time for each patient and find items within specified time period to aggregate");
    prog = ProgressDots();
    for iPatient, (patientId, labsByBaseName) in enumerate(labsByBaseNameByPatientId.iteritems()):
        # Look for the first result of the index item (ferritin)
        indexItem = None;
        if indexItemBaseName in labsByBaseName:
            for labResult in labsByBaseName[indexItemBaseName]:
                if indexItem is None or labResult["result_time"] < indexItem["result_time"]:
                    indexItem = labResult;

        if indexItem is not None:   # Skip this patient if no index item found, should not be possible since pre-screened for relevant patients
            indexTime = indexItem["result_time"];

            patient = patientById[patientId];
            patient["index_time"] = indexTime;

            preTimeLimit = indexTime+labPreTime;
            postTimeLimit = indexTime+labPostTime;

            # Init values for each lab of interest to an empty list
            for labBaseName in labBaseNames:
                # Default to null for all values
                patient["%s.min" % labBaseName] = None;
                patient["%s.max" % labBaseName] = None;
                patient["%s.median" % labBaseName] = None;
                patient["%s.mean" % labBaseName] = None;
                patient["%s.std" % labBaseName] = None;
                patient["%s.first" % labBaseName] = None;
                patient["%s.last" % labBaseName] = None;
                patient["%s.proximate" % labBaseName] = None;
                
                proximateValue = None;
                if labBaseName in labsByBaseName:   # Not all patients will have all labs checked
                    proximateItem = None;   # Item closest to the index item in time
                    valueList = list();
                    for labResult in labsByBaseName[labBaseName]:
                        resultTime = labResult["result_time"];
                        if preTimeLimit <= resultTime and resultTime < postTimeLimit:
                            # Occurs within time frame of interest, so record this value
                            valueList.append(labResult["ord_num_value"]);
                            
                        if proximateItem is None or (abs(resultTime-indexTime) < abs(proximateItem["result_time"]-indexTime)):
                            proximateItem = labResult;
                    proximateValue = proximateItem["ord_num_value"];
                
                    if len(valueList) > 0:
                        patient["%s.min" % labBaseName] = np.min(valueList);
                        patient["%s.max" % labBaseName] = np.max(valueList);
                        patient["%s.median" % labBaseName] = np.median(valueList);
                        patient["%s.mean" % labBaseName] = np.mean(valueList);
                        patient["%s.std" % labBaseName] = np.std(valueList);
                        patient["%s.first" % labBaseName] = valueList[0];   # Assumes previously sorted 
                        patient["%s.last" % labBaseName] = valueList[-1];   #   by result_time
                    patient["%s.proximate" % labBaseName] = proximateValue;

                prog.update();
    colNames.extend(colsFromLabBaseNames(labBaseNames));
    prog.printStatus();
Example #12
File: StrideLoader.py  Project: xxxx3/CDSS
    def clear_stride_psql_tables():
        log.info('Clearing stride psql tables...')
        for params in list(STRIDE_LOADER_PARAMS.values()):
            psql_table = params['psql_table'] % TABLE_PREFIX
            log.debug('dropping table %s...' % psql_table)
            # load_stride_to_psql is not idempotent, so in case the schema
            # already existed, clear the table (avoid duplicate data).
            DBUtil.execute("DROP TABLE IF EXISTS %s CASCADE;" % psql_table)
Example #13
    def define_virtual_clinical_items():
        log.info('Defining virtual clinical items...')
        # READMISSION
        # Readmission defined as a discharge --> admission.
        # First, define a clinical item for Readmission.
        admission_cic_id_query = \
            """
            SELECT clinical_item_category_id
            FROM clinical_item_category
            WHERE description = 'Admission';
            """
        results = DBUtil.execute(admission_cic_id_query)
        admission_cic_id = results[0][0]
        readmission_definition_command = \
            """
            INSERT INTO clinical_item (
                clinical_item_category_id,
                name,
                description,
                analysis_status
            )
            VALUES (
                %s,
                'READT',
                'Readmission',
                1
            );
            """ % admission_cic_id
        DBUtil.execute(readmission_definition_command)
        # Second, get clinical_item_id for admission, discharge, & readmission.
        admission_ci_id_query = \
            """
            SELECT clinical_item_id
            FROM clinical_item
            WHERE name = 'AnyAdmit';
            """
        results = DBUtil.execute(admission_ci_id_query)
        admission_ci_id = results[0][0]
        discharge_ci_id_query = \
            """
            SELECT clinical_item_id
            FROM clinical_item
            WHERE name = 'ADT12';
            """
        results = DBUtil.execute(discharge_ci_id_query)
        discharge_ci_id = results[0][0]
        readmission_ci_id_query = \
            """
            SELECT clinical_item_id
            FROM clinical_item
            WHERE name = 'READT';
            """
        results = DBUtil.execute(readmission_ci_id_query)
        readmission_ci_id = results[0][0]
        # Third, use TripleAssociationAnalysis to build a new virtual item.
        item_sequence = [discharge_ci_id, admission_ci_id]
        ClinicalItemDataLoader.build_virtual_clinical_item(
            item_sequence, readmission_ci_id)
Example #14
    def raw2db(self,
               data_file,
               data_folderpath,
               db_path,
               db_name,
               build_index_patid=True):
        chunk_size = 100000  # num of rows

        print('Now writing %s into database...' % data_file)  #

        generated_tables = []

        with open(data_folderpath + '/' + data_file) as f:
            is_first_chunk = True
            while True:
                # a chunk of rows
                next_n_lines_str = list(islice(f, chunk_size))
                if not next_n_lines_str:
                    break

                if is_first_chunk:
                    colnames = utils_NonSTRIDE.line_str2list(
                        next_n_lines_str[0], test_mode=False)
                    data_df = self.lines2pd(next_n_lines_str[1:], colnames)
                    #, params_str2list)
                    is_first_chunk = False
                else:  ## make each chunk into pandas
                    data_df = self.lines2pd(next_n_lines_str, colnames)
                    # params_str2list)

                ## append each pandas to db tables
                if LocalEnv.DATASET_SOURCE_NAME == 'UMich':
                    df_name = data_file.replace(".txt", "")
                    df_name = df_name.replace(".sample", "")
                    df_name = df_name.replace(".test", "")
                    df_name = df_name.replace(".large", "")
                    df_name = df_name.replace('.', '_')  # pt.info
                elif LocalEnv.DATASET_SOURCE_NAME == 'UCSF':
                    df_name = data_file.replace(".tsv", "")  #
                    df_name = df_name.replace("_deident", "")
                    df_name = df_name.replace('.', '_')

                generated_tables += self.pd2db(data_df,
                                               df_name=df_name,
                                               db_path=db_path,
                                               db_name=db_name)
                ##

        if build_index_patid:
            conn = sqlite3.connect(db_path + '/' + db_name)

            for generated_table in generated_tables:
                build_index_query = "CREATE INDEX IF NOT EXISTS index_for_%s ON %s (%s);" % (
                    generated_table, generated_table, 'pat_id')
                log.info(build_index_query)
                # print build_index_query
                conn.execute(build_index_query)
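lines2pd and pd2db are project helpers; the underlying pattern -- stream a delimited file in chunks, append each chunk to a SQLite table, then index pat_id -- can be sketched with plain pandas (file, table, and database names here are illustrative):

import sqlite3
import pandas as pd

conn = sqlite3.connect('umich_sample.db')
# Stream a large pipe-delimited extract in 100k-row chunks and append each to one table.
for chunk in pd.read_csv('labs.sample.txt', sep='|', chunksize=100000):
    chunk.to_sql('labs', conn, if_exists='append', index=False)
# Mirror build_index_patid above.
conn.execute("CREATE INDEX IF NOT EXISTS index_for_labs ON labs (pat_id);")
conn.commit()
conn.close()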
Example #15
File: formatData.py  Project: xxxx3/CDSS
def main(argv=None):
    timer = time.time()

    # Final columns to output to patient matrix
    colNames = list()

    patientById = parsePatientFile(stdOpen("patients.tab"), colNames)

    labsByBaseNameByPatientId = parseLabResultsFile(stdOpen("labs.tab"))
    addLabFeatures(labsByBaseNameByPatientId, patientById, colNames,
                   INDEX_ITEM_BASE_NAME, LAB_BASE_NAMES, LAB_PRE_TIME,
                   LAB_POST_TIME)

    log.info(
        "Record presence of items in terms of relative time to each item from index time"
    )
    itemTimesByPatientId = parseClinicalItemFile(stdOpen("admitDx.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ICD9.208-AdmitDx")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("problemListDx.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ICD9.208-ProblemListDx")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("feSO4Rx.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ironSO4")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("allEnteralIron.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ironEnteral")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("ironIV.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ironIV")

    itemTimesByPatientId = parseClinicalItemFile(
        stdOpen("outpatientIronRx.tab"),
        patientIdCol="pat_id",
        timeCol="ordering_date")
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "ironOutpatient")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("transfusions.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames,
                            "RBCTransfusion")

    patientResults = filterPatients(patientById)

    log.info("Output feature matrix file with row per patient")
    featureMatrixFile = stdOpen("featureMatrix.lab14to1day.tab", "w")
    formatter = TextResultsFormatter(featureMatrixFile)
    formatter.formatResultDicts(patientResults, colNames, addHeaderRow=True)

    timer = time.time() - timer
    print("%.3f seconds to complete" % timer, file=sys.stderr)
Example #16
def filterPatients(patientById):
    log.info("Deidentify patient IDs and build data list with adequate data");
    patientResults = list();
    for iPatient, patient in enumerate(patientById.itervalues()):
        # Further deidentify patients by applying sequential ID
        patient["pat_id"] = patient["patient_id"] = iPatient;
        # Only accept patients where an index item and times were found
        if "index_time" in patient:
            patientResults.append(patient);
    return patientResults;
Example #17
def parsePatientFile(patientFile, colNames):
    log.info("Parse patient file");
    patientFile = stdOpen("patients.tab");
    patientById = dict();
    for patient in TabDictReader(patientFile):
        patientId = int(patient["patient_id"]);
        patient["patient_id"] = patientId;
        patientById[patientId] = patient;

    colNames.extend(["patient_id","dialysis","surgery"]);
    return patientById;    
Example #18
    def _tune_hyperparams_regress_and_round(self, X, y):
        self._hyperparams['hyperparam_strategy'] = SupervisedClassifier.EXHAUSTIVE_SEARCH
        log.info('Tuning hyperparams via %s...' % self._hyperparams['hyperparam_strategy'])
        # If not provided, search for best coef_max.
        if self._hyperparams.get('coef_max') is None:
            self._hyperparams['coef_max'] = self._tune_coef_max(X, y)

        # Round linear coefficients.
        self._round_coefs(self._hyperparams['coef_max'])
        log.debug('hyperparams: %s' % self.hyperparams())
        log.debug('params: %s' % self.params())
Example #19
    def _add_features(self):
        # Add lab panel order features.
        self._factory.addClinicalItemFeatures([self._lab_panel], features="pre")

        # Add lab component result features, for a variety of time deltas.
        LAB_PRE_TIME_DELTAS = [datetime.timedelta(-14)]
        LAB_POST_TIME_DELTA = datetime.timedelta(0)
        log.info('Adding lab component features...')
        for pre_time_delta in LAB_PRE_TIME_DELTAS:
            log.info('\t%s' % pre_time_delta)
            self._factory.addLabResultFeatures(self._lab_components, False, pre_time_delta, LAB_POST_TIME_DELTA)

        FeatureMatrix._add_features(self, index_time_col='order_time')
Example #20
    def train(self, X, y, groups=None):

        self._groups = groups
        assert ('pat_id' not in X.columns)

        self._features = X.columns

        y = self._maybe_reshape_y(y)

        # Verify that there are at least 2 samples of each class.
        value_counts = Series(y).value_counts()
        log.debug('y.value_counts(): %s' % value_counts)
        for class_label in self._classes:
            # If there aren't 2+ samples of each class, exit gracefully.
            try:
                num_samples = value_counts[class_label]
                if num_samples < 10:
                    log.error('Insufficient samples (%s) of label %s.' %
                              (num_samples, class_label))
                    return SupervisedClassifier.INSUFFICIENT_SAMPLES
            except KeyError:
                log.error('Insufficient samples (0) of label %s.' %
                          class_label)
                return SupervisedClassifier.INSUFFICIENT_SAMPLES

        log.info('Training %s classifier...' % self._hyperparams['algorithm'])
        if self._hyperparams[
                'algorithm'] == SupervisedClassifier.DECISION_TREE:
            self._train_decision_tree(X, y)
        elif self._hyperparams[
                'algorithm'] == SupervisedClassifier.LOGISTIC_REGRESSION:
            self._train_logistic_regression(X, y)
        elif self._hyperparams[
                'algorithm'] == SupervisedClassifier.RANDOM_FOREST:
            self._train_random_forest(X, y)
        elif self._hyperparams[
                'algorithm'] == SupervisedClassifier.REGRESS_AND_ROUND:
            self._train_regress_and_round(X, y)
        elif self._hyperparams['algorithm'] == SupervisedClassifier.ADABOOST:
            self._train_adaboost(X, y)
        elif self._hyperparams[
                'algorithm'] == SupervisedClassifier.GAUSSIAN_NAIVE_BAYES:
            self._train_gaussian_naive_bayes(X, y)
        elif self._hyperparams['algorithm'] == SupervisedClassifier.SVM:
            self._train_svm(X, y)
        elif self._hyperparams['algorithm'] == SupervisedClassifier.XGB:
            self._train_xgb(X, y)
        elif self._hyperparams['algorithm'] == SupervisedClassifier.NN:
            self._train_nn(X, y)

        return SupervisedClassifier.TRAINED
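The elif chain above maps each SUPPORTED_ALGORITHMS constant to a private trainer; an equivalent dictionary dispatch is sketched below (same behavior, assuming the same _train_* methods exist on the class):

    def train_dispatch(self, X, y):
        # Dispatch-table sketch equivalent to the elif chain in train().
        trainers = {
            SupervisedClassifier.DECISION_TREE: self._train_decision_tree,
            SupervisedClassifier.LOGISTIC_REGRESSION: self._train_logistic_regression,
            SupervisedClassifier.RANDOM_FOREST: self._train_random_forest,
            SupervisedClassifier.REGRESS_AND_ROUND: self._train_regress_and_round,
            SupervisedClassifier.ADABOOST: self._train_adaboost,
            SupervisedClassifier.GAUSSIAN_NAIVE_BAYES: self._train_gaussian_naive_bayes,
            SupervisedClassifier.SVM: self._train_svm,
            SupervisedClassifier.XGB: self._train_xgb,
            SupervisedClassifier.NN: self._train_nn,
        }
        trainers[self._hyperparams['algorithm']](X, y)
        return SupervisedClassifier.TRAINED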
Example #21
def main(argv):
    """Main method, callable from command line"""
    usage_str = "usage: %prog [options] <inputJsonDataFolder> <outputFile>\n" \
                "   <outputFile> CSV file with the usage report."

    parser = OptionParser(usage=usage_str)
    parser.add_option(
        "-g",
        "--graders",
        dest="graders",
        help="Comma-separated list of graders to use for grading")
    parser.add_option(
        "-s",
        "--survey",
        dest="survey_file",
        help="Path to a survey CSV file (used for adding 'resident' column)")

    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + str.join(" ", argv))
    timer = time.time()
    summary_data = {"argv": argv}

    grader_ids = set()
    if not options.graders:  # graders is a mandatory parameter
        print("No graders given. Cannot grade patient cases. Exiting.\n")
        parser.print_help()
        sys.exit()
    else:
        grader_ids.update(options.graders.split(VALUE_DELIM))

    survey_file = None
    if options.survey_file:
        survey_file = options.survey_file

    if len(args) < 2:  # we need input and output files given
        print("Given parameters are not enough. Exiting.\n")
        parser.print_help()
        sys.exit()

    input_folder = args[0]
    output_filename = args[1]

    # Print comment line with arguments to allow for deconstruction later as well as extra results
    print(COMMENT_TAG, json.dumps(summary_data))

    make_usage_report(input_folder, grader_ids, output_filename, survey_file)

    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
Example #22
def queryDemographics(patientDF, baseDate):
    log.info("Populate demographics background for %d patients" % len(patientDF) );
    
    query = SQLQuery();
    query.addSelect("pat_id");
    query.addSelect("%d-birth_year as age" % baseDate.year );
    query.addSelect("gender");
    query.addSelect("primary_race");
    query.addFrom("stride_patient");
    query.addWhereIn("pat_id", patientDF["patientId"] );
    
    results = DBUtil.execute(query);
    cols = ["patientId","age","gender","race"];
    newDF = pd.DataFrame(results,columns=cols);
    return patientDF.merge(newDF, how="left");
Example #23
    def _tune_hyperparams(self, hyperparam_search_space, X, y):
        log.info('Tuning hyperparameters via %s...' %
                 self._hyperparams['hyperparam_strategy'])
        log.debug('hyperparam_search_space: %s' % str(hyperparam_search_space))
        # Log the pre-tuning score.
        self._get_or_set_hyperparam('cv', y)
        log.debug('initial hyperparams: %s' % self._hyperparams)
        pre_tuning_score = np.mean(cross_val_score(self._model, X, y, \
                                    cv=self._hyperparams['cv'], \
                                    groups=self._groups, \
                                    scoring=self._hyperparams['scoring'], \
                                    n_jobs=self._hyperparams['n_jobs']))

        # Initialize hyperparam tuner.
        # Assume the model was initialized before this function.
        if self._hyperparams[
                'hyperparam_strategy'] == SupervisedClassifier.EXHAUSTIVE_SEARCH:
            tuner = GridSearchCV(self._model, hyperparam_search_space, \
                                    scoring=self._hyperparams['scoring'], \
                                    n_jobs=self._hyperparams['n_jobs'], \
                                    iid=False, \
                                    refit=True, \
                                    cv=self._hyperparams['cv'], \
                                    return_train_score=False)
        elif self._hyperparams[
                'hyperparam_strategy'] == SupervisedClassifier.STOCHASTIC_SEARCH:
            self._get_or_set_hyperparam('n_iter')
            tuner = RandomizedSearchCV(self._model, hyperparam_search_space, \
                                        scoring=self._hyperparams['scoring'], \
                                        n_iter=self._hyperparams['n_iter'], \
                                        n_jobs=self._hyperparams['n_jobs'], \
                                        iid=False, \
                                        refit=True, \
                                        cv=self._hyperparams['cv'], \
                                        random_state=self._hyperparams['random_state'], \
                                        return_train_score=False)
        tuner.fit(X, y, groups=self._groups)

        # Set model and hyperparams.
        self._model = tuner.best_estimator_
        for key in tuner.best_params_.keys():
            log.debug('tune(%s): %s --> %s' % (key, self._hyperparams[key], \
                        tuner.best_params_[key]))
            self._hyperparams[key] = tuner.best_params_[key]
        log.debug('tune(%s): %s --> %s' % (self._hyperparams['scoring'],\
                    pre_tuning_score, tuner.best_score_))
        log.debug('hyperparams: %s' % self._hyperparams)
        log.debug('params: %s' % self.params())
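Outside the pipeline, the same exhaustive-vs-stochastic tuning pattern can be reproduced with plain scikit-learn; a minimal, self-contained sketch under that assumption follows (the iid argument used above exists only in older scikit-learn releases and is omitted here):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
model = RandomForestClassifier(random_state=0)
search_space = {'n_estimators': [5, 10, 20], 'max_depth': [2, 4, None]}

# Exhaustive search over every combination (cf. SupervisedClassifier.EXHAUSTIVE_SEARCH).
grid = GridSearchCV(model, search_space, scoring='roc_auc', cv=3, refit=True)
grid.fit(X, y)

# Random sampling of a fixed number of combinations (cf. STOCHASTIC_SEARCH).
rand = RandomizedSearchCV(model, search_space, n_iter=4, scoring='roc_auc',
                          cv=3, refit=True, random_state=0)
rand.fit(X, y)

print(grid.best_params_, grid.best_score_)
print(rand.best_params_, rand.best_score_)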
Example #24
def queryClinicalItems(outputFile, clinicalItemIds, patientById):
    log.info("Query Clinical Items: %s" % str(clinicalItemIds))
    formatter = TextResultsFormatter(outputFile)

    colNames = ["patient_id", "item_date"]

    query = SQLQuery()
    for col in colNames:
        query.addSelect(col)
    query.addFrom("patient_item")
    query.addWhereIn("clinical_item_id", clinicalItemIds)
    query.addWhereIn("patient_id", patientById.viewkeys())
    query.addOrderBy("patient_id")
    query.addOrderBy("item_date")

    DBUtil.execute(query, includeColumnNames=True, formatter=formatter)
Example #25
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "Query for the clinical_item records that exist with the specified criteria\n"+\
                    "usage: %prog [options] [<outputFile>]\n"+\
                    "   <outputFile>    Results file. Leave blank or specify \"-\" to send to stdout.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-i",
            "--itemPrefix",
            dest="itemPrefix",
            help=
            "Look for clinical_items whose description starts with this prefix."
        )
        parser.add_option(
            "-c",
            "--categoryNames",
            dest="categoryNames",
            help=
            "Comma separated list of clinical_item_category.descriptions to look for."
        )
        parser.add_option(
            "-p",
            "--pauseSeconds",
            dest="pauseSeconds",
            default="0",
            help="Number of seconds to pause between processing each record.")
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) > 0:
            outputFile = stdOpen(args[0], "w")

            # Print comment line with arguments to allow for deconstruction later as well as extra results
            summaryData = {
                "argv": argv
            }
            print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

            self.queryItems(options, outputFile)

        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example #26
File: StrideLoader.py  Project: xxxx3/CDSS
    def load_stride_to_psql():
        # Build clean data files.
        StrideLoader.build_clean_csv_files()

        # Build psql schemata.
        StrideLoader.build_stride_psql_schemata()

        # Build paths to clean data files.
        clean_data_dir = StrideLoader.fetch_clean_data_dir()
        for raw_file in sorted(STRIDE_LOADER_PARAMS.keys()):
            params = STRIDE_LOADER_PARAMS[raw_file]

            # Build clean data file.
            clean_file = params['clean_file'] % TABLE_PREFIX
            log.info('loading %s...' % clean_file)
            clean_path = os.path.join(clean_data_dir, clean_file)

            # Uncompress data file.
            unzipped_clean_path = clean_path[:-3]
            with gzip.open(clean_path,
                           'rb') as f_in, open(unzipped_clean_path,
                                               'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

            # psql COPY data from clean files into DB.
            psql_table = params['psql_table'] % TABLE_PREFIX
            log.debug('stride/data/clean/%s ==> %s' % (clean_file, psql_table))
            # In some cases, two files going to the same table will have
            # non-identical column names. Pass these explicitly so that
            # psql knows which columns to try to fill from file.
            # Strip the newline character.
            with open(unzipped_clean_path, 'r') as f_in:
                columns = f_in.readline()[:-1]
            command = "COPY %s (%s) FROM '%s' WITH (FORMAT csv, HEADER);" % (
                psql_table, columns, unzipped_clean_path)
            DBUtil.execute(command)

            # Delete unzipped_clean_path.
            os.remove(unzipped_clean_path)

        # Run any one-off postprocessing transformations which all users
        # of the STRIDE database should receive. Defer any application-specific
        # transformations to other modules.
        StrideLoader.process_stride_psql_db()

        # Build indices.
        StrideLoader.build_stride_psql_indices()
Example #27
    def _get_random_patient_list(self):
        #sx: this function avoids using RANDOM() on the database
        cursor = self._connection.cursor()

        # Get average number of results for this lab test per patient.
        query = SQLQuery()
        query.addSelect('pat_id')
        query.addSelect('COUNT(sop.order_proc_id) AS num_orders')
        query.addFrom('stride_order_proc AS sop')
        query.addFrom('stride_order_results AS sor')
        query.addWhere('sop.order_proc_id = sor.order_proc_id')
        ##
        query.addWhereIn("base_name", [self._component])
        query.addGroupBy('pat_id')
        log.debug('Querying median orders per patient...')

        results = DBUtil.execute(query)

        order_counts = [ row[1] for row in results ]


        if len(results) == 0:
            error_msg = '0 orders for component "%s."' % self._component #sx
            log.critical(error_msg)
            sys.exit('[ERROR] %s' % error_msg)
        else:
            avg_orders_per_patient = numpy.median(order_counts)
            log.info('avg_orders_per_patient: %s' % avg_orders_per_patient)
            # Based on average # of results, figure out how many patients we'd
            # need to get for a feature matrix of requested size.
            self._num_patients = int(numpy.max([self._num_requested_episodes / \
                avg_orders_per_patient, 1]))
            # Some components may have fewer associated patients than the required sample size
            patient_number_chosen = min([len(results),self._num_patients]) #
            inds_random_patients = numpy.random.choice(len(results), size=patient_number_chosen, replace=False)
            # print 'inds_random_patients:', inds_random_patients
            pat_IDs_random_patients = []
            for ind in inds_random_patients:
                pat_IDs_random_patients.append(results[ind][0])
            # print pat_IDs_random_patients
            return pat_IDs_random_patients
Example #28
    def _add_features(self):
        # Add past susceptibility readings
        self._add_susc_features()

        # Add past antibiotic use as features
        self._add_med_features()

        # Add lab panel order features.
        self._factory.addClinicalItemFeatures(self._lab_panel, features="pre")

        # Add lab component result features, for a variety of time deltas.
        LAB_PRE_TIME_DELTAS = [datetime.timedelta(-14)]
        LAB_POST_TIME_DELTA = datetime.timedelta(0)
        log.info('Adding lab component features...')
        for pre_time_delta in LAB_PRE_TIME_DELTAS:
            log.info('\t%s' % pre_time_delta)
            self._factory.addLabResultFeatures(self._lab_components, False,
                                               pre_time_delta,
                                               LAB_POST_TIME_DELTA)

        FeatureMatrix._add_features(self, index_time_col='shifted_order_time')
Example #29
def queryDrugScreens( patientDF, period, locations ):
    log.info("Populate drug screens by primary locations");

    query = SQLQuery();
    query.addSelect("pat_id");
    query.addSelect("count(distinct order_proc_id)");
    query.addFrom("stride_order_proc_drug_screen");
    query.addWhere("ordering_mode = 'Outpatient'");
    query.addWhereIn("patient_location", locations );
    query.addWhereOp("ordering_date",">", period[0]);
    query.addWhereOp("ordering_date","<", period[-1]);
    query.addWhereIn("pat_id", patientDF["patientId"] );
    query.addGroupBy("pat_id");

    results = DBUtil.execute(query);
    cols = ["patientId","nDrugScreens"];
    newDF = pd.DataFrame(results,columns=cols);
    patientDF = patientDF.merge(newDF, how="left");
    patientDF["nDrugScreens"][np.isnan(patientDF["nDrugScreens"])] = 0;    # Populate default values if no data
    patientDF["nDrugScreens"] = patientDF["nDrugScreens"].astype("int");    # Beware of float conversion somewhere
    return patientDF;
Example #30
def main_quickTest(argv):
    modelFilename = argv[1]
    modeler = TopicModel()

    timer = time.time()
    (model, docCountByWordId) = modeler.loadModelAndDocCounts(modelFilename)
    timer = time.time() - timer
    log.info("%.2f seconds to load", timer)

    timer = time.time()
    weightByItemIdByTopicId = modeler.generateWeightByItemIdByTopicId(
        model, 100)
    timer = time.time() - timer
    log.info("%.2f seconds to generate weights", timer)

    for i in xrange(3):
        prog = ProgressDots()
        for (topicId, weightByItemId) in weightByItemIdByTopicId.iteritems():
            for (itemId, itemWeight) in weightByItemId.iteritems():
                prog.update()
        prog.printStatus()
    """