def queryLabResults(outputFile, patientById):
    log.info("Query out lab results, takes a while")
    labBaseNames = \
        (   'ferritin','fe','trfrn','trfsat','ystfrr',
            'wbc','hgb','hct','mcv','rdw','plt',
            'retic','reticab','ldh','hapto','tbil','ibil','dbil',
            'cr','esr','crp'
        )

    formatter = TextResultsFormatter(outputFile)

    # Query is rapid when filtering by lab result type, limited to X records.
    # Filtering by patient ID drags down substantially unless the table is
    # preloaded, e.g., by doing a count on the SOR table first?
    colNames = ["pat_id", "base_name", "common_name", "ord_num_value",
                "reference_unit", "result_flag", "sor.result_time"]

    query = SQLQuery()
    for col in colNames:
        query.addSelect(col)
    query.addFrom("stride_order_results as sor, stride_order_proc as sop")
    query.addWhere("sor.order_proc_id = sop.order_proc_id")
    query.addWhereIn("base_name", labBaseNames)
    query.addWhereIn("pat_id", patientById.viewkeys())
    query.addOrderBy("pat_id")
    query.addOrderBy("sor.result_time")

    DBUtil.execute(query, includeColumnNames=True, formatter=formatter)
def _analyze_predictors_on_holdout(self):
    fm_io = FeatureMatrixIO()

    algorithms_to_test = list()
    algorithms_to_test.extend(SupervisedClassifier.SUPPORTED_ALGORITHMS)

    pipeline_file_name = inspect.getfile(inspect.currentframe())
    data_dir = SupervisedLearningPipeline._fetch_data_dir_path(self, pipeline_file_name)

    # for algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:
    #     algorithms_to_test.append('bifurcated-%s' % algorithm)
    log.debug('algorithms_to_test: %s' % algorithms_to_test)

    for algorithm in algorithms_to_test:
        log.info('analyzing %s...' % algorithm)

        # If report_dir does not exist, make it.
        report_dir = '/'.join([data_dir, algorithm])
        if not os.path.exists(report_dir):
            os.makedirs(report_dir)

        pipeline_prefix = '%s-normality-prediction-%s' % (self._var, algorithm)

        predictor_path = self._build_model_dump_path(algorithm)
        if os.path.exists(predictor_path) and 'bifurcated' not in algorithm:
            log.debug('Loading model from disk...')
            # TODO(sbala): Fix joblib.load so that it works for bifurcated
            # supervised classifiers.
            self._predictor = joblib.load(predictor_path)
            # self._features = self._X_train.columns
            status = SupervisedClassifier.TRAINED

        SupervisedLearningPipeline._analyze_predictor_holdoutset(self, report_dir, pipeline_prefix)
def prepare_database(raw_data_files, raw_data_folderpath, db_name,
                     fold_enlarge_data=1, USE_CACHED_DB=False):
    if os.path.exists(os.path.join(raw_data_folderpath, db_name)):
        if USE_CACHED_DB:
            log.info(db_name + " already exists!")
            return
        else:
            os.remove(os.path.join(raw_data_folderpath, db_name))

    if fold_enlarge_data != 1:
        large_data_folderpath = raw_data_folderpath + '/' + 'enlarged_data_by_%s_fold' % str(fold_enlarge_data)
        if not os.path.exists(large_data_folderpath):
            os.mkdir(large_data_folderpath)
        # Same file names, different folders
        large_data_files = [x.replace('sample', 'large') for x in raw_data_files]
        utils_UMich.create_large_files(raw_data_files, raw_data_folderpath,
                                       large_data_files, large_data_folderpath,
                                       num_repeats=fold_enlarge_data,
                                       USE_CACHED_DB=USE_CACHED_DB)
        data_files = large_data_files
        data_folderpath = large_data_folderpath
    else:
        data_files = raw_data_files
        data_folderpath = raw_data_folderpath

    for data_file in data_files:
        if 'encounters' in data_file:
            all_included_order_proc_ids = utils_UMich.raw2db(
                data_file, data_folderpath, db_path=raw_data_folderpath,
                db_name=db_name, build_index_patid=True)
        elif 'labs' in data_file:
            utils_UMich.raw2db(data_file, data_folderpath,
                               db_path=raw_data_folderpath, db_name=db_name,
                               build_index_patid=True,
                               collected_included_order_proc_ids=all_included_order_proc_ids)
        else:
            utils_UMich.raw2db(data_file, data_folderpath,
                               db_path=raw_data_folderpath, db_name=db_name,
                               build_index_patid=True)
def parseLabResultsFile(labFile):
    log.info("Parse lab results file");
    prog = ProgressDots();
    labsByBaseNameByPatientId = dict();    # Dictionary of dictionaries of lists of result items
    for labResult in TabDictReader(labFile):
        if labResult["ord_num_value"] is not None and labResult["ord_num_value"] != NULL_STRING:
            patientId = int(labResult["pat_id"]);
            labBaseName = labResult["base_name"];
            resultValue = float(labResult["ord_num_value"]);
            resultTime = DBUtil.parseDateValue(labResult["result_time"]);

            if resultValue < LAB_SENTINEL_VALUE:    # Skip apparent placeholder values
                labResult["pat_id"] = labResult["patient_id"] = patientId;
                labResult["ord_num_value"] = resultValue;
                labResult["result_time"] = resultTime;

                if patientId not in labsByBaseNameByPatientId:
                    labsByBaseNameByPatientId[patientId] = dict();
                if labBaseName not in labsByBaseNameByPatientId[patientId]:
                    labsByBaseNameByPatientId[patientId][labBaseName] = list();
                labsByBaseNameByPatientId[patientId][labBaseName].append( labResult );
        prog.update();
    prog.printStatus();
    return labsByBaseNameByPatientId;
def queryOutpatientIronRx(outputFile, patientById):
    log.info("Query outpatient Iron prescriptions")

    # Medication IDs derived by mapping through Iron as an ingredient
    poIronIngredientMedicationIds = (3065, 3066, 3067, 3071, 3074, 3077, 3986,
                                     7292, 11050, 25006, 26797, 34528, 39676,
                                     78552, 79674, 83568, 84170, 85151, 96118,
                                     112120, 112395, 113213, 126035, 198511,
                                     200455, 201994, 201995, 203679, 207059,
                                     207404, 208037, 208072)
    # Medication IDs directly from prescriptions, formulations that did not
    # map through RxNorm
    poIronDirectMedicationIds = (111354, 540526, 205010, 121171, 111320, 82791,
                                 93962, 201795, 206722, 201068, 116045, 208725,
                                 111341, 206637, 112400, 210256, 77529, 20844,
                                 83798, 205523, 112428, 125474, 111343)
    allEnteralIronMedicationIds = set(poIronIngredientMedicationIds).union(poIronDirectMedicationIds)

    formatter = TextResultsFormatter(outputFile)

    colNames = ["pat_id", "ordering_date"]

    query = SQLQuery()
    for col in colNames:
        query.addSelect(col)
    query.addFrom("stride_order_med")
    query.addWhereIn("medication_id", allEnteralIronMedicationIds)
    query.addWhereIn("pat_id", patientById.viewkeys())
    query.addOrderBy("pat_id")
    query.addOrderBy("ordering_date")

    DBUtil.execute(query, includeColumnNames=True, formatter=formatter)
def _get_random_patient_list(self):
    # Initialize DB cursor.
    cursor = self._connection.cursor()

    # Get average number of results for this lab test per patient.
    avg_orders_per_patient = self._get_average_orders_per_patient()
    log.info('avg_orders_per_patient: %s' % avg_orders_per_patient)
    # Based on average # of results, figure out how many patients we'd
    # need to get for a feature matrix of requested size.
    self._num_patients = int(numpy.max([self._num_requested_episodes / \
                                        avg_orders_per_patient, 1]))

    # Get numPatientsToQuery random patients who have gotten test.
    # TODO(sbala): Have option to feed in a seed for the randomness.
    query = SQLQuery()
    query.addSelect('pat_id')
    query.addFrom('stride_order_proc AS sop')
    query.addWhereIn('proc_code', [self._lab_panel])
    query.addOrderBy('RANDOM()')
    query.setLimit(self._num_patients)

    log.debug('Querying random patient list...')
    results = DBUtil.execute(query)

    # Get patient list.
    random_patient_list = [row[0] for row in results]

    return random_patient_list
def create_large_files(raw_data_files, raw_data_folderpath,
                       large_data_files, large_data_folderpath,
                       num_repeats=100, USE_CACHED_DB=True):
    import os
    if os.path.exists(large_data_folderpath + '/' + large_data_files[0]):
        if USE_CACHED_DB:
            log.info("Large files exist!")
            return
        else:
            for large_data_file in large_data_files:
                os.remove(large_data_folderpath + '/' + large_data_file)

    if 'labs' not in raw_data_files[0]:
        quit("Please place labs file as the beginning of raw_data_files!")  # TODO: quit(int)?

    # Query all_pat_ids from raw_data. Presumably small files, so reading them
    # into memory should not be a problem.
    with open(raw_data_folderpath + '/' + raw_data_files[0]) as f:
        lines_lab = f.readlines()
    all_pat_ids = set([line_str2list(line)[1] for line in lines_lab[1:]])
    # set([line.split('|')[1] for line in lines[1:]])

    # Each time, perturb pat_ids in a specific random way, and modify all tables accordingly...
    for _ in range(num_repeats):
        # Create a different perturbation rule each time
        my_dict = {}
        for pat_id in all_pat_ids:
            my_dict[pat_id] = perturb_str(pat_id)

        # For each perturbation, perturb all tables
        for ind in range(len(raw_data_files)):
            raw_file_path = raw_data_folderpath + '/' + raw_data_files[ind]
            target_file_path = large_data_folderpath + '/' + large_data_files[ind]
            perturb_a_file(raw_file_path, target_file_path, col_patid=1, my_dict=my_dict)
def _add_features(self):
    # Add lab panel order features.
    if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
        self._factory.addClinicalItemFeatures([self._lab_var], features="pre",
                                              isLabPanel=self._isLabPanel)
    else:  # TODO: naming
        self._factory.addClinicalItemFeatures_UMich([self._lab_var], features="pre",
                                                    clinicalItemType=self._varTypeInTable,
                                                    clinicalItemTime='order_time',
                                                    tableName='labs')  # sx

    # Add lab component result features, for a variety of time deltas.
    LAB_PRE_TIME_DELTAS = [datetime.timedelta(-14)]
    LAB_POST_TIME_DELTA = datetime.timedelta(0)
    log.info('Adding lab component features...')
    for pre_time_delta in LAB_PRE_TIME_DELTAS:
        log.info('\t%s' % pre_time_delta)
        self._factory.addLabResultFeatures(self._lab_components, False,
                                           pre_time_delta, LAB_POST_TIME_DELTA)

    FeatureMatrix._add_features(self, index_time_col='order_time')
def queryPatients(period, locations, rxCount):
    log.info("Select patients fitting criteria in designated time period: (%s,%s)" % period)

    query = SQLQuery()
    query.addSelect("med.pat_id")
    query.addSelect("count(order_med_id)")
    query.addFrom("stride_mapped_meds as map")
    query.addFrom("stride_order_med as med")
    query.addFrom("stride_patient as pat")
    query.addWhere("analysis_status = 1")
    query.addWhere("map.medication_id = med.medication_id")
    query.addWhere("med.pat_id = pat.pat_id")
    query.addWhere("possible_oncology = 0")
    query.addWhereIn("patient_location", locations)
    query.addWhereOp("ordering_datetime", ">", period[0])
    query.addWhereOp("ordering_datetime", "<", period[-1])
    query.addGroupBy("med.pat_id")
    query.addHaving("count(order_med_id) >2")
    results = DBUtil.execute(query)
    cols = ["patientId", "nOpioidRx"]
    patientDF = pd.DataFrame(results, columns=cols)
    # patientDF.set_index("patientId",drop=False,inplace=True);
    patientDF["periodStart"] = period[0]  # Identify this group of patient records
    return patientDF
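# Rough sketch (not taken from the source) of the SQL that the SQLQuery builder
# above is expected to assemble, assuming clauses are rendered in the order they
# are added; placeholders in angle brackets stand for the bound parameters:
#
#   SELECT med.pat_id, count(order_med_id)
#   FROM stride_mapped_meds as map, stride_order_med as med, stride_patient as pat
#   WHERE analysis_status = 1
#     AND map.medication_id = med.medication_id
#     AND med.pat_id = pat.pat_id
#     AND possible_oncology = 0
#     AND patient_location IN (<locations>)
#     AND ordering_datetime > <period[0]> AND ordering_datetime < <period[-1]>
#   GROUP BY med.pat_id
#   HAVING count(order_med_id) > 2
#
# The result rows are then loaded into a DataFrame with columns (patientId, nOpioidRx).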
def test_train_and_predict(self):
    # Load data set.
    X = DataFrame(RANDOM_CLASSIFICATION_TEST_CASE['X'],
                  columns=['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10'])
    y = DataFrame(RANDOM_CLASSIFICATION_TEST_CASE['y'])
    random_state = RANDOM_CLASSIFICATION_TEST_CASE['random_state']
    expected_y_pred_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['y_predicted']
    expected_str_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['str']
    expected_hyperparams_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['hyperparams']
    expected_params_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['params']
    expected_descriptions_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['description']

    # Generate train/test split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

    # Iterate through SUPPORTED_ALGORITHMS.
    for algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:
        log.info('Testing %s classifier...' % algorithm)
        # Train model.
        hyperparams = {'algorithm': algorithm, 'random_state': random_state}
        # Default to stochastic search for expensive algorithms.
        if algorithm in [SupervisedClassifier.RANDOM_FOREST]:
            hyperparams['hyperparam_strategy'] = SupervisedClassifier.STOCHASTIC_SEARCH
            # Test ability to force hyperparam values.
            hyperparams['max_depth'] = 2
            hyperparams['n_estimators'] = 5
            hyperparams['min_samples_leaf'] = 1
            hyperparams['min_samples_split'] = 0.2
        else:
            hyperparams['hyperparam_strategy'] = SupervisedClassifier.EXHAUSTIVE_SEARCH
        classifier = SupervisedClassifier([0, 1], hyperparams)
        classifier.train(X_train, y_train)

        # Test str().
        expected_str = expected_str_by_algorithm[algorithm]
        actual_str = str(classifier)
        self.assertEqual(expected_str, actual_str)

        # Test hyperparameters.
        expected_hyperparams = expected_hyperparams_by_algorithm[algorithm]
        actual_hyperparams = classifier.hyperparams()
        self._assert_equal_hyperparams(expected_hyperparams, actual_hyperparams)

        # Test model parameters.
        expected_params = expected_params_by_algorithm[algorithm]
        actual_params = classifier.params()
        self.assertEqualDict(expected_params, actual_params)

        # Test model description.
        expected_description = expected_descriptions_by_algorithm[algorithm]
        actual_description = classifier.description()
        self.assertEqual(expected_description, actual_description)

        # Test prediction values.
        expected_y_pred = expected_y_pred_by_algorithm[algorithm]
        log.debug('expected_y_pred: %s' % expected_y_pred)
        actual_y_pred = classifier.predict(X_test)
        log.debug('actual_y_pred: %s' % actual_y_pred)
        self.assertEqualList(expected_y_pred, actual_y_pred)
def addLabFeatures(labsByBaseNameByPatientId, patientById, colNames, indexItemBaseName, labBaseNames, labPreTime, labPostTime):
    log.info("Sort lab results by result time for each patient and find items within specified time period to aggregate");
    prog = ProgressDots();
    for iPatient, (patientId, labsByBaseName) in enumerate(labsByBaseNameByPatientId.iteritems()):
        # Look for the first result of the index item (ferritin)
        indexItem = None;
        if indexItemBaseName in labsByBaseName:
            for labResult in labsByBaseName[indexItemBaseName]:
                if indexItem is None or labResult["result_time"] < indexItem["result_time"]:
                    indexItem = labResult;

        if indexItem is not None:   # Skip this patient if no index item found, should not be possible since pre-screened for relevant patients
            indexTime = indexItem["result_time"];

            patient = patientById[patientId];
            patient["index_time"] = indexTime;

            preTimeLimit = indexTime + labPreTime;
            postTimeLimit = indexTime + labPostTime;

            # Init values for each lab of interest to an empty list
            for labBaseName in labBaseNames:
                # Default to null for all values
                patient["%s.min" % labBaseName] = None;
                patient["%s.max" % labBaseName] = None;
                patient["%s.median" % labBaseName] = None;
                patient["%s.mean" % labBaseName] = None;
                patient["%s.std" % labBaseName] = None;
                patient["%s.first" % labBaseName] = None;
                patient["%s.last" % labBaseName] = None;
                patient["%s.proximate" % labBaseName] = None;

                proximateValue = None;
                if labBaseName in labsByBaseName:   # Not all patients will have all labs checked
                    proximateItem = None;   # Item closest to the index item in time
                    valueList = list();
                    for labResult in labsByBaseName[labBaseName]:
                        resultTime = labResult["result_time"];
                        if preTimeLimit <= resultTime and resultTime < postTimeLimit:
                            # Occurs within time frame of interest, so record this value
                            valueList.append(labResult["ord_num_value"]);
                        if proximateItem is None or (abs(resultTime - indexTime) < abs(proximateItem["result_time"] - indexTime)):
                            proximateItem = labResult;
                    proximateValue = proximateItem["ord_num_value"];

                    if len(valueList) > 0:
                        patient["%s.min" % labBaseName] = np.min(valueList);
                        patient["%s.max" % labBaseName] = np.max(valueList);
                        patient["%s.median" % labBaseName] = np.median(valueList);
                        patient["%s.mean" % labBaseName] = np.mean(valueList);
                        patient["%s.std" % labBaseName] = np.std(valueList);
                        patient["%s.first" % labBaseName] = valueList[0];    # Assumes previously sorted
                        patient["%s.last" % labBaseName] = valueList[-1];    # by result_time
                    patient["%s.proximate" % labBaseName] = proximateValue;

        prog.update();

    colNames.extend(colsFromLabBaseNames(labBaseNames));
    prog.printStatus();
def clear_stride_psql_tables():
    log.info('Clearing stride psql tables...')
    for params in list(STRIDE_LOADER_PARAMS.values()):
        psql_table = params['psql_table'] % TABLE_PREFIX
        log.debug('dropping table %s...' % psql_table)
        # load_stride_to_psql is not idempotent, so in case the schema already
        # existed, clear the table (avoid duplicate data).
        DBUtil.execute("DROP TABLE IF EXISTS %s CASCADE;" % psql_table)
def define_virtual_clinical_items():
    log.info('Defining virtual clinical items...')
    # READMISSION
    # Readmission defined as a discharge --> admission.
    # First, define a clinical item for Readmission.
    admission_cic_id_query = \
        """
        SELECT clinical_item_category_id
        FROM clinical_item_category
        WHERE description = 'Admission';
        """
    results = DBUtil.execute(admission_cic_id_query)
    admission_cic_id = results[0][0]
    readmission_definition_command = \
        """
        INSERT INTO clinical_item (
            clinical_item_category_id,
            name,
            description,
            analysis_status
        )
        VALUES (
            %s,
            'READT',
            'Readmission',
            1
        );
        """ % admission_cic_id
    DBUtil.execute(readmission_definition_command)

    # Second, get clinical_item_id for admission, discharge, & readmission.
    admission_ci_id_query = \
        """
        SELECT clinical_item_id
        FROM clinical_item
        WHERE name = 'AnyAdmit';
        """
    results = DBUtil.execute(admission_ci_id_query)
    admission_ci_id = results[0][0]
    discharge_ci_id_query = \
        """
        SELECT clinical_item_id
        FROM clinical_item
        WHERE name = 'ADT12';
        """
    results = DBUtil.execute(discharge_ci_id_query)
    discharge_ci_id = results[0][0]
    readmission_ci_id_query = \
        """
        SELECT clinical_item_id
        FROM clinical_item
        WHERE name = 'READT';
        """
    results = DBUtil.execute(readmission_ci_id_query)
    readmission_ci_id = results[0][0]

    # Third, use TripleAssociationAnalysis to build a new virtual item.
    item_sequence = [discharge_ci_id, admission_ci_id]
    ClinicalItemDataLoader.build_virtual_clinical_item(item_sequence, readmission_ci_id)
def raw2db(self, data_file, data_folderpath, db_path, db_name, build_index_patid=True):
    chunk_size = 100000  # num of rows
    print('Now writing %s into database...' % data_file)

    generated_tables = []

    with open(data_folderpath + '/' + data_file) as f:
        is_first_chunk = True
        while True:
            # Read a chunk of rows
            next_n_lines_str = list(islice(f, chunk_size))
            if not next_n_lines_str:
                break

            if is_first_chunk:
                colnames = utils_NonSTRIDE.line_str2list(next_n_lines_str[0], test_mode=False)
                data_df = self.lines2pd(next_n_lines_str[1:], colnames)  # , params_str2list)
                is_first_chunk = False
            else:
                # Make each chunk into pandas
                data_df = self.lines2pd(next_n_lines_str, colnames)  # , params_str2list)

            # Append each pandas chunk to db tables
            if LocalEnv.DATASET_SOURCE_NAME == 'UMich':
                df_name = data_file.replace(".txt", "")
                df_name = df_name.replace(".sample", "")
                df_name = df_name.replace(".test", "")
                df_name = df_name.replace(".large", "")
                df_name = df_name.replace('.', '_')  # pt.info
            elif LocalEnv.DATASET_SOURCE_NAME == 'UCSF':
                df_name = data_file.replace(".tsv", "")
                # df_name = df_name.replace("_deident", "")
                df_name = df_name.replace('.', '_')

            generated_tables += self.pd2db(data_df, df_name=df_name, db_path=db_path, db_name=db_name)

    if build_index_patid:
        conn = sqlite3.connect(db_path + '/' + db_name)
        for generated_table in generated_tables:
            build_index_query = "CREATE INDEX IF NOT EXISTS index_for_%s ON %s (%s);" % (
                generated_table, generated_table, 'pat_id')
            log.info(build_index_query)
            conn.execute(build_index_query)
        # Persist the index creation and release the connection.
        conn.commit()
        conn.close()
def main(argv=None):
    timer = time.time()

    # Final columns to output to patient matrix
    colNames = list()

    patientById = parsePatientFile(stdOpen("patients.tab"), colNames)

    labsByBaseNameByPatientId = parseLabResultsFile(stdOpen("labs.tab"))
    addLabFeatures(labsByBaseNameByPatientId, patientById, colNames,
                   INDEX_ITEM_BASE_NAME, LAB_BASE_NAMES, LAB_PRE_TIME, LAB_POST_TIME)

    log.info("Record presence of items in terms of relative time to each item from index time")
    itemTimesByPatientId = parseClinicalItemFile(stdOpen("admitDx.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames, "ICD9.208-AdmitDx")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("problemListDx.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames, "ICD9.208-ProblemListDx")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("feSO4Rx.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames, "ironSO4")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("allEnteralIron.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames, "ironEnteral")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("ironIV.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames, "ironIV")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("outpatientIronRx.tab"),
                                                 patientIdCol="pat_id",
                                                 timeCol="ordering_date")
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames, "ironOutpatient")

    itemTimesByPatientId = parseClinicalItemFile(stdOpen("transfusions.tab"))
    addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames, "RBCTransfusion")

    patientResults = filterPatients(patientById)

    log.info("Output feature matrix file with row per patient")
    featureMatrixFile = stdOpen("featureMatrix.lab14to1day.tab", "w")
    formatter = TextResultsFormatter(featureMatrixFile)
    formatter.formatResultDicts(patientResults, colNames, addHeaderRow=True)

    timer = time.time() - timer
    print("%.3f seconds to complete" % timer, file=sys.stderr)
def filterPatients(patientById):
    log.info("Deidentify patient IDs and build data list with adequate data");
    patientResults = list();
    for iPatient, patient in enumerate(patientById.itervalues()):
        # Further deidentify patients by applying sequential ID
        patient["pat_id"] = patient["patient_id"] = iPatient;

        # Only accept patients where an index item and times were found
        if "index_time" in patient:
            patientResults.append(patient);
    return patientResults;
def parsePatientFile(patientFile, colNames):
    log.info("Parse patient file");
    patientById = dict();
    for patient in TabDictReader(patientFile):
        patientId = int(patient["patient_id"]);
        patient["patient_id"] = patientId;
        patientById[patientId] = patient;
    colNames.extend(["patient_id","dialysis","surgery"]);
    return patientById;
def _tune_hyperparams_regress_and_round(self, X, y):
    self._hyperparams['hyperparam_strategy'] = SupervisedClassifier.EXHAUSTIVE_SEARCH
    log.info('Tuning hyperparams via %s...' % self._hyperparams['hyperparam_strategy'])
    # If not provided, search for best coef_max.
    if self._hyperparams.get('coef_max') is None:
        self._hyperparams['coef_max'] = self._tune_coef_max(X, y)

    # Round linear coefficients.
    self._round_coefs(self._hyperparams['coef_max'])
    log.debug('hyperparams: %s' % self.hyperparams())
    log.debug('params: %s' % self.params())
def _add_features(self):
    # Add lab panel order features.
    self._factory.addClinicalItemFeatures([self._lab_panel], features="pre")

    # Add lab component result features, for a variety of time deltas.
    LAB_PRE_TIME_DELTAS = [datetime.timedelta(-14)]
    LAB_POST_TIME_DELTA = datetime.timedelta(0)
    log.info('Adding lab component features...')
    for pre_time_delta in LAB_PRE_TIME_DELTAS:
        log.info('\t%s' % pre_time_delta)
        self._factory.addLabResultFeatures(self._lab_components, False,
                                           pre_time_delta, LAB_POST_TIME_DELTA)

    FeatureMatrix._add_features(self, index_time_col='order_time')
def train(self, X, y, groups=None):
    self._groups = groups
    assert 'pat_id' not in X.columns

    self._features = X.columns
    y = self._maybe_reshape_y(y)

    # Verify that there are enough samples of each class.
    value_counts = Series(y).value_counts()
    log.debug('y.value_counts(): %s' % value_counts)
    for class_label in self._classes:
        # If there aren't at least 10 samples of each class, exit gracefully.
        try:
            num_samples = value_counts[class_label]
            if num_samples < 10:
                log.error('Insufficient samples (%s) of label %s.' % (num_samples, class_label))
                return SupervisedClassifier.INSUFFICIENT_SAMPLES
        except KeyError:
            log.error('Insufficient samples (0) of label %s.' % class_label)
            return SupervisedClassifier.INSUFFICIENT_SAMPLES

    log.info('Training %s classifier...' % self._hyperparams['algorithm'])
    if self._hyperparams['algorithm'] == SupervisedClassifier.DECISION_TREE:
        self._train_decision_tree(X, y)
    elif self._hyperparams['algorithm'] == SupervisedClassifier.LOGISTIC_REGRESSION:
        self._train_logistic_regression(X, y)
    elif self._hyperparams['algorithm'] == SupervisedClassifier.RANDOM_FOREST:
        self._train_random_forest(X, y)
    elif self._hyperparams['algorithm'] == SupervisedClassifier.REGRESS_AND_ROUND:
        self._train_regress_and_round(X, y)
    elif self._hyperparams['algorithm'] == SupervisedClassifier.ADABOOST:
        self._train_adaboost(X, y)
    elif self._hyperparams['algorithm'] == SupervisedClassifier.GAUSSIAN_NAIVE_BAYES:
        self._train_gaussian_naive_bayes(X, y)
    elif self._hyperparams['algorithm'] == SupervisedClassifier.SVM:
        self._train_svm(X, y)
    elif self._hyperparams['algorithm'] == SupervisedClassifier.XGB:
        self._train_xgb(X, y)
    elif self._hyperparams['algorithm'] == SupervisedClassifier.NN:
        self._train_nn(X, y)

    return SupervisedClassifier.TRAINED
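# Callers are expected to branch on the returned status constant rather than
# assume training succeeded. A minimal usage sketch (hypothetical caller code,
# assuming only the constants and the train()/predict() signatures shown above):
#
#   status = classifier.train(X_train, y_train, groups=train_groups)
#   if status == SupervisedClassifier.INSUFFICIENT_SAMPLES:
#       log.warning('Skipping training: too few samples of at least one class.')
#   elif status == SupervisedClassifier.TRAINED:
#       y_pred = classifier.predict(X_test)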
def main(argv):
    """Main method, callable from command line"""
    usage_str = "usage: %prog [options] <inputJsonDataFolder> <outputFile>\n" \
                "   <outputFile>    CSV file with the usage report."
    parser = OptionParser(usage=usage_str)
    parser.add_option("-g", "--graders", dest="graders",
                      help="Comma-separated list of graders to use for grading")
    parser.add_option("-s", "--survey", dest="survey_file",
                      help="Path to a survey CSV file (used for adding 'resident' column)")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + str.join(" ", argv))
    timer = time.time()
    summary_data = {"argv": argv}

    grader_ids = set()
    if not options.graders:
        # graders is a mandatory parameter
        print("No graders given. Cannot grade patient cases. Exiting.\n")
        parser.print_help()
        sys.exit()
    else:
        grader_ids.update(options.graders.split(VALUE_DELIM))

    survey_file = None
    if options.survey_file:
        survey_file = options.survey_file

    if len(args) < 2:
        # We need both input and output files given
        print("Not enough arguments given. Exiting.\n")
        parser.print_help()
        sys.exit()

    input_folder = args[0]
    output_filename = args[1]

    # Print comment line with arguments to allow for deconstruction later as well as extra results
    print(COMMENT_TAG, json.dumps(summary_data))

    make_usage_report(input_folder, grader_ids, output_filename, survey_file)

    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
def queryDemographics(patientDF, baseDate):
    log.info("Populate demographics background for %d patients" % len(patientDF) );

    query = SQLQuery();
    query.addSelect("pat_id");
    query.addSelect("%d-birth_year as age" % baseDate.year );
    query.addSelect("gender");
    query.addSelect("primary_race");
    query.addFrom("stride_patient");
    query.addWhereIn("pat_id", patientDF["patientId"] );

    results = DBUtil.execute(query);
    cols = ["patientId","age","gender","race"];
    newDF = pd.DataFrame(results, columns=cols);
    return patientDF.merge(newDF, how="left");
def _tune_hyperparams(self, hyperparam_search_space, X, y):
    log.info('Tuning hyperparameters via %s...' % self._hyperparams['hyperparam_strategy'])
    log.debug('hyperparam_search_space: %s' % str(hyperparam_search_space))
    # Log the pre-tuning score.
    self._get_or_set_hyperparam('cv', y)
    log.debug('initial hyperparams: %s' % self._hyperparams)
    pre_tuning_score = np.mean(cross_val_score(self._model, X, y,
                                               cv=self._hyperparams['cv'],
                                               groups=self._groups,
                                               scoring=self._hyperparams['scoring'],
                                               n_jobs=self._hyperparams['n_jobs']))

    # Initialize hyperparam tuner.
    # Assume the model was initialized before this function.
    if self._hyperparams['hyperparam_strategy'] == SupervisedClassifier.EXHAUSTIVE_SEARCH:
        tuner = GridSearchCV(self._model, hyperparam_search_space,
                             scoring=self._hyperparams['scoring'],
                             n_jobs=self._hyperparams['n_jobs'],
                             iid=False,
                             refit=True,
                             cv=self._hyperparams['cv'],
                             return_train_score=False)
    elif self._hyperparams['hyperparam_strategy'] == SupervisedClassifier.STOCHASTIC_SEARCH:
        self._get_or_set_hyperparam('n_iter')
        tuner = RandomizedSearchCV(self._model, hyperparam_search_space,
                                   scoring=self._hyperparams['scoring'],
                                   n_iter=self._hyperparams['n_iter'],
                                   n_jobs=self._hyperparams['n_jobs'],
                                   iid=False,
                                   refit=True,
                                   cv=self._hyperparams['cv'],
                                   random_state=self._hyperparams['random_state'],
                                   return_train_score=False)
    tuner.fit(X, y, groups=self._groups)

    # Set model and hyperparams.
    self._model = tuner.best_estimator_
    for key in tuner.best_params_.keys():
        log.debug('tune(%s): %s --> %s' % (key, self._hyperparams[key],
                                           tuner.best_params_[key]))
        self._hyperparams[key] = tuner.best_params_[key]
    log.debug('tune(%s): %s --> %s' % (self._hyperparams['scoring'],
                                       pre_tuning_score, tuner.best_score_))
    log.debug('hyperparams: %s' % self._hyperparams)
    log.debug('params: %s' % self.params())
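# For illustration only: a hypothetical hyperparam_search_space argument (the
# parameter names below are examples, not the search spaces defined elsewhere in
# this class). GridSearchCV enumerates every combination in the dict, while
# RandomizedSearchCV samples n_iter combinations from the same dict:
#
#   hyperparam_search_space = {
#       'max_depth': [2, 3, 4, None],
#       'n_estimators': [10, 50, 100],
#       'min_samples_leaf': [1, 5, 10],
#   }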
def queryClinicalItems(outputFile, clinicalItemIds, patientById):
    log.info("Query Clinical Items: %s" % str(clinicalItemIds))
    formatter = TextResultsFormatter(outputFile)

    colNames = ["patient_id", "item_date"]

    query = SQLQuery()
    for col in colNames:
        query.addSelect(col)
    query.addFrom("patient_item")
    query.addWhereIn("clinical_item_id", clinicalItemIds)
    query.addWhereIn("patient_id", patientById.viewkeys())
    query.addOrderBy("patient_id")
    query.addOrderBy("item_date")

    DBUtil.execute(query, includeColumnNames=True, formatter=formatter)
def main(self, argv):
    """Main method, callable from command line"""
    usageStr = "Query for the clinical_item records that exist with the specified criteria\n" + \
               "usage: %prog [options] [<outputFile>]\n" + \
               "   <outputFile>    Results file. Leave blank or specify \"-\" to send to stdout.\n"
    parser = OptionParser(usage=usageStr)
    parser.add_option("-i", "--itemPrefix", dest="itemPrefix",
                      help="Look for clinical_items whose description starts with this prefix.")
    parser.add_option("-c", "--categoryNames", dest="categoryNames",
                      help="Comma separated list of clinical_item_category.descriptions to look for.")
    parser.add_option("-p", "--pauseSeconds", dest="pauseSeconds", default="0",
                      help="Number of seconds to pause between processing each record.")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + str.join(" ", argv))
    timer = time.time()
    if len(args) > 0:
        outputFile = stdOpen(args[0], "w")

        # Print comment line with arguments to allow for deconstruction later as well as extra results
        summaryData = {"argv": argv}
        print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

        self.queryItems(options, outputFile)
    else:
        parser.print_help()
        sys.exit(-1)

    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
def load_stride_to_psql():
    # Build clean data files.
    StrideLoader.build_clean_csv_files()

    # Build psql schemata.
    StrideLoader.build_stride_psql_schemata()

    # Build paths to clean data files.
    clean_data_dir = StrideLoader.fetch_clean_data_dir()
    for raw_file in sorted(STRIDE_LOADER_PARAMS.keys()):
        params = STRIDE_LOADER_PARAMS[raw_file]

        # Build clean data file.
        clean_file = params['clean_file'] % TABLE_PREFIX
        log.info('loading %s...' % clean_file)
        clean_path = os.path.join(clean_data_dir, clean_file)

        # Uncompress data file.
        unzipped_clean_path = clean_path[:-3]
        with gzip.open(clean_path, 'rb') as f_in, open(unzipped_clean_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

        # psql COPY data from clean files into DB.
        psql_table = params['psql_table'] % TABLE_PREFIX
        log.debug('stride/data/clean/%s ==> %s' % (clean_file, psql_table))
        # In some cases, two files going to the same table will have
        # non-identical column names. Pass these explicitly so that
        # psql knows which columns to try to fill from file.
        # Strip the newline character.
        with open(unzipped_clean_path, 'r') as f_in:
            columns = f_in.readline()[:-1]
        command = "COPY %s (%s) FROM '%s' WITH (FORMAT csv, HEADER);" % (
            psql_table, columns, unzipped_clean_path)
        DBUtil.execute(command)

        # Delete unzipped_clean_path.
        os.remove(unzipped_clean_path)

    # Run any one-off postprocessing transformations which all users
    # of the STRIDE database should receive. Defer any application-specific
    # transformations to other modules.
    StrideLoader.process_stride_psql_db()

    # Build indices.
    StrideLoader.build_stride_psql_indices()
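# For illustration only (table name, columns, and path below are hypothetical),
# the COPY command assembled above ends up looking roughly like:
#
#   COPY stride_patient (pat_id,birth_year,gender,primary_race)
#   FROM '/path/to/stride/data/clean/stride_patient.csv'
#   WITH (FORMAT csv, HEADER);
#
# where the column list is taken from the header row of the unzipped clean file.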
def _get_random_patient_list(self):
    # sx: this function is for avoiding RANDOM() on the database
    cursor = self._connection.cursor()

    # Get average number of results for this lab test per patient.
    query = SQLQuery()
    query.addSelect('pat_id')
    query.addSelect('COUNT(sop.order_proc_id) AS num_orders')
    query.addFrom('stride_order_proc AS sop')
    query.addFrom('stride_order_results AS sor')
    query.addWhere('sop.order_proc_id = sor.order_proc_id')
    ## query.addWhereIn("base_name", [self._component])
    query.addGroupBy('pat_id')
    log.debug('Querying median orders per patient...')
    results = DBUtil.execute(query)

    order_counts = [row[1] for row in results]

    if len(results) == 0:
        error_msg = '0 orders for component "%s."' % self._component  # sx
        log.critical(error_msg)
        sys.exit('[ERROR] %s' % error_msg)
    else:
        avg_orders_per_patient = numpy.median(order_counts)
        log.info('avg_orders_per_patient: %s' % avg_orders_per_patient)

        # Based on average # of results, figure out how many patients we'd
        # need to get for a feature matrix of requested size.
        self._num_patients = int(numpy.max([self._num_requested_episodes / \
                                            avg_orders_per_patient, 1]))
        # Some components may have fewer associated patients than the required sample size
        patient_number_chosen = min([len(results), self._num_patients])

        inds_random_patients = numpy.random.choice(len(results), size=patient_number_chosen, replace=False)

        pat_IDs_random_patients = []
        for ind in inds_random_patients:
            pat_IDs_random_patients.append(results[ind][0])

        return pat_IDs_random_patients
def _add_features(self):
    # Add past susceptibility readings
    self._add_susc_features()

    # Add past antibiotic use as features
    self._add_med_features()

    # Add lab panel order features.
    self._factory.addClinicalItemFeatures(self._lab_panel, features="pre")

    # Add lab component result features, for a variety of time deltas.
    LAB_PRE_TIME_DELTAS = [datetime.timedelta(-14)]
    LAB_POST_TIME_DELTA = datetime.timedelta(0)
    log.info('Adding lab component features...')
    for pre_time_delta in LAB_PRE_TIME_DELTAS:
        log.info('\t%s' % pre_time_delta)
        self._factory.addLabResultFeatures(self._lab_components, False,
                                           pre_time_delta, LAB_POST_TIME_DELTA)

    FeatureMatrix._add_features(self, index_time_col='shifted_order_time')
def queryDrugScreens( patientDF, period, locations ):
    log.info("Populate drug screens by primary locations");

    query = SQLQuery();
    query.addSelect("pat_id");
    query.addSelect("count(distinct order_proc_id)");
    query.addFrom("stride_order_proc_drug_screen");
    query.addWhere("ordering_mode = 'Outpatient'");
    query.addWhereIn("patient_location", locations );
    query.addWhereOp("ordering_date", ">", period[0]);
    query.addWhereOp("ordering_date", "<", period[-1]);
    query.addWhereIn("pat_id", patientDF["patientId"] );
    query.addGroupBy("pat_id");

    results = DBUtil.execute(query);
    cols = ["patientId","nDrugScreens"];
    newDF = pd.DataFrame(results, columns=cols);
    patientDF = patientDF.merge(newDF, how="left");
    patientDF["nDrugScreens"][np.isnan(patientDF["nDrugScreens"])] = 0;    # Populate default values if no data
    patientDF["nDrugScreens"] = patientDF["nDrugScreens"].astype("int");    # Beware of float conversion somewhere
    return patientDF;
def main_quickTest(argv):
    modelFilename = argv[1]
    modeler = TopicModel()

    timer = time.time()
    (model, docCountByWordId) = modeler.loadModelAndDocCounts(modelFilename)
    timer = time.time() - timer
    log.info("%.2f seconds to load", timer)

    timer = time.time()
    weightByItemIdByTopicId = modeler.generateWeightByItemIdByTopicId(model, 100)
    timer = time.time() - timer
    log.info("%.2f seconds to generate weights", timer)

    for i in xrange(3):
        prog = ProgressDots()
        for (topicId, weightByItemId) in weightByItemIdByTopicId.iteritems():
            for (itemId, itemWeight) in weightByItemId.iteritems():
                prog.update()
        prog.printStatus()