class FeatureMatrix:
    """Pipeline scaffolding for assembling a feature matrix around a target
    variable.

    Wraps a FeatureMatrixFactory: clients query patient episodes from the
    database, layer clinical feature columns onto them (demographics, labs,
    flowsheets, treatment teams, comorbidities, time cycles), then write the
    assembled matrix plus a provenance header to a .tab file.

    NOTE(review): this module targets Python 2 (`basestring` below) —
    confirm before running under Python 3.
    """

    def __init__(self, variable, num_data_points, params=None):
        """
        variable: name of the target variable the matrix is built around.
        num_data_points: requested number of rows (patient episodes).
        params: optional dict of extra pipeline options; echoed into the
            header's "Command:" line by _build_file_summary().
        """
        # Process arguments.
        self._var = variable
        self._num_rows = num_data_points
        if params is None:
            self._params = {}
        else:
            self._params = params

        # Initialize FeatureMatrixFactory.
        self._factory = FeatureMatrixFactory()

        # Initialize DB connection.
        self._connection = DBUtil.connection()

    def _query_patient_episodes(self, query, pat_id_col=None,
                                index_time_col=None):
        """Execute `query` (raw SQL string or SQLQuery-like object with
        .params) and feed the cursor to the factory as patient-episode input.

        Returns the number of episodes processed.
        """
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        # Fetch and return results.
        log.info('query: %s' % str(query))
        if isinstance(query, basestring):
            # Plain SQL string: execute as-is.
            cursor.execute(query)
        else:
            # Parameterized query object: bind its params.
            log.info('query.params: %s' % str(query.params))
            cursor.execute(str(query), query.params)

        # Parse arguments.
        if pat_id_col is None:
            pat_id_col = 'pat_id'
        if index_time_col is None:
            index_time_col = 'index_time'

        self._factory.setPatientEpisodeInput(cursor, pat_id_col,
                                             index_time_col)
        num_episodes = self._factory.processPatientEpisodeInput()
        return num_episodes

    def _querystr_patient_episodes(self, querystr, pat_id_col=None,
                                   index_time_col=None):
        """Same as _query_patient_episodes() but only accepts a raw SQL
        string. Returns the number of episodes processed.
        """
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        # Fetch and return results.
        log.info('query: %s' % querystr)
        cursor.execute(querystr)

        # Parse arguments.
        if pat_id_col is None:
            pat_id_col = 'pat_id'
        if index_time_col is None:
            index_time_col = 'index_time'

        self._factory.setPatientEpisodeInput(cursor, pat_id_col,
                                             index_time_col)
        num_episodes = self._factory.processPatientEpisodeInput()
        return num_episodes

    def _add_features(self, index_time_col=None):
        """Add the standard feature set appropriate for the configured
        data source (STRIDE / UMich / UCSF).
        """
        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
            self._add_time_features(index_time_col)
            self._add_demographic_features()
            self._add_treatment_team_features()
            self._add_comorbidity_features()
            self._add_flowsheet_features()
            self._add_lab_component_features()
        else:  # elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
            self._add_time_features(index_time_col)
            self._add_demographic_features()
            self._add_comorbidity_features()
            self._add_lab_component_features()
            if LocalEnv.DATASET_SOURCE_NAME == 'UCSF':
                # UCSF additionally has treatment team and flowsheet data.
                self._add_treatment_team_features()
                self._add_flowsheet_features()

    def _add_time_features(self, index_time_col=None):
        """Add admit-date clinical item features and month/hour time-cycle
        features keyed on index_time_col (default 'index_time').
        """
        log.info('Adding admit date features...')
        # Add admission date.
        ADMIT_DX_CATEGORY_ID = 2
        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
            self._factory.addClinicalItemFeaturesByCategory(
                [ADMIT_DX_CATEGORY_ID],
                dayBins=[], label='AdmitDxDate', features='pre')
        else:  # elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
            self._factory.addClinicalItemFeaturesByCategory_UMich(
                [ADMIT_DX_CATEGORY_ID],
                dayBins=[], label='AdmitDxDate', features='pre',
                tableName='encounters')

        # Add time cycle features.
        log.info('Adding time cycle features...')
        if index_time_col is None:
            index_time_col = 'index_time'
        self._factory.addTimeCycleFeatures(index_time_col, 'month')
        self._factory.addTimeCycleFeatures(index_time_col, 'hour')

    def _add_demographic_features(self):
        """Add lifespan (birth/death), sex, and race features."""
        log.info('Adding demographic features...')
        # Add birth and death.
        self._add_lifespan_features()
        # Add sex features.
        self._add_sex_features()
        # Add race features.
        self._add_race_features()

    def _add_lifespan_features(self):
        """Add birth (pre) and, for STRIDE, death (post) features."""
        log.info('Adding lifespan features...')
        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':  # TODO
            self._factory.addClinicalItemFeatures(['Birth'], dayBins=[],
                                                  features="pre")
            self._factory.addClinicalItemFeatures(['Death'], dayBins=[],
                                                  features="post")
        else:  # elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
            self._factory.addClinicalItemFeatures_UMich(
                ['Birth'], dayBins=[], features="pre",
                clinicalItemType=None, clinicalItemTime='Birth',
                tableName='pt_info')

    def _add_sex_features(self):
        """Add Male/Female indicator features."""
        log.info('Adding sex features...')
        SEX_FEATURES = ["Male", "Female"]
        for feature in SEX_FEATURES:
            if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':  # TODO
                self._factory.addClinicalItemFeatures([feature], dayBins=[],
                                                      features="pre")
            else:  # elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
                self._factory.addClinicalItemFeatures_UMich(
                    [feature], dayBins=[], features="pre",
                    clinicalItemType='GenderName', clinicalItemTime=None,
                    tableName="demographics")

    def _add_race_features(self):
        """Add one indicator feature per race found in the source data."""
        log.info('Adding race features...')
        for feature in self._factory.queryAllRaces():
            if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':  # TODO
                self._factory.addClinicalItemFeatures([feature], dayBins=[],
                                                      features="pre")
            else:  # elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
                self._factory.addClinicalItemFeatures_UMich(
                    [feature], dayBins=[], features="pre",
                    clinicalItemType='RaceName', clinicalItemTime=None,
                    tableName='demographics')

    def _add_treatment_team_features(self):
        """Add treatment team features (pre-index only)."""
        log.info('Adding treatment team features...')
        self._factory.addTreatmentTeamFeatures(features="pre")

    def _add_comorbidity_features(self):
        """Add Charlson comorbidity features (pre-index only)."""
        log.info('Adding comorbidity features...')
        self._factory.addCharlsonComorbidityFeatures(features='pre')

    def _add_flowsheet_features(self):
        """Add vital-sign flowsheet features for the 3 days preceding each
        episode's index time.
        """
        log.info('Adding flowsheet features...')
        # Look at flowsheet results from the previous days
        FLOW_PRE_TIME_DELTAS = [datetime.timedelta(-3)]
        # Don't look into the future, otherwise cheating the prediction
        FLOW_POST_TIME_DELTA = datetime.timedelta(0)

        # Add flowsheet features for a variety of generally useful vitals.
        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
            BASIC_FLOWSHEET_FEATURES = [
                "BP_High_Systolic", "BP_Low_Diastolic", "FiO2",
                "Glasgow Coma Scale Score", "Pulse", "Resp", "Temp", "Urine"
            ]
        elif LocalEnv.DATASET_SOURCE_NAME == 'UCSF':
            BASIC_FLOWSHEET_FEATURES = [
                'SBP', 'DBP', 'FiO2', 'Pulse', 'Resp', 'Temp', 'o2flow'
            ]
        else:
            # No flowsheet mapping for this data source (e.g. UMich).
            # Previously this fell through and raised NameError below.
            BASIC_FLOWSHEET_FEATURES = []

        for pre_time_delta in FLOW_PRE_TIME_DELTAS:
            log.info('\t\tpreTimeDelta: %s' % pre_time_delta)
            self._factory.addFlowsheetFeatures(BASIC_FLOWSHEET_FEATURES,
                                               pre_time_delta,
                                               FLOW_POST_TIME_DELTA)

    def _add_lab_component_features(self):
        """Add lab result features for a source-specific panel of common
        components, looking back 14 days and never forward.
        """
        # Look at lab results from the previous days
        LAB_PRE_TIME_DELTAS = [datetime.timedelta(-14)]
        # Don't look into the future, otherwise cheating the prediction
        LAB_POST_TIME_DELTA = datetime.timedelta(0)

        # Add result features for a variety of generally useful components.
        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
            BASIC_LAB_COMPONENTS = [
                'WBC',  # White Blood Cell
                'HCT',  # Hematocrit
                'PLT',  # Platelet Count
                'NA',  # Sodium, Whole Blood
                'K',  # Potassium, Whole Blood
                'CO2',  # CO2, Serum/Plasma
                'BUN',  # Blood Urea Nitrogen
                'CR',  # Creatinine
                'TBIL',  # Total Bilirubin
                'ALB',  # Albumin
                'CA',  # Calcium
                'LAC',  # Lactic Acid
                'ESR',  # Erythrocyte Sedimentation Rate
                'CRP',  # C-Reactive Protein
                'TNI',  # Troponin I
                'PHA',  # Arterial pH
                'PO2A',  # Arterial pO2
                'PCO2A',  # Arterial pCO2
                'PHV',  # Venous pH
                'PO2V',  # Venous pO2
                'PCO2V'  # Venous pCO2
            ]
        elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
            BASIC_LAB_COMPONENTS = [
                'WBC',  # White Blood Cell
                'HCT',  # Hematocrit
                'PLT',  # Platelet Count
                'SOD',  # Sodium, Whole Blood
                'POT',  # Potassium, Whole Blood
                'CO2',  # CO2, Serum/Plasma
                'UN',  # Blood Urea Nitrogen
                'CREAT',  # Creatinine
                'TBIL',  # Total Bilirubin
                'ALB',  # Albumin
                'CAL',  # Calcium
                'LACTA',  # Lactic Acid; LACTA & LACTV are more frequent
                "WEST",  # Erythrocyte Sedimentation Rate
                'CRP',  # C-Reactive Protein
                'TROP',  # Troponin I
                'pHA',  # Arterial pH
                'PO2AA',  # Arterial pO2
                'PCOAA2',  # Arterial pCO2
                'pHV',  # Venous pH
                'pO2V',  # Venous pO2
                'pCO2V',  # Venous pCO2
            ]
        elif LocalEnv.DATASET_SOURCE_NAME == 'UCSF':
            BASIC_LAB_COMPONENTS = [
                'WBC',  # White Blood Cell
                'HCT',  # Hematocrit
                'PLT',  # Platelet Count
                'NAWB',  # Sodium, Whole Blood
                'K',  # Potassium, Whole Blood
                'CO2',  # CO2, Serum/Plasma
                'BUN',  # Blood Urea Nitrogen
                'CREAT',  # Creatinine
                'TBILI',  # Total Bilirubin
                'ALB',  # Albumin
                'CA',  # Calcium
                'LACTWB',  # Lactic Acid; LACTA & LACTV are more frequent
                "ESR",  # Erythrocyte Sedimentation Rate
                'CRP',  # C-Reactive Protein
                'TRPI',  # Troponin I
                'PH37',  # Arterial pH
                'PO2',  # Arterial pO2
                'PCO2'  # Arterial pCO2
                # 'pHV',  # Venous pH
                # 'pO2V',  # Venous pO2
                # 'pCO2V',  # Venous pCO2
            ]

        log.info('Adding lab component features...')
        for component in BASIC_LAB_COMPONENTS:
            log.info('\t%s' % component)
            for preTimeDelta in LAB_PRE_TIME_DELTAS:
                log.info('\t\t%s' % preTimeDelta)
                self._factory.addLabResultFeatures([component], False,
                                                   preTimeDelta,
                                                   LAB_POST_TIME_DELTA)

    def _build_matrix(self, header=None, dest_path=None):
        """Delegate matrix assembly to the factory."""
        log.info('Building matrix...')
        self._factory.buildFeatureMatrix(header, dest_path)

    def write_matrix(self, dest_path, header=None):
        """Copy the factory's matrix file to dest_path, dropping '#'
        comment lines, then delete the intermediate file.

        header is currently unused; retained for interface compatibility.
        """
        log.info('Writing matrix file...')
        # Get old matrix file.
        source_path = self._factory.getMatrixFileName()
        # Write to new matrix file. Use context managers so both handles
        # are closed deterministically (the original leaked both).
        with open(dest_path, 'w') as matrix_file:
            with open(source_path, 'r') as source_file:
                for line in source_file:
                    if line[0] != '#':
                        matrix_file.write(line)
        # Delete old matrix file.
        os.remove(source_path)

    def _build_matrix_header(self, params=None):
        """Assemble the comment-header lines for the matrix file.

        Recognized params keys:
            matrix_path, matrix_module: used for the file summary.
            data_overview: list of str description lines.
            field_summary: list of str description lines.
            include_clinical_item_suffix_summary: True/False.
            include_lab_suffix_summary: True/False.
        """
        if params is None:
            # Fall back to pipeline-level params; the original raised
            # TypeError when invoked with the default argument.
            params = self._params
        header = list()

        file_summary = self._build_file_summary(params['matrix_path'],
                                                params['matrix_module'])
        header.extend(file_summary)
        header.extend([''])
        if params.get('data_overview'):
            header.extend(params['data_overview'])
            header.extend([''])
        if params.get('field_summary'):
            header.extend(params['field_summary'])
            header.extend([''])
        if params.get('include_clinical_item_suffix_summary'):
            ci_suffix_summary = self._build_clinical_item_suffix_summary()
            header.extend(ci_suffix_summary)
            header.extend([''])
        if params.get('include_lab_suffix_summary'):
            lab_suffix_summary = \
                self._build_flowsheet_and_lab_result_suffix_summary()
            header.extend(lab_suffix_summary)
            header.extend([''])

        return header

    def _build_file_summary(self, matrix_path, matrix_module):
        """Return provenance lines: file name, creation timestamp, source
        module, and the pipeline invocation that produced the matrix.
        """
        summary = list()

        # <file_name.tab>
        matrix_name = matrix_path.split('/')[-1]
        summary.append(matrix_name)
        # Created: <timestamp>
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
        summary.append('Created: %s' % timestamp)
        # Source: __name__
        module_name = matrix_module.split('/')[-1]
        summary.append('Source: %s' % module_name)
        # Command: Pipeline()
        class_name = module_name.split('.')[0]
        args = [self._var, str(self._num_rows)]
        # Iterate key/value pairs explicitly; iterating the dict directly
        # (as the original did) yields only keys and fails to unpack.
        for key, value in self._params.items():
            args.append('%s=%s' % (key, value))
        command = '%s(%s)' % (class_name, ', '.join(args))
        summary.append('Command: %s' % command)

        return summary

    def _build_clinical_item_suffix_summary(self):
        """Return header lines explaining [clinical_item] column suffixes."""
        summary = list()
        summary.append(
            ' [clinical_item] fields may have the following suffixes:')
        summary.append(
            ' ___.pre - how many times has this occurred before order_time?'
        )
        summary.append(
            ' ___.pre.Xd - how many times has this occurred within X days before index_time?'
        )
        summary.append(
            ' ___.preTimeDays - how many days before order_time was last occurrence?'
        )
        return summary

    def _build_flowsheet_and_lab_result_suffix_summary(self):
        """Return header lines explaining [flowsheet]/[lab_result] column
        suffixes.
        """
        summary = list()
        summary.append(
            ' [flowsheet] and [lab_result] fields may have the following suffixes:'
        )
        summary.append(
            ' ___.X_Y.count - # of result values between X and Y days of index_time.'
        )
        summary.append(
            ' ___.X_Y.countInRange - # of result values in normal range.')
        summary.append(' ___.X_Y.min - minimum result value.')
        summary.append(' ___.X_Y.max - maximum result value.')
        summary.append(' ___.X_Y.median - median result value.')
        summary.append(
            ' ___.X_Y.std - standard deviation of result values.')
        summary.append(' ___.X_Y.first - first result value.')
        summary.append(' ___.X_Y.last - last result value.')
        summary.append(
            ' ___.X_Y.diff - difference between penultimate and proximate values.'
        )
        summary.append(
            ' ___.X_Y.slope - slope between penultimate and proximate values.'
        )
        summary.append(
            ' ___.X_Y.proximate - closest result value to order_time.')
        summary.append(
            ' ___.X_Y.firstTimeDays - time between first and order_time.')
        summary.append(
            ' ___.X_Y.lastTimeDays - time between last and order_time.')
        summary.append(
            ' ___.X_Y.proximateTimeDays - time between proximate and order_time.'
        )
        return summary
class TestFeatureMatrixFactory(DBTestCase): def setUp(self): """Prepare state for test cases.""" DBTestCase.setUp(self) StrideLoader.build_stride_psql_schemata() ClinicalItemDataLoader.build_clinical_item_psql_schemata() self._deleteTestRecords() self._insertTestRecords() self.factory = FeatureMatrixFactory() self.connection = DBUtil.connection() # Setup a common connection for test cases to work with, can catch in finally tearDown method to close/cleanup def _insertTestRecords(self): """Populate database for with patient data.""" # Populate clinical_item_category. testRecords = FM_TEST_INPUT_TABLES.get("clinical_item_category") DBUtil.insertFile(StringIO(testRecords), "clinical_item_category", \ delim="\t") # Populate clinical_item. testRecords = FM_TEST_INPUT_TABLES.get("clinical_item") DBUtil.insertFile(StringIO(testRecords), "clinical_item", delim="\t") # Populate patient_item. testRecords = FM_TEST_INPUT_TABLES.get("patient_item") DBUtil.insertFile(StringIO(testRecords), "patient_item", delim="\t", \ dateColFormats={"item_date": None}) # Populate stride_order_proc. testRecords = FM_TEST_INPUT_TABLES.get("stride_order_proc") DBUtil.insertFile(StringIO(testRecords), "stride_order_proc", \ delim="\t", \ dateColFormats={"item_date": None}) # Populate stride_order_results. testRecords = FM_TEST_INPUT_TABLES.get("stride_order_results") DBUtil.insertFile(StringIO(testRecords), "stride_order_results", \ delim="\t", dateColFormats={"result_time": None}) # Populate stride_flowsheet. testRecords = FM_TEST_INPUT_TABLES.get("stride_flowsheet") DBUtil.insertFile(StringIO(testRecords), "stride_flowsheet", \ delim="\t", \ dateColFormats={"shifted_record_dt_tm": None}) # Populate stride_order_med. 
testRecords = FM_TEST_INPUT_TABLES.get("stride_order_med") DBUtil.insertFile(StringIO(testRecords), "stride_order_med", \ delim="\t", dateColFormats = {"start_taking_time": None, \ "end_taking_time": None}) def _deleteTestRecords(self): """Delete test records from database.""" DBUtil.execute("delete from stride_order_med where order_med_id < 0") DBUtil.execute("delete from stride_flowsheet where flo_meas_id < 0") DBUtil.execute( "delete from stride_order_results where order_proc_id < 0") DBUtil.execute("delete from stride_order_proc where order_proc_id < 0") DBUtil.execute("delete from patient_item where clinical_item_id < 0") # Must delete from clinical_item_assocatiation in order to make CDSS # test suite pass. Other suites may update this table. DBUtil.execute( "delete from clinical_item_association where clinical_item_id < 0") DBUtil.execute("delete from clinical_item where clinical_item_id < 0") DBUtil.execute( "delete from clinical_item_category where clinical_item_category_id < 0" ) def tearDown(self): """Restore state from any setUp or test steps.""" self._deleteTestRecords() # Clean up files that might have lingered from failed tests. try: os.remove("patient_list.tsv") except: pass try: self.factory.cleanTempFiles() except: pass try: os.remove(self.factory.getMatrixFileName()) except: pass try: os.remove("extractor.feature_matrix.tab.gz") except: pass self.connection.close() DBTestCase.tearDown(self) def test_dbCache(self): """Test database result caching.""" factory = FeatureMatrixFactory(cacheDBResults=False) self.assertEqual(factory.dbCache, None) factory = FeatureMatrixFactory() self.assertEqual(type(factory.dbCache), type(dict())) def test_processPatientListInput(self): """Test processPatientListInput().""" # Verify FeatureMatrixFactory throws Error if patientListInput # has not been set. with self.assertRaises(ValueError): self.factory.processPatientListInput() # Initialize DB cursor. 
cursor = self.connection.cursor() # Build SQL query for list of patients. patientListQuery = SQLQuery() patientListQuery.addSelect("CAST(pat_id AS bigint)") patientListQuery.addFrom("stride_order_proc") patientListQuery.addWhere("proc_code = 'LABMETB'") patientListQuery.addGroupBy("pat_id") patientListQuery.addOrderBy("1 ASC") cursor.execute(str(patientListQuery), patientListQuery.params) # Set and process patientListInput. self.factory.setPatientListInput(cursor, "pat_id") self.factory.processPatientListInput() resultPatientIterator = self.factory.getPatientListIterator() # Verify results. expectedPatientList = ["-789", "-456", "-123"] for expectedPatientId in expectedPatientList: resultPatientId = resultPatientIterator.next()['pat_id'] self.assertEqual(resultPatientId, expectedPatientId) resultPatientIterator.close() # Build TSV file for list of patients. patientList = \ "patient_item_id\tpatient_id\tclinical_item_id\titem_date\n\ -1000\t-123\t-100\t10/6/2113 10:20\n\ -2000\t-123\t-200\t10/6/2113 11:20\n\ -2500\t-123\t-100\t10/7/2113 11:20\n\ -3000\t-456\t-100\t11/6/2113 10:20\n\ -6000\t-789\t-200\t12/6/2113 11:20\n" patientListTsv = open("patient_list.tsv", "w") patientListTsv.write(patientList) patientListTsv.close() # Initialize new FeatureMatrixFactory. self.factory = FeatureMatrixFactory() # Set and process patientListInput. patientListTsv = open("patient_list.tsv", "r") self.factory.setPatientListInput(patientListTsv, "patient_id") self.factory.processPatientListInput() resultPatientIterator = self.factory.getPatientListIterator() # Verify results. expectedPatientList = ["-123", "-123", "-123", "-456", "-789"] for expectedPatientId in expectedPatientList: resultPatientId = resultPatientIterator.next()['patient_id'] self.assertEqual(resultPatientId, expectedPatientId) patientListTsv.close() resultPatientIterator.close() # Clean up patient_list. 
try: os.remove("patient_list.tsv") os.remove("fmf.patient_list.tsv") except OSError: pass def test_buildFeatureMatrix_multiClinicalItem(self): # Verify FeatureMatrixFactory throws Error if patientEpisodeInput # has not been set. with self.assertRaises(ValueError): self.factory.processPatientEpisodeInput() # Initialize DB cursor. cursor = self.connection.cursor() # Build SQL query for list of patient episodes. patientEpisodeQuery = SQLQuery() patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)") patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id") patientEpisodeQuery.addSelect("proc_code") patientEpisodeQuery.addSelect("order_time") patientEpisodeQuery.addSelect( "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results" ) patientEpisodeQuery.addFrom("stride_order_proc AS sop") patientEpisodeQuery.addFrom("stride_order_results AS sor") patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id") patientEpisodeQuery.addWhereEqual("proc_code", "LABMETB") patientEpisodeQuery.addGroupBy( "pat_id, sop.order_proc_id, proc_code, order_time") patientEpisodeQuery.addOrderBy( "pat_id, sop.order_proc_id, proc_code, order_time") cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params) # Set and process patientEpisodeInput. self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time") self.factory.processPatientEpisodeInput() resultEpisodeIterator = self.factory.getPatientEpisodeIterator() resultPatientEpisodes = list() for episode in resultEpisodeIterator: episode["pat_id"] = int(episode["pat_id"]) episode["order_time"] = DBUtil.parseDateValue( episode["order_time"]) resultPatientEpisodes.append(episode) # Verify results (note sort order). expectedPatientEpisodes = FM_TEST_OUTPUT[ "test_processPatientEpisodeInput"] self.assertEqualList(resultPatientEpisodes, expectedPatientEpisodes) # Add TestItem100 and TestItem200 clinical item data. 
self.factory.addClinicalItemFeatures(["TestItem100"]) self.factory.addClinicalItemFeatures(["TestItem200"]) self.factory.buildFeatureMatrix() resultMatrix = self.factory.readFeatureMatrixFile() expectedMatrix = FM_TEST_OUTPUT[ "test_buildFeatureMatrix_multiClinicalItem"] self.assertEqualList(resultMatrix[2:], expectedMatrix) def test_buildFeatureMatrix_prePostFeatures(self): """ Test features parameter in addClinicalItemFeatures which allows client to specify they only want .pre* or .post* columns in feature matrix. """ # Verify FeatureMatrixFactory throws Error if patientEpisodeInput # has not been set. with self.assertRaises(ValueError): self.factory.processPatientEpisodeInput() # Initialize DB cursor. cursor = self.connection.cursor() # Build SQL query for list of patient episodes. patientEpisodeQuery = SQLQuery() patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)") patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id") patientEpisodeQuery.addSelect("proc_code") patientEpisodeQuery.addSelect("order_time") patientEpisodeQuery.addSelect( "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results" ) patientEpisodeQuery.addFrom("stride_order_proc AS sop") patientEpisodeQuery.addFrom("stride_order_results AS sor") patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id") patientEpisodeQuery.addWhereEqual("proc_code", "LABMETB") patientEpisodeQuery.addGroupBy( "pat_id, sop.order_proc_id, proc_code, order_time") patientEpisodeQuery.addOrderBy( "pat_id, sop.order_proc_id, proc_code, order_time") cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params) # Set and process patientEpisodeInput. 
self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time") self.factory.processPatientEpisodeInput() resultEpisodeIterator = self.factory.getPatientEpisodeIterator() resultPatientEpisodes = list() for episode in resultEpisodeIterator: episode["pat_id"] = int(episode["pat_id"]) episode["order_time"] = DBUtil.parseDateValue( episode["order_time"]) resultPatientEpisodes.append(episode) # Add TestItem100 and TestItem200 clinical item data. self.factory.addClinicalItemFeatures(["TestItem100"], features="pre") self.factory.addClinicalItemFeatures(["TestItem200"], features="post") self.factory.buildFeatureMatrix() resultMatrix = self.factory.readFeatureMatrixFile() expectedMatrix = FM_TEST_OUTPUT[ "test_buildFeatureMatrix_prePostFeatures"] self.assertEqualList(resultMatrix[2:], expectedMatrix) def test_build_FeatureMatrix_multiLabTest(self): """ Test buildFeatureMatrix() and addLabFeatures(). """ # Initialize FeatureMatrixFactory. self.factory = FeatureMatrixFactory() # Verify FeatureMatrixFactory throws Error if patientEpisodeInput # has not been set. with self.assertRaises(ValueError): self.factory.processPatientEpisodeInput() # Initialize DB cursor. cursor = self.connection.cursor() # Build SQL query for list of patient episodes. 
patientEpisodeQuery = SQLQuery() patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)") patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id") patientEpisodeQuery.addSelect("proc_code") patientEpisodeQuery.addSelect("order_time") patientEpisodeQuery.addSelect( "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results" ) patientEpisodeQuery.addFrom("stride_order_proc AS sop") patientEpisodeQuery.addFrom("stride_order_results AS sor") patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id") patientEpisodeQuery.addWhereEqual("proc_code", "LABMETB") patientEpisodeQuery.addGroupBy( "pat_id, sop.order_proc_id, proc_code, order_time") patientEpisodeQuery.addOrderBy( "pat_id, sop.order_proc_id, proc_code, order_time") cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params) # Set and process patientEpisodeInput. self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time") self.factory.processPatientEpisodeInput() resultEpisodeIterator = self.factory.getPatientEpisodeIterator() resultPatientEpisodes = list() for episode in resultEpisodeIterator: episode["pat_id"] = int(episode["pat_id"]) episode["order_time"] = DBUtil.parseDateValue( episode["order_time"]) resultPatientEpisodes.append(episode) # Verify results (note sort order). expectedPatientEpisodes = FM_TEST_OUTPUT[ "test_processPatientEpisodeInput"] self.assertEqualList(resultPatientEpisodes, expectedPatientEpisodes) # Add TNI and CR lab result data. LAC doesn't exist in data. labBaseNames = ["TNI", "CR", "LAC"] # Look for lab data 90 days before each episode, but never afterself. preTimeDelta = datetime.timedelta(-90) postTimeDelta = datetime.timedelta(0) self.factory.addLabResultFeatures(labBaseNames, False, preTimeDelta, postTimeDelta) self.factory.buildFeatureMatrix() resultMatrix = self.factory.readFeatureMatrixFile() # Verify results. 
expectedMatrix = FM_TEST_OUTPUT[ "test_buildFeatureMatrix_multiLabTest"]["expectedMatrix"] self.assertEqualTable(expectedMatrix, resultMatrix[2:], precision=5) try: os.remove(self.factory.getMatrixFileName()) except OSError: pass def test_buildFeatureMatrix_multiFlowsheet(self): """ Test buildFeatureMatrix and addFlowsheet. """ # Verify FeatureMatrixFactory throws Error if patientEpisodeInput # has not been set. with self.assertRaises(ValueError): self.factory.processPatientEpisodeInput() # Initialize DB cursor. cursor = self.connection.cursor() # Build SQL query for list of patient episodes. patientEpisodeQuery = SQLQuery() patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)") patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id") patientEpisodeQuery.addSelect("proc_code") patientEpisodeQuery.addSelect("order_time") patientEpisodeQuery.addSelect( "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results" ) patientEpisodeQuery.addFrom("stride_order_proc AS sop") patientEpisodeQuery.addFrom("stride_order_results AS sor") patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id") patientEpisodeQuery.addWhereEqual("proc_code", "LABMETB") patientEpisodeQuery.addGroupBy( "pat_id, sop.order_proc_id, proc_code, order_time") patientEpisodeQuery.addOrderBy( "pat_id, sop.order_proc_id, proc_code, order_time") cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params) # Set and process patientEpisodeInput. self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time") self.factory.processPatientEpisodeInput() resultEpisodeIterator = self.factory.getPatientEpisodeIterator() resultPatientEpisodes = list() for episode in resultEpisodeIterator: episode["pat_id"] = int(episode["pat_id"]) episode["order_time"] = DBUtil.parseDateValue( episode["order_time"]) resultPatientEpisodes.append(episode) # Verify results (note sort order). 
expectedPatientEpisodes = FM_TEST_OUTPUT[ "test_processPatientEpisodeInput"] self.assertEqualList(resultPatientEpisodes, expectedPatientEpisodes) # Add flowsheet features. flowsheetNames = ["Resp", "FiO2", "Glasgow Coma Scale Score"] # Look for lab data 90 days before each episode, but never afterself. preTimeDelta = datetime.timedelta(-90) postTimeDelta = datetime.timedelta(0) self.factory.addFlowsheetFeatures(flowsheetNames, preTimeDelta, postTimeDelta) self.factory.buildFeatureMatrix() resultMatrix = self.factory.readFeatureMatrixFile() # Verify results. expectedMatrix = FM_TEST_OUTPUT[ "test_buildFeatureMatrix_multiFlowsheet"]["expectedMatrix"] self.assertEqualTable(expectedMatrix, resultMatrix[2:], precision=5) try: os.remove(self.factory.getMatrixFileName()) except OSError: pass def test_addTimeCycleFeatures(self): """ Test .addTimeCycleFeatures() """ # Initialize DB cursor. cursor = self.connection.cursor() # Build SQL query for list of patient episodes. patientEpisodeQuery = SQLQuery() patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)") patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id") patientEpisodeQuery.addSelect("proc_code") patientEpisodeQuery.addSelect("order_time") patientEpisodeQuery.addSelect( "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results" ) patientEpisodeQuery.addFrom("stride_order_proc AS sop") patientEpisodeQuery.addFrom("stride_order_results AS sor") patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id") patientEpisodeQuery.addWhereEqual("proc_code", "LABMETB") patientEpisodeQuery.addGroupBy( "pat_id, sop.order_proc_id, proc_code, order_time") patientEpisodeQuery.addOrderBy( "pat_id, sop.order_proc_id, proc_code, order_time") cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params) # Set and process patientEpisodeInput. self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time") self.factory.processPatientEpisodeInput() # Add time cycle features. 
self.factory.addTimeCycleFeatures("order_time", "month") self.factory.addTimeCycleFeatures("order_time", "hour") # Verify output. self.factory.buildFeatureMatrix() resultMatrix = self.factory.readFeatureMatrixFile() expectedMatrix = FM_TEST_OUTPUT["test_addTimeCycleFeatures"][ "expectedMatrix"] self.assertEqualTable(expectedMatrix, resultMatrix[2:], precision=5) # Clean up feature matrix. try: os.remove(self.factory.getMatrixFileName()) except OSError: pass def test_loadMapData(self): self.factory = FeatureMatrixFactory() # Depends on external data file reader = self.factory.loadMapData("CharlsonComorbidity-ICD9CM") charlsonByICD9 = dict() for row in reader: charlsonByICD9[row["icd9cm"]] = row["charlson"] self.assertEqual("Dementia", charlsonByICD9["294.1"]) self.assertEqual("Dementia", charlsonByICD9["331.2"]) self.assertEqual("COPD", charlsonByICD9["490"]) self.assertEqual("COPD", charlsonByICD9["416.8"]) self.assertEqual("Malignancy Metastatic", charlsonByICD9["199"]) self.assertEqual("AIDS/HIV", charlsonByICD9["042"]) def test_performance(self): """ Test performance against DataExtractor. """ # Initialize DB cursor. cursor = self.connection.cursor() # Initialize FeatureMatrixFactory. factoryStart = time.time() self.factory = FeatureMatrixFactory() # Build SQL query for list of patient episodes. 
patientEpisodeQuery = SQLQuery() patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)") patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id") patientEpisodeQuery.addSelect("proc_code") patientEpisodeQuery.addSelect("order_time") patientEpisodeQuery.addSelect( "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results" ) patientEpisodeQuery.addFrom("stride_order_proc AS sop") patientEpisodeQuery.addFrom("stride_order_results AS sor") patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id") patientEpisodeQuery.addWhereIn("proc_code", ["Foo", "Bar", "Baz", "Qux"]) patientEpisodeQuery.addGroupBy( "pat_id, sop.order_proc_id, proc_code, order_time") patientEpisodeQuery.addOrderBy( "pat_id, sop.order_proc_id, proc_code, order_time") cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params) # Set and process patientEpisodeInput. self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time") self.factory.processPatientEpisodeInput() # Look for lab data 90 days before each episode, but never afterself. preTimeDelta = datetime.timedelta(-90) postTimeDelta = datetime.timedelta(0) # Add clinical item features. self.factory.addClinicalItemFeatures(["PerfItem300"]) self.factory.addClinicalItemFeatures(["PerfItem400"]) self.factory.addClinicalItemFeatures(["PerfItem500"]) # Add lab result features. self.factory.addLabResultFeatures(["Foo"], False, preTimeDelta, postTimeDelta) self.factory.addLabResultFeatures(["Bar"], False, preTimeDelta, postTimeDelta) self.factory.addLabResultFeatures(["Baz"], False, preTimeDelta, postTimeDelta) self.factory.addLabResultFeatures(["Qux"], False, preTimeDelta, postTimeDelta) # Add flowsheet features. self.factory.addFlowsheetFeatures(["Perflow"], preTimeDelta, postTimeDelta) # Build matrix. self.factory.buildFeatureMatrix() # Stop timer. factoryStop = time.time() # Initialize DataExtractor. 
extractorStart = time.time() extractor = DataExtractor() extractor.dataCache = dict() # Initialize output file. outFile = open("extractor.feature_matrix.tab.gz", "w") formatter = TextResultsFormatter(outFile) # Build SQL query for list of patient episodes. patientEpisodeQuery = SQLQuery() patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)") patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id") patientEpisodeQuery.addSelect("proc_code") patientEpisodeQuery.addSelect("order_time") patientEpisodeQuery.addSelect( "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results" ) patientEpisodeQuery.addFrom("stride_order_proc AS sop") patientEpisodeQuery.addFrom("stride_order_results AS sor") patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id") patientEpisodeQuery.addWhereIn("proc_code", ["Foo", "Bar", "Baz", "Qux"]) patientEpisodeQuery.addGroupBy( "pat_id, sop.order_proc_id, proc_code, order_time") patientEpisodeQuery.addOrderBy( "pat_id, sop.order_proc_id, proc_code, order_time") cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params) # Process patient episodes. patientEpisodes = list() row = cursor.fetchone() while row is not None: (pat_id, order_proc_id, proc_code, order_time, normal_results) = row patientEpisode = \ RowItemModel \ ( { "patient_id": pat_id, "order_proc_id": order_proc_id, "proc_code": proc_code, "order_time": order_time, "result_normal_count": normal_results } ) patientEpisodes.append(patientEpisode) row = cursor.fetchone() # Initialize patient data. lastPatientId = None colNames = None patientEpisodeByIndexTime = None # Look for lab data 90 days before each episode, but never afterself. preTimeDelta = datetime.timedelta(-90) postTimeDelta = datetime.timedelta(0) # Populate patient data. 
tempColNames = \ ["patient_id", "order_proc_id", "proc_code", "order_time", "result_normal_count"] for patientEpisode in patientEpisodes: patientId = patientEpisode["patient_id"] if lastPatientId is not None and lastPatientId != patientId: # New patient ID so start querying for patient specific data and # populating patient episode data. # Clinical Item (PerfItem300) eventTimes = extractor.parseClinicalItemData_singlePatient(\ modelListFromTable(extractor.queryClinicalItemsByName(\ ("PerfItem300",), [patientId]))) tempColNames.extend(\ extractor.addClinicalItemFeatures_singlePatient(\ eventTimes, patientEpisodeByIndexTime, "PerfItem300", \ daysBins=[])) # Clinical Item (PerfItem400) eventTimes = extractor.parseClinicalItemData_singlePatient(\ modelListFromTable(extractor.queryClinicalItemsByName(\ ("PerfItem400",), [patientId]))) tempColNames.extend(\ extractor.addClinicalItemFeatures_singlePatient(\ eventTimes, patientEpisodeByIndexTime, "PerfItem300", \ daysBins=[])) # Clinical Item (PerfItem500) eventTimes = extractor.parseClinicalItemData_singlePatient(\ modelListFromTable(extractor.queryClinicalItemsByName(\ ("PerfItem500",), [patientId]))) tempColNames.extend(\ extractor.addClinicalItemFeatures_singlePatient(\ eventTimes, patientEpisodeByIndexTime, "PerfItem300", \ daysBins=[])) # Lab Result (Foo) labResultTable = extractor.queryLabResults(["Foo"], [patientId]) labsByBaseName = extractor.parseLabResultsData_singlePatient(\ modelListFromTable(labResultTable)) tempColNames.extend(extractor.addLabFeatures_singlePatient(\ patientEpisodeByIndexTime, labsByBaseName, ["Foo"], \ preTimeDelta, postTimeDelta)) # Lab Result (Bar) labResultTable = extractor.queryLabResults(["Bar"], [patientId]) labsByBaseName = extractor.parseLabResultsData_singlePatient(\ modelListFromTable(labResultTable)) tempColNames.extend(extractor.addLabFeatures_singlePatient(\ patientEpisodeByIndexTime, labsByBaseName, ["Bar"], \ preTimeDelta, postTimeDelta)) # Lab Result (Baz) labResultTable = 
extractor.queryLabResults(["Baz"], [patientId]) labsByBaseName = extractor.parseLabResultsData_singlePatient(\ modelListFromTable(labResultTable)) tempColNames.extend(extractor.addLabFeatures_singlePatient(\ patientEpisodeByIndexTime, labsByBaseName, ["Baz"], \ preTimeDelta, postTimeDelta)) # Lab Result (Qux) labResultTable = extractor.queryLabResults(["Qux"], [patientId]) labsByBaseName = extractor.parseLabResultsData_singlePatient(\ modelListFromTable(labResultTable)) tempColNames.extend(extractor.addLabFeatures_singlePatient(\ patientEpisodeByIndexTime, labsByBaseName, ["Qux"], \ preTimeDelta, postTimeDelta)) # Flowsheet (Perflow) # tempFile = StringIO() # labResultTable = extractor.queryFlowsheet(["Perflow"], [patientId], tempFile) # flowsheetByNameByPatientId = extractor.parseFlowsheetFile(\ # StringIO(tempFile.getvalue())) # tempColNames.extend(extractor.addFlowsheetFeatures_singlePatient(\ # patientEpisodeByIndexTime, flowsheetByNameByPatientId[patientId], \ # ["Perflow"], preTimeDelta, postTimeDelta, tempColNames)) if colNames is None: # First row, print header row colNames = tempColNames formatter.formatTuple(colNames) # Print out patient (episode) data (one row per episode) formatter.formatResultDicts(patientEpisodeByIndexTime.values(), colNames) if lastPatientId is None or lastPatientId != patientId: # Prepare to aggregate patient episode record per patient patientEpisodeByIndexTime = dict() patientEpisodeByIndexTime[ patientEpisode["order_time"]] = patientEpisode lastPatientId = patientId outFile.flush() # Last Iteration patientId = lastPatientId # Clinical Item (PerfItem300) eventTimes = extractor.parseClinicalItemData_singlePatient(\ modelListFromTable(extractor.queryClinicalItemsByName(\ ("PerfItem300",), [patientId]))) tempColNames.extend(\ extractor.addClinicalItemFeatures_singlePatient(\ eventTimes, patientEpisodeByIndexTime, "PerfItem300", \ daysBins=[])) # Clinical Item (PerfItem400) eventTimes = extractor.parseClinicalItemData_singlePatient(\ 
modelListFromTable(extractor.queryClinicalItemsByName(\ ("PerfItem400",), [patientId]))) tempColNames.extend(\ extractor.addClinicalItemFeatures_singlePatient(\ eventTimes, patientEpisodeByIndexTime, "PerfItem300", \ daysBins=[])) # Clinical Item (PerfItem500) eventTimes = extractor.parseClinicalItemData_singlePatient(\ modelListFromTable(extractor.queryClinicalItemsByName(\ ("PerfItem500",), [patientId]))) tempColNames.extend(\ extractor.addClinicalItemFeatures_singlePatient(\ eventTimes, patientEpisodeByIndexTime, "PerfItem300", \ daysBins=[])) # Lab Result (Foo) labResultTable = extractor.queryLabResults(["Foo"], [patientId]) labsByBaseName = extractor.parseLabResultsData_singlePatient(\ modelListFromTable(labResultTable)) tempColNames.extend(extractor.addLabFeatures_singlePatient(\ patientEpisodeByIndexTime, labsByBaseName, ["Foo"], \ preTimeDelta, postTimeDelta)) # Lab Result (Bar) labResultTable = extractor.queryLabResults(["Bar"], [patientId]) labsByBaseName = extractor.parseLabResultsData_singlePatient(\ modelListFromTable(labResultTable)) tempColNames.extend(extractor.addLabFeatures_singlePatient(\ patientEpisodeByIndexTime, labsByBaseName, ["Bar"], \ preTimeDelta, postTimeDelta)) # Lab Result (Baz) labResultTable = extractor.queryLabResults(["Baz"], [patientId]) labsByBaseName = extractor.parseLabResultsData_singlePatient(\ modelListFromTable(labResultTable)) tempColNames.extend(extractor.addLabFeatures_singlePatient(\ patientEpisodeByIndexTime, labsByBaseName, ["Baz"], \ preTimeDelta, postTimeDelta)) # Lab Result (Qux) labResultTable = extractor.queryLabResults(["Qux"], [patientId]) labsByBaseName = extractor.parseLabResultsData_singlePatient(\ modelListFromTable(labResultTable)) tempColNames.extend(extractor.addLabFeatures_singlePatient(\ patientEpisodeByIndexTime, labsByBaseName, ["Qux"], \ preTimeDelta, postTimeDelta)) formatter.formatResultDicts(patientEpisodeByIndexTime.values(), colNames) # Close file. outFile.close() # Stop timer. 
extractorStop = time.time() # Compare results. factoryTime = factoryStop - factoryStart extractorTime = extractorStop - extractorStart self.assertTrue(extractorTime > factoryTime) # Clean up feature matrix files. try: os.remove("extractor.feature_matrix.tab.gz") except OSError: pass try: os.remove(self.factory.getMatrixFileName()) except OSError: pass