def _get_average_orders_per_patient(self):
    """Return the median number of orders per patient for this lab.

    Builds a query that counts orders per ``pat_id`` against the tables of
    the configured ``LocalEnv.DATASET_SOURCE_NAME`` ('STRIDE' or 'UMich'),
    restricted to the base names returned by
    ``self._get_components_in_lab_panel()``, and takes the median of the
    per-patient counts. Despite the method name, the statistic returned is
    a median, not a mean.

    Returns:
        The result of ``numpy.median`` over the per-patient order counts.

    Raises:
        Exception: if the query yields no rows. An ``Exception`` is raised
            rather than calling ``sys.exit`` so that callers can catch it.
    """
    # No cursor is opened here: the query runs through DBUtil.execute, and
    # the cursor previously created from self._connection was never used
    # or closed.
    query = SQLQuery()
    if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
        # TODO: add STRIDE component routine
        query.addSelect('CAST(pat_id AS BIGINT) AS pat_id')
        query.addSelect('COUNT(sop.order_proc_id) AS num_orders')
        query.addFrom('stride_order_proc AS sop')
        query.addFrom('stride_order_results AS sor')
        query.addWhere('sop.order_proc_id = sor.order_proc_id')
        query.addWhereIn("proc_code", [self._lab_panel])
        components = self._get_components_in_lab_panel()
        query.addWhereIn("base_name", components)
        query.addGroupBy('pat_id')
    elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
        query.addSelect('CAST(pat_id AS BIGINT) AS pat_id')
        query.addSelect('COUNT(order_proc_id) AS num_orders')
        query.addFrom('labs')
        query.addWhereIn(self._varTypeInTable, [self._lab_var])
        components = self._get_components_in_lab_panel()
        query.addWhereIn("base_name", components)
        query.addGroupBy('pat_id')
    # NOTE(review): for any other data source the query is executed with
    # no clauses added -- confirm whether that case can occur.

    log.debug('Querying median orders per patient...')
    results = DBUtil.execute(query)

    # Column 1 of each row is the per-patient order count (num_orders).
    order_counts = [row[1] for row in results]
    if not order_counts:
        error_msg = '0 orders for lab "%s."' % self._lab_var
        log.critical(error_msg)
        # sxu: sys.exit cannot be caught by Exception
        raise Exception(error_msg)

    return numpy.median(order_counts)
def _get_random_patient_list(self):
    """Return a random sample of patient IDs having results for the component.

    sx: the sampling is done client-side with numpy to avoid RANDOM() on
    the database.

    The per-patient order counts for ``self._component`` are queried, their
    median is used to estimate how many patients are needed to reach
    ``self._num_requested_episodes`` episodes (at least one patient), and
    that many distinct rows are drawn without replacement.

    Side effects:
        Sets ``self._num_patients`` to the estimated number of patients
        needed (which may exceed the number actually available/returned).

    Returns:
        list: the ``pat_id`` values (column 0) of the sampled rows.

    Raises:
        Exception: if the query yields no rows.
    """
    # No cursor is opened here: the query runs through DBUtil.execute, and
    # the cursor previously created from self._connection was never used
    # or closed.
    query = SQLQuery()
    query.addSelect('pat_id')
    query.addSelect('COUNT(sop.order_proc_id) AS num_orders')
    query.addFrom('stride_order_proc AS sop')
    query.addFrom('stride_order_results AS sor')
    query.addWhere('sop.order_proc_id = sor.order_proc_id')
    query.addWhereIn("base_name", [self._component])
    query.addGroupBy('pat_id')

    log.debug('Querying median orders per patient...')
    results = DBUtil.execute(query)

    if len(results) == 0:
        error_msg = '0 orders for component "%s."' % self._component
        log.critical(error_msg)
        # Raise instead of sys.exit(): SystemExit is not a subclass of
        # Exception, so a caller's `except Exception` handler could not
        # recover from (or even observe) a component without orders.
        raise Exception(error_msg)

    # Column 1 of each row is the per-patient order count (num_orders).
    order_counts = [row[1] for row in results]
    avg_orders_per_patient = numpy.median(order_counts)
    log.info('avg_orders_per_patient: %s' % avg_orders_per_patient)

    # Based on average # of results, figure out how many patients we'd
    # need to get for a feature matrix of requested size.
    self._num_patients = int(numpy.max(
        [self._num_requested_episodes / avg_orders_per_patient, 1]))

    # Some components may have fewer associated patients than the
    # required sample size.
    patient_number_chosen = min(len(results), self._num_patients)
    inds_random_patients = numpy.random.choice(
        len(results), size=patient_number_chosen, replace=False)

    return [results[ind][0] for ind in inds_random_patients]
def _get_average_orders_per_patient(self):
    """Return the median number of orders per patient for this lab panel.

    Counts orders per ``pat_id`` by joining ``stride_order_proc`` with
    ``stride_order_results``, restricted to ``self._lab_panel`` and to the
    base names returned by ``self._get_components_in_lab_panel()``, then
    takes the median of the per-patient counts. Despite the method name,
    the statistic returned is a median, not a mean.

    Returns:
        The result of ``numpy.median`` over the per-patient order counts.

    Raises:
        Exception: if the query yields no rows.
    """
    # No cursor is opened here: the query runs through DBUtil.execute, and
    # the cursor previously created from self._connection was never used
    # or closed.
    query = SQLQuery()
    query.addSelect('pat_id')
    query.addSelect('COUNT(sop.order_proc_id) AS num_orders')
    query.addFrom('stride_order_proc AS sop')
    query.addFrom('stride_order_results AS sor')
    query.addWhere('sop.order_proc_id = sor.order_proc_id')
    query.addWhereIn("proc_code", [self._lab_panel])
    components = self._get_components_in_lab_panel()
    query.addWhereIn("base_name", components)
    query.addGroupBy('pat_id')

    log.debug('Querying median orders per patient...')
    results = DBUtil.execute(query)

    # Column 1 of each row is the per-patient order count (num_orders).
    order_counts = [row[1] for row in results]
    if not order_counts:
        error_msg = '0 orders for lab panel "%s."' % self._lab_panel
        log.critical(error_msg)
        # Raise instead of sys.exit(): SystemExit is not a subclass of
        # Exception, so a caller's `except Exception` handler could not
        # recover from (or even observe) a lab panel without orders.
        raise Exception(error_msg)

    return numpy.median(order_counts)
def _get_random_patient_list(self):
    """Return a random sample of patient IDs that have orders for this lab.

    Per-patient order counts for ``self._lab_var`` are queried from the
    tables of the configured ``LocalEnv.DATASET_SOURCE_NAME``. The median
    count is used to estimate how many patients are needed to reach
    ``self._num_requested_episodes`` episodes (at least one patient), and
    that many distinct rows are drawn without replacement.

    Data-source handling:
        * 'STRIDE': joins ``stride_order_proc`` with
          ``stride_order_results``; for a lab panel the optional
          ``self._time_limit`` bounds (index 0 = exclusive lower bound,
          index 1 = exclusive upper bound on ``sop.order_time``) are
          applied.
        * 'UMich': runs a hand-built SQL string against ``labs`` and
          returns directly from that branch.
        * anything else: queries ``labs`` through ``SQLQuery``.

    Side effects:
        Sets ``self._num_patients``. On the non-UMich path, reseeds numpy's
        global random generator from ``self._random_state``.

    Returns:
        list: the ``pat_id`` values (column 0) of the sampled rows.

    Raises:
        Exception: if the query yields no rows.
    """
    # No cursor is opened here: the queries run through DBUtil.execute, and
    # the cursor previously created from self._connection was never used
    # or closed.
    query = SQLQuery()
    query.addSelect('CAST(pat_id AS BIGINT) AS pat_id')

    if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
        if self._isLabPanel:
            query.addSelect('COUNT(sop.order_proc_id) AS num_orders')
            query.addFrom('stride_order_proc AS sop')
            query.addFrom('stride_order_results AS sor')
            if self._time_limit:
                if self._time_limit[0]:
                    query.addWhere(
                        "sop.order_time > '%s'" % self._time_limit[0])
                if self._time_limit[1]:
                    query.addWhere(
                        "sop.order_time < '%s'" % self._time_limit[1])
            query.addWhere('sop.order_proc_id = sor.order_proc_id')
            query.addWhereIn('proc_code', [self._lab_var])
            # sbala: Technically it's possible for someone to get a lab
            # ordered without getting results
            query.addWhereIn("base_name", self._lab_components)
        else:
            query.addSelect('COUNT(sor.order_proc_id) AS num_orders')
            query.addFrom('stride_order_proc AS sop')
            query.addFrom('stride_order_results AS sor')
            query.addWhere('sop.order_proc_id = sor.order_proc_id')
            query.addWhereIn("base_name", [self._lab_var])
    elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
        # TODO: best way to integrate UMich code
        query_str = "SELECT CAST(pat_id AS BIGINT) AS pat_id , "
        query_str += "COUNT(order_proc_id) AS num_orders "
        query_str += "FROM labs "
        query_str += "WHERE %s = '%s' " % (
            self._varTypeInTable, self._lab_var)
        # NOTE(review): this branch reads `self.notUsePatIds` while the
        # SQLQuery path below reads `self._notUsePatIds` -- confirm which
        # attribute the class actually defines.
        if self.notUsePatIds:
            excluded_ids = ",".join(
                "%s" % pat_id for pat_id in self.notUsePatIds)
            query_str += "AND pat_id NOT IN (%s) " % excluded_ids
        query_str += "GROUP BY pat_id"

        log.debug('Querying median orders per patient...')
        results = DBUtil.execute(query_str)

        if len(results) == 0:
            error_msg = '0 orders for order "%s."' % self._lab_var
            log.critical(error_msg)
            # sxu: sys.exit cannot be caught by Exception
            raise Exception(error_msg)

        order_counts = [row[1] for row in results]
        avg_orders_per_patient = numpy.median(order_counts)
        log.info('avg_orders_per_patient: %s' % avg_orders_per_patient)

        # Based on average # of results, figure out how many patients we'd
        # need to get for a feature matrix of requested size.
        self._num_patients = int(numpy.max(
            [self._num_requested_episodes / avg_orders_per_patient, 1]))

        # Some components may have fewer associated patients than the
        # required sample size.
        patient_number_chosen = min(len(results), self._num_patients)
        # NOTE(review): unlike the path below, this draw is not seeded
        # from self._random_state -- confirm whether that is intended.
        inds_random_patients = numpy.random.choice(
            len(results), size=patient_number_chosen, replace=False)
        return [results[ind][0] for ind in inds_random_patients]
    else:
        query.addSelect('COUNT(order_proc_id) AS num_orders')
        query.addFrom('labs')
        if self._isLabPanel:
            query.addWhereIn("proc_code", [self._lab_var])
            query.addWhereIn("base_name", self._lab_components)
        else:
            query.addWhereIn("base_name", [self._lab_var])

    # For hold-out set, do not use the patients already used in
    # training/validation.
    if self._notUsePatIds:
        query.addWhereNotIn('pat_id', self._notUsePatIds)

    query.addGroupBy('pat_id')

    log.debug('Querying the number of orders per patient...')
    results = DBUtil.execute(query)

    if len(results) == 0:
        error_msg = '0 orders for component "%s."' % self._lab_var
        log.critical(error_msg)
        # sxu: sys.exit cannot be caught by Exception
        raise Exception(error_msg)

    # Column 1 of each row is the per-patient order count (num_orders).
    order_counts = [row[1] for row in results]
    avg_orders_per_patient = numpy.median(order_counts)
    log.info('avg_orders_per_patient: %s' % avg_orders_per_patient)

    # Based on average # of results, figure out how many patients we'd
    # need to get for a feature matrix of requested size.
    self._num_patients = int(numpy.max(
        [self._num_requested_episodes / avg_orders_per_patient, 1]))

    # Some components may have fewer associated patients than the
    # required sample size.
    patient_number_chosen = min(len(results), self._num_patients)

    # Set seed to ensure re-producibility of patient episodes.
    # Recover int random_state here, since numpy requires int while sql
    # requires [-1,1].
    numpy.random.seed(int(self._random_state * float(sys.maxsize)))
    inds_random_patients = numpy.random.choice(
        len(results), size=patient_number_chosen, replace=False)

    return [results[ind][0] for ind in inds_random_patients]
def _get_random_patient_list(self):
    """Return a random list of patient IDs that have orders for this lab.

    The median number of orders per patient is used to estimate how many
    patients are needed to reach ``self._num_requested_episodes`` episodes
    (at least one patient).

    Data-source handling (``LocalEnv.DATASET_SOURCE_NAME``):
        * 'STRIDE': the median comes from
          ``self._get_average_orders_per_patient()`` and the patients are
          picked by the database (``ORDER BY RANDOM()`` with a limit of
          ``self._num_patients``) from ``stride_order_proc``.
        * 'UMich': per-patient counts are fetched from ``labs`` with a
          hand-built SQL string and the patients are sampled client-side
          with ``numpy.random.choice`` (without replacement).
        * anything else: nothing is queried and ``None`` is returned.

    Side effects:
        Sets ``self._num_patients`` (STRIDE and UMich only).

    Returns:
        list or None: the ``pat_id`` values (column 0) of the selected
        rows, or ``None`` for an unrecognised data source.

    Raises:
        Exception: on the UMich path, if the query yields no rows.
    """
    # No cursor is opened here: the queries run through DBUtil.execute, and
    # the cursor previously created from self._connection was never used
    # or closed.
    if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
        # Get average number of results for this lab test per patient.
        avg_orders_per_patient = self._get_average_orders_per_patient()
        log.info('avg_orders_per_patient: %s' % avg_orders_per_patient)

        # Based on average # of results, figure out how many patients we'd
        # need to get for a feature matrix of requested size.
        self._num_patients = int(numpy.max(
            [self._num_requested_episodes / avg_orders_per_patient, 1]))

        # Get numPatientsToQuery random patients who have gotten test.
        # TODO(sbala): Have option to feed in a seed for the randomness.
        query = SQLQuery()
        query.addSelect('CAST(pat_id AS BIGINT) AS pat_id')
        query.addFrom('stride_order_proc AS sop')
        query.addWhereIn('proc_code', [self._lab_var])  # TODO: components
        query.addOrderBy('RANDOM()')
        query.setLimit(self._num_patients)

        log.debug('Querying random patient list...')
        results = DBUtil.execute(query)

        # Get patient list.
        return [row[0] for row in results]

    if LocalEnv.DATASET_SOURCE_NAME == 'UMich':
        # Get average number of results for this lab test per patient.
        query_str = "SELECT CAST(pat_id AS BIGINT) AS pat_id , "
        query_str += "COUNT(order_proc_id) AS num_orders "
        query_str += "FROM labs "
        query_str += "WHERE %s = '%s' " % (
            self._varTypeInTable, self._lab_var)
        if self.notUsePatIds:
            excluded_ids = ",".join(
                "%s" % pat_id for pat_id in self.notUsePatIds)
            query_str += "AND pat_id NOT IN (%s) " % excluded_ids
        query_str += "GROUP BY pat_id"

        log.debug('Querying median orders per patient...')
        results = DBUtil.execute(query_str)

        if len(results) == 0:
            error_msg = '0 orders for order "%s."' % self._lab_var
            log.critical(error_msg)
            # sxu: sys.exit cannot be caught by Exception
            raise Exception(error_msg)

        # Column 1 of each row is the per-patient order count (num_orders).
        order_counts = [row[1] for row in results]
        avg_orders_per_patient = numpy.median(order_counts)
        log.info('avg_orders_per_patient: %s' % avg_orders_per_patient)

        # Based on average # of results, figure out how many patients we'd
        # need to get for a feature matrix of requested size.
        self._num_patients = int(numpy.max(
            [self._num_requested_episodes / avg_orders_per_patient, 1]))

        # Some components may have fewer associated patients than the
        # required sample size.
        patient_number_chosen = min(len(results), self._num_patients)
        inds_random_patients = numpy.random.choice(
            len(results), size=patient_number_chosen, replace=False)
        return [results[ind][0] for ind in inds_random_patients]

    # Unrecognised data source: no branch applies, so no list is produced.
    return None