def getQSofaCategorization(hadm_ids=None, subject_ids=None):
    """
    Partially implemented qSOFA (quick Sepsis-related Organ Failure Assessment)
    categorization, as defined by the Third International Consensus for Sepsis.
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4968574/

    TODO: Complete this later -- the three criterion queries below are executed
    but never combined into a score, so this function currently returns None.

    :param hadm_ids: a list of hadm_ids which to apply the standard to;
                     defaults to every admission when both params are None
    :param subject_ids: a list of subject_ids which to apply the standard to.
                        If hadm_ids is set, this doesn't do anything
    :return: (intended) dataframe with hadm_ids as index and a sepsis column;
             currently always None
    """
    if hadm_ids is None and subject_ids is None:
        hadm_ids = commonDB.read_sql("SELECT hadm_id FROM admissions")
    elif hadm_ids is None:
        # BUG FIX: original passed an undefined name `conn` as a second
        # argument here, which raised NameError whenever this branch ran.
        hadm_ids = commonDB.read_sql(
            "SELECT hadm_id FROM admissions WHERE subject_id in "
            + commonDB.convertListToSQL(subject_ids))
    # qSOFA criterion 1: altered mentation (Glasgow Coma Scale < 15)
    alteredMentalVerbal = commonDB.read_sql(
        "SELECT hadm_id FROM chartevents "
        + "WHERE valuenum < 15 and itemid = 227013 and hadm_id in "
        + commonDB.convertListToSQL(hadm_ids))
    # qSOFA criterion 2: respiratory rate >= 22 breaths per minute
    increasedRespiration = commonDB.read_sql(
        "SELECT hadm_id FROM chartevents "
        + "WHERE valuenum >= 22 and itemid = 224690 and hadm_id in "
        + commonDB.convertListToSQL(hadm_ids))
    # qSOFA criterion 3: systolic blood pressure <= 100 mmHg
    # (renamed from increasedSystolicBP: the query selects *low* systolic BP)
    decreasedSystolicBP = commonDB.read_sql(
        "SELECT hadm_id FROM chartevents "
        + 'WHERE valuenum <= 100 and itemid in (228152, 220050, 442, 455, 6, 51, 3313, 3317, 3319, 3321, 3323) and hadm_id in '
        + commonDB.convertListToSQL(hadm_ids))
    # TODO: Complete this later -- combine the three criteria into a per-hadm_id score
    return None
def preliminaryCompareTimesICU(num_days=1):
    '''
    Sanity-check helper, analogous to preliminaryCompareTimes but measuring
    waveform start times against ICUSTAYS intime instead of ADMISSIONS admittime.

    :param num_days: upper bound (in days) on the intime-to-waveform gap to keep
    :return: merged dataframe of matched waveforms and icustays with timeDiff /
             rawTimeDiff columns; also prints the mean time difference
    '''
    icustays = commonDB.read_sql("SELECT * FROM ICUSTAYS", uppercase=False)
    subjects, times = listAllMatchedWFSubjects()
    # times holds the recording start of each waveform file
    waveforms = pd.DataFrame({"subject_id": subjects, "wfStartTime": times})
    waveforms["subject_id"] = waveforms["subject_id"].astype(np.number)
    # convert the odd waveform time format into real timestamps
    waveforms["wfStartTime"] = waveforms["wfStartTime"].apply(preliminaryCompareTimesHelper)
    merged = pd.merge(waveforms, icustays,
                      left_on="subject_id", right_on="subject_id")
    merged["timeDiff"] = merged["wfStartTime"] - merged["intime"]
    # keep only waveforms that start after intime and within num_days of it
    window = pd.Timedelta(str(num_days) + " days")
    keep = (merged["timeDiff"] > pd.Timedelta(0)) & (merged["timeDiff"] < window)
    merged = merged[keep]
    merged["rawTimeDiff"] = merged["timeDiff"].astype(np.int64)
    print(pd.Timedelta(merged["timeDiff"].astype(np.int64).mean()))
    return merged
def matchWithHADMID(self, subjectid, p_appended=False, time_error="6 hours"):
    '''
    Goes through each fileDateMap entry to match each key (waveform file) to a
    hospital admission.

    :param subjectid: subject whose multi-record waveform files to match
    :param p_appended: if True, subjectid carries a leading 'p' which is
                       stripped before the SQL query
    :param time_error: the amount of time before ADMITTIME and after DISCHTIME
                       to still consider a record as part of an admission
    :return: a Dict which matches hadm_id with the waveform file, as well as
             including an admittime (and the record start time) per file;
             hadmid is "NOT FOUND" when no admission window matched
    '''
    subjectFiles = self.getMultiRecordFiles(subjectid, p_appended)
    fileDateMap = Dict()
    for subjectFile in subjectFiles:
        # the filename encodes the record start time at fixed character
        # offsets: pXXXXXX-YYYY-MM-DD-hh-mm
        time = Timestamp(year=int(subjectFile[8:12]),
                         month=int(subjectFile[13:15]),
                         day=int(subjectFile[16:18]),
                         hour=int(subjectFile[19:21]),
                         minute=int(subjectFile[22:24]))
        fileDateMap[subjectFile] = time
    if p_appended:
        subjectid = subjectid[1:]  # drop the p
    # DISCHTIME > ADMITTIME removes any admissions which are death times
    admissions = read_sql("SELECT HADM_ID, ADMITTIME, DISCHTIME from ADMISSIONS where subject_id = {} and DISCHTIME > ADMITTIME".format(subjectid))
    fileAdmissionMap = Dict()
    for waveform in fileDateMap.keys():
        time = fileDateMap[waveform]
        matching = admissions["HADM_ID"][(admissions["ADMITTIME"] - Timedelta(time_error) < time)
                                         & (admissions["DISCHTIME"] + Timedelta(time_error) > time)]
        if (len(matching.values) != 0):
            admittime = admissions[admissions["HADM_ID"] == matching.iloc[0]]["ADMITTIME"].iloc[0]
            # assume that admissions don't overlap for a single subject id
            fileAdmissionMap[waveform].hadmid = matching.iloc[0]
            # BUG FIX: startTime used to be assigned on fileAdmissionMap itself
            # (fileAdmissionMap.startTime), clobbered on every iteration; it is
            # now stored per waveform file as the adjacent comment intended.
            fileAdmissionMap[waveform].startTime = time
            # store either the admittime from sql or the start of records for
            # admittime, depending on which was first
            if (admittime > time):
                fileAdmissionMap[waveform].admittime = time
            else:
                fileAdmissionMap[waveform].admittime = admittime
        else:
            fileAdmissionMap[waveform].hadmid = "NOT FOUND"
    return fileAdmissionMap
def analyze(self, hadmID):
    '''
    Analyzes the combined waveform record for one hospital admission: cleans
    outliers, finds contiguous data segments vs gaps, and pulls the patient DOB.

    :param hadmID: hospital admission id to analyze
    :return: tuple of (hadmID, segment-length Series grouped by (isSeg, block),
             difference between first record time and admittime, admittime, dob)
    '''
    data, matching = self.reader.getRecordByHADMID(hadmID)
    admittimeDiff = data.index[0] - matching.iloc[0]['admittime']
    # truncate timestamps to minute precision (drops seconds/microseconds)
    data.index = data.index.map(lambda date: pd.Timestamp(year=date.year, month=date.month, day=date.day, hour=date.hour, minute=date.minute))
    if self.reader.columnsToUse is not None:
        # null out physiologically impossible values per variable_ranges bounds
        for col in self.reader.columnsToUse:
            data.loc[data[col] < self.variable_ranges["OUTLIER_LOW"][col], col] = np.nan
            data.loc[data[col] > self.variable_ranges["OUTLIER_HIGH"][col], col] = np.nan
    partOfSegment = pd.DataFrame(index=pd.RangeIndex(data.shape[0]), columns=["isSeg", "block"])  #is truee or false, depending on window of data
    #figure out which parts of the data are part of a segment or part of a missing segment
    tempData = data.fillna(-1).reset_index(drop=True)  #rolling window has incorrect behavior when analyzing nulls, use -1 as placeholder instead
    # a row is "in a segment" unless some column was missing (-1) for the whole
    # trailing window of length num_missing_most
    partOfSegment.loc[:,'isSeg'] = ~tempData.rolling(self.num_missing_most).apply(lambda win: (win == -1).all()).any(axis=1)
    #edge cases
    if data.shape[0] <= self.num_missing_most:
        # record shorter than one window: rolling never produced a full window
        partOfSegment.loc[:, 'isSeg'] = False
    else:
        partOfSegment.loc[0:self.num_missing_most, 'isSeg'] = partOfSegment.loc[self.num_missing_most, 'isSeg']  #we can always carry over the result for the first n
    # label maximal runs of identical isSeg values with an increasing block id
    partOfSegment['block'] = (partOfSegment['isSeg'].shift(1) != partOfSegment['isSeg']).astype(int).cumsum()
    # length of each contiguous segment / gap
    segments = partOfSegment.groupby(['isSeg','block']).apply(len)
    dob = read_sql("WITH admit as (SELECT SUBJECT_ID FROM ADMISSIONS WHERE HADM_ID={}) SELECT DOB FROM ADMIT JOIN PATIENTS on ADMIT.SUBJECT_ID = PATIENTS.SUBJECT_ID"\
        .format(hadmID)).iloc[0,0]
    return (hadmID, segments, admittimeDiff, matching.iloc[0]['admittime'], dob)
def getHADMDem(self, hadmid):
    '''
    Builds a one-row demographics dataframe for a single hospital admission.
    Row index is hadmid; columns are one-hot style counts named
    "ETHNICITY:x", "ADMISSION_TYPE:x", "GENDER:x", plus an "AGE" timedelta
    (ADMITTIME - DOB).

    :param hadmid: hospital admission id to fetch demographics for
    :return: single-row dataframe indexed by hadmid
    '''
    dem = read_sql(
        "WITH ADMITDATA as (SELECT GENDER, DOB, PATIENTS.SUBJECT_ID FROM PATIENTS WHERE SUBJECT_ID in (SELECT SUBJECT_ID FROM ADMISSIONS WHERE HADM_ID = {})), SUBJECTDATA as (SELECT * FROM ADMISSIONS WHERE HADM_ID = {}) SELECT * FROM ADMITDATA JOIN SUBJECTDATA on ADMITDATA.SUBJECT_ID=SUBJECTDATA.SUBJECT_ID"
        .format(hadmid, hadmid))
    df = pd.DataFrame(columns=[hadmid])
    # accumulate value counts for each categorical column, tagging every
    # index entry with its source column name
    counts = df[hadmid]
    for column in ["ETHNICITY", "ADMISSION_TYPE", "GENDER"]:
        tagged = dem[column].value_counts().rename(
            lambda x, prefix=column: prefix + ":" + x)
        counts = counts.append(tagged)
    df[hadmid] = counts
    df = df.T
    df.loc[hadmid, "AGE"] = dem["ADMITTIME"].iloc[0] - dem["DOB"].iloc[0]
    return df
def matchRecordNameWithHADMID(recordName, time_error='6 hours'):
    '''
    Unlike waveform_traverser, which maps all records under a subject id to a
    hospital admission at one time, this only maps one record to a hadm_id.

    :param recordName: string containing subject id and time of record, as per
                       mimic3wdb convention
    :param time_error: the amount of leeway we give to matching to a hospital
                       admission
    :return: tuple of (hadm_id, admittime), where admittime is the earlier of
             the SQL ADMITTIME and the record start time
    :raises IndexError: if no admission window (padded by time_error) contains
                        the record's start time
    '''
    # record names look like pXXXXXX-...: characters 1-6 are the subject id
    subjectid = recordName[1:7]
    # DISCHTIME > ADMITTIME removes admissions that are death times
    admissions = read_sql(
        "SELECT HADM_ID, ADMITTIME, DISCHTIME from ADMISSIONS where subject_id = {} and DISCHTIME > ADMITTIME"
        .format(subjectid))
    time = nameToTimestamp(recordName)
    matching = admissions[
        (admissions["ADMITTIME"] - pd.Timedelta(time_error) < time)
        & (admissions["DISCHTIME"] + pd.Timedelta(time_error) > time)]
    # CLEANUP: removed leftover debug print(recordName); hoisted the repeated
    # matching["ADMITTIME"].iloc[0] lookup into a local
    sqlAdmittime = matching["ADMITTIME"].iloc[0]
    # use whichever came first: the recorded admission time or the record start
    admittime = time if sqlAdmittime > time else sqlAdmittime
    return matching["HADM_ID"].iloc[0], admittime
def getRecordByHADMID(self, hadmid, subjectID=None):
    '''
    Records combined by HADMID to form a cohesive history over the entire
    hospital admission (apparently multiple records can share one hadm_id).

    :param hadmid: hospital admission id to collect records for (str or int)
    :param subjectID: subject owning the admission; looked up from SQL if None
    :return: tuple of (combined dataframe built from the earliest record with
             gaps filled by later records, matching record-metadata dataframe)
    '''
    if subjectID is None:
        subjectID = read_sql(
            "SELECT SUBJECT_ID FROM ADMISSIONS WHERE HADM_ID = "
            + str(hadmid))['SUBJECT_ID'].iloc[0]
    matching = pd.DataFrame(self.traverser.matchWithHADMID(subjectID))
    matching = matching.T
    # find records from specific subjectID that have the same hadmid and order
    # by time of the record.
    # BUG FIX: cast BOTH sides to str -- previously only the column was cast,
    # so an int hadmid could never match and matching came back empty.
    matching = matching[(matching['hadmid'].astype(str) == str(hadmid))]
    matching = matching.sort_values(by=['admittime'])
    # use earliest record as base; later records only fill in missing values
    currentDF, fields = self.getRecord(matching.index[0])
    if matching.shape[0] > 1:
        for recName in matching.index[1:]:
            data, fields = self.getRecord(recName)
            currentDF = currentDF.fillna(data)
    return currentDF, matching
def extract_multiple_hadmids(hadmids):
    '''
    Work-in-progress attempt to replicate and move dependence away from
    extract_subjects.py. Specifically, it calls the database itself, instead of
    using the results of extract_subjects.py.

    TODO: add age checking to explicitly exclude pediatric patients

    :param hadmids: a list of hospital admissions to call the database on
    :postcondition: creates directory structure at new_path that contains
                    events partitioned by hadmid (episode_timeseries.csv each)
    '''
    print("beginning extraction", len(hadmids))
    for hadmid in hadmids:
        print(str(hadmid))
        events = commonDB.read_sql(
            "SELECT * FROM chartevents WHERE HADM_ID=" + str(hadmid),
            uppercase=True)
        events = events.dropna(axis=0, subset=["HADM_ID"], how="any")
        events = preprocessing.map_itemids_to_variables(events, var_map)
        try:
            events = preprocessing.clean_events(events, ranges=ranges)
        except Exception:
            # BUG FIX: was a bare `except:` that raised a brand-new
            # BaseException, discarding the original error and traceback;
            # log the context and re-raise the original instead.
            print(events.columns)
            raise
        for hadm_id in events["HADM_ID"].unique():
            # For every hadm_id we want to separate, clean, find variable
            # ranges, and get key constants out for each hadm episode
            episode = get_events_for_stay(events, hadm_id)
            timeseries = convert_events_to_timeseries(episode, variables=ranges.index)
            timeseries = add_hours_elapsed_to_events(timeseries)
            if timeseries.shape[0] == 0:
                print(' (no data!)')
                continue
            # hard coded fix, TODO: see why this fails without this first condition
            if timeseries["HEART RATE"].isnull().all() or \
               timeseries["MEAN BLOOD PRESSURE"].isnull().all() or \
               timeseries["SYSTOLIC BLOOD PRESSURE"].isnull().all():
                print("missing key values! skipping hadm_id: ", hadm_id)
                continue
            if not os.path.isdir(os.path.join(new_path, str(int(hadm_id)))):
                os.mkdir(os.path.join(new_path, str(int(hadm_id))))
            timeseries.set_index(["HOURS"], inplace=True)
            timeseries.to_csv(
                os.path.join(new_path, str(int(hadm_id)), 'episode_timeseries.csv'),
                index_label='HOURS')
            print("finished:", hadm_id)
def preliminaryCompareTimes(num_days=1):
    '''
    Sanity-check method comparing patient admittimes to waveform start times.
    Computes the difference and returns the hospital admissions that pair with
    each waveform start time.

    Unlike compareAdmitToWF, this DOES NOT PULL WAVEFORM data, so it is much
    faster but misses many possible stats. TLDR: a simpler, faster, less
    comprehensive version of compareAdmitToWf.

    :param num_days: max allowed difference (in days) between the admittime and
                     the wfStartTime for a pair to be kept, default 1
    :return: a dataframe to characterize results (also prints shape and the
             mean time difference)
    '''
    admissions = commonDB.read_sql(
        "SELECT subject_id, admittime FROM admissions", uppercase=False)
    subjects, times = listAllMatchedWFSubjects()
    # times holds the recording start of each waveform file
    waveforms = pd.DataFrame({"subject_id": subjects, "wfStartTime": times})
    waveforms["subject_id"] = waveforms["subject_id"].astype(np.number)
    # convert the odd waveform time format into real timestamps
    waveforms["wfStartTime"] = waveforms["wfStartTime"].apply(preliminaryCompareTimesHelper)
    print(admissions.columns, waveforms.columns)
    merged = pd.merge(waveforms, admissions,
                      left_on="subject_id", right_on="subject_id")
    merged["timeDiff"] = merged["wfStartTime"] - merged["admittime"]
    # keep waveforms starting after admittime and within num_days of it
    window = pd.Timedelta(str(num_days) + " days")
    keep = (merged["timeDiff"] > pd.Timedelta(0)) & (merged["timeDiff"] < window)
    merged = merged[keep]
    merged["rawTimeDiff"] = merged["timeDiff"].astype(np.int64)
    print(merged.shape)
    print(pd.Timedelta(merged["timeDiff"].astype(np.int64).mean()))
    return merged