def getQSofaCategorization(hadm_ids=None, subject_ids=None):
    """
    Partially implemented qSOFA (quick Sepsis-related Organ Failure Assessment)
    categorization, as defined by the Third International Consensus for Sepsis.
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4968574/

    TODO: Complete this later -- the three criterion queries below are executed
    but never combined into a score, so this function currently returns None.

    :param hadm_ids: a list of hadm_ids which to apply the standard to;
                     defaults to every admission when both params are None
    :param subject_ids: a list of subject_ids which to apply the standard to.
                        If hadm_ids is set, this doesn't do anything
    :return: (intended) dataframe with hadm_ids as index and a sepsis column;
             currently always None
    """
    if hadm_ids is None and subject_ids is None:
        hadm_ids = commonDB.read_sql("SELECT hadm_id FROM admissions")
    elif hadm_ids is None:
        # BUG FIX: original passed an undefined name `conn` as a second
        # argument here, which raised NameError whenever this branch ran.
        hadm_ids = commonDB.read_sql(
            "SELECT hadm_id FROM admissions WHERE subject_id in "
            + commonDB.convertListToSQL(subject_ids))
    # qSOFA criterion 1: altered mentation (Glasgow Coma Scale < 15)
    alteredMentalVerbal = commonDB.read_sql(
        "SELECT hadm_id FROM chartevents "
        + "WHERE valuenum < 15 and itemid = 227013 and hadm_id in "
        + commonDB.convertListToSQL(hadm_ids))
    # qSOFA criterion 2: respiratory rate >= 22 breaths per minute
    increasedRespiration = commonDB.read_sql(
        "SELECT hadm_id FROM chartevents "
        + "WHERE valuenum >= 22 and itemid = 224690 and hadm_id in "
        + commonDB.convertListToSQL(hadm_ids))
    # qSOFA criterion 3: systolic blood pressure <= 100 mmHg
    # (renamed from increasedSystolicBP: the query selects *low* systolic BP)
    decreasedSystolicBP = commonDB.read_sql(
        "SELECT hadm_id FROM chartevents "
        + 'WHERE valuenum <= 100 and itemid in (228152, 220050, 442, 455, 6, 51, 3313, 3317, 3319, 3321, 3323) and hadm_id in '
        + commonDB.convertListToSQL(hadm_ids))
    # TODO: Complete this later -- combine the three criteria into a per-hadm_id score
    return None
def preliminaryCompareTimesICU(num_days=1):
    '''
    Sanity-check helper, analogous to preliminaryCompareTimes but measuring
    waveform start times against ICUSTAYS intime instead of ADMISSIONS admittime.

    :param num_days: upper bound (in days) on the intime-to-waveform gap to keep
    :return: merged dataframe of matched waveforms and icustays with timeDiff /
             rawTimeDiff columns; also prints the mean time difference
    '''
    icustays = commonDB.read_sql("SELECT * FROM ICUSTAYS", uppercase=False)
    subjects, times = listAllMatchedWFSubjects()
    # times holds the recording start of each waveform file
    waveforms = pd.DataFrame({"subject_id": subjects, "wfStartTime": times})
    waveforms["subject_id"] = waveforms["subject_id"].astype(np.number)
    # convert the odd waveform time format into real timestamps
    waveforms["wfStartTime"] = waveforms["wfStartTime"].apply(preliminaryCompareTimesHelper)
    merged = pd.merge(waveforms, icustays,
                      left_on="subject_id", right_on="subject_id")
    merged["timeDiff"] = merged["wfStartTime"] - merged["intime"]
    # keep only waveforms that start after intime and within num_days of it
    window = pd.Timedelta(str(num_days) + " days")
    keep = (merged["timeDiff"] > pd.Timedelta(0)) & (merged["timeDiff"] < window)
    merged = merged[keep]
    merged["rawTimeDiff"] = merged["timeDiff"].astype(np.int64)
    print(pd.Timedelta(merged["timeDiff"].astype(np.int64).mean()))
    return merged
def matchWithHADMID(self, subjectid, p_appended=False, time_error="6 hours"):
    '''
    Goes through each fileDateMap entry to match each key (waveform file) to a
    hospital admission.

    :param subjectid: subject whose multi-record waveform files to match
    :param p_appended: if True, subjectid carries a leading 'p' which is
                       stripped before the SQL query
    :param time_error: the amount of time before ADMITTIME and after DISCHTIME
                       to still consider a record as part of an admission
    :return: a Dict which matches hadm_id with the waveform file, as well as
             including an admittime (and the record start time) per file;
             hadmid is "NOT FOUND" when no admission window matched
    '''
    subjectFiles = self.getMultiRecordFiles(subjectid, p_appended)
    fileDateMap = Dict()
    for subjectFile in subjectFiles:
        # the filename encodes the record start time at fixed character
        # offsets: pXXXXXX-YYYY-MM-DD-hh-mm
        time = Timestamp(year=int(subjectFile[8:12]),
                         month=int(subjectFile[13:15]),
                         day=int(subjectFile[16:18]),
                         hour=int(subjectFile[19:21]),
                         minute=int(subjectFile[22:24]))
        fileDateMap[subjectFile] = time
    if p_appended:
        subjectid = subjectid[1:]  # drop the p
    # DISCHTIME > ADMITTIME removes any admissions which are death times
    admissions = read_sql("SELECT HADM_ID, ADMITTIME, DISCHTIME from ADMISSIONS where subject_id = {} and DISCHTIME > ADMITTIME".format(subjectid))
    fileAdmissionMap = Dict()
    for waveform in fileDateMap.keys():
        time = fileDateMap[waveform]
        matching = admissions["HADM_ID"][(admissions["ADMITTIME"] - Timedelta(time_error) < time)
                                         & (admissions["DISCHTIME"] + Timedelta(time_error) > time)]
        if (len(matching.values) != 0):
            admittime = admissions[admissions["HADM_ID"] == matching.iloc[0]]["ADMITTIME"].iloc[0]
            # assume that admissions don't overlap for a single subject id
            fileAdmissionMap[waveform].hadmid = matching.iloc[0]
            # BUG FIX: startTime used to be assigned on fileAdmissionMap itself
            # (fileAdmissionMap.startTime), clobbered on every iteration; it is
            # now stored per waveform file as the adjacent comment intended.
            fileAdmissionMap[waveform].startTime = time
            # store either the admittime from sql or the start of records for
            # admittime, depending on which was first
            if (admittime > time):
                fileAdmissionMap[waveform].admittime = time
            else:
                fileAdmissionMap[waveform].admittime = admittime
        else:
            fileAdmissionMap[waveform].hadmid = "NOT FOUND"
    return fileAdmissionMap
def analyze(self, hadmID):
    '''
    Analyzes the combined waveform record for one hospital admission: cleans
    outliers, finds contiguous data segments vs gaps, and pulls the patient DOB.

    :param hadmID: hospital admission id to analyze
    :return: tuple of (hadmID, segment-length Series grouped by (isSeg, block),
             difference between first record time and admittime, admittime, dob)
    '''
    data, matching = self.reader.getRecordByHADMID(hadmID)
    admittimeDiff = data.index[0] - matching.iloc[0]['admittime']
    # truncate timestamps to minute precision (drops seconds/microseconds)
    data.index = data.index.map(lambda date: pd.Timestamp(year=date.year, month=date.month, day=date.day, hour=date.hour, minute=date.minute))
    if self.reader.columnsToUse is not None:
        # null out physiologically impossible values per variable_ranges bounds
        for col in self.reader.columnsToUse:
            data.loc[data[col] < self.variable_ranges["OUTLIER_LOW"][col], col] = np.nan
            data.loc[data[col] > self.variable_ranges["OUTLIER_HIGH"][col], col] = np.nan
    partOfSegment = pd.DataFrame(index=pd.RangeIndex(data.shape[0]), columns=["isSeg", "block"])  #is truee or false, depending on window of data
    #figure out which parts of the data are part of a segment or part of a missing segment
    tempData = data.fillna(-1).reset_index(drop=True)  #rolling window has incorrect behavior when analyzing nulls, use -1 as placeholder instead
    # a row is "in a segment" unless some column was missing (-1) for the whole
    # trailing window of length num_missing_most
    partOfSegment.loc[:,'isSeg'] = ~tempData.rolling(self.num_missing_most).apply(lambda win: (win == -1).all()).any(axis=1)
    #edge cases
    if data.shape[0] <= self.num_missing_most:
        # record shorter than one window: rolling never produced a full window
        partOfSegment.loc[:, 'isSeg'] = False
    else:
        partOfSegment.loc[0:self.num_missing_most, 'isSeg'] = partOfSegment.loc[self.num_missing_most, 'isSeg']  #we can always carry over the result for the first n
    # label maximal runs of identical isSeg values with an increasing block id
    partOfSegment['block'] = (partOfSegment['isSeg'].shift(1) != partOfSegment['isSeg']).astype(int).cumsum()
    # length of each contiguous segment / gap
    segments = partOfSegment.groupby(['isSeg','block']).apply(len)
    dob = read_sql("WITH admit as (SELECT SUBJECT_ID FROM ADMISSIONS WHERE HADM_ID={}) SELECT DOB FROM ADMIT JOIN PATIENTS on ADMIT.SUBJECT_ID = PATIENTS.SUBJECT_ID"\
        .format(hadmID)).iloc[0,0]
    return (hadmID, segments, admittimeDiff, matching.iloc[0]['admittime'], dob)
def getHADMDem(self, hadmid):
    '''
    Builds a one-row demographics dataframe for a single hospital admission.
    Row index is hadmid; columns are one-hot style counts named
    "ETHNICITY:x", "ADMISSION_TYPE:x", "GENDER:x", plus an "AGE" timedelta
    (ADMITTIME - DOB).

    :param hadmid: hospital admission id to fetch demographics for
    :return: single-row dataframe indexed by hadmid
    '''
    dem = read_sql(
        "WITH ADMITDATA as (SELECT GENDER, DOB, PATIENTS.SUBJECT_ID FROM PATIENTS WHERE SUBJECT_ID in (SELECT SUBJECT_ID FROM ADMISSIONS WHERE HADM_ID = {})), SUBJECTDATA as (SELECT * FROM ADMISSIONS WHERE HADM_ID = {}) SELECT * FROM ADMITDATA JOIN SUBJECTDATA on ADMITDATA.SUBJECT_ID=SUBJECTDATA.SUBJECT_ID"
        .format(hadmid, hadmid))
    df = pd.DataFrame(columns=[hadmid])
    # accumulate value counts for each categorical column, tagging every
    # index entry with its source column name
    counts = df[hadmid]
    for column in ["ETHNICITY", "ADMISSION_TYPE", "GENDER"]:
        tagged = dem[column].value_counts().rename(
            lambda x, prefix=column: prefix + ":" + x)
        counts = counts.append(tagged)
    df[hadmid] = counts
    df = df.T
    df.loc[hadmid, "AGE"] = dem["ADMITTIME"].iloc[0] - dem["DOB"].iloc[0]
    return df
def matchRecordNameWithHADMID(recordName, time_error='6 hours'):
    '''
    Unlike waveform_traverser, which maps all records under a subject id to a
    hospital admission at one time, this only maps one record to a hadm_id.

    :param recordName: string containing subject id and time of record, as per
                       mimic3wdb convention
    :param time_error: the amount of leeway we give to matching to a hospital
                       admission
    :return: tuple of (hadm_id, admittime), where admittime is the earlier of
             the SQL ADMITTIME and the record start time
    :raises IndexError: if no admission window (padded by time_error) contains
                        the record's start time
    '''
    # record names look like pXXXXXX-...: characters 1-6 are the subject id
    subjectid = recordName[1:7]
    # DISCHTIME > ADMITTIME removes admissions that are death times
    admissions = read_sql(
        "SELECT HADM_ID, ADMITTIME, DISCHTIME from ADMISSIONS where subject_id = {} and DISCHTIME > ADMITTIME"
        .format(subjectid))
    time = nameToTimestamp(recordName)
    matching = admissions[
        (admissions["ADMITTIME"] - pd.Timedelta(time_error) < time)
        & (admissions["DISCHTIME"] + pd.Timedelta(time_error) > time)]
    # CLEANUP: removed leftover debug print(recordName); hoisted the repeated
    # matching["ADMITTIME"].iloc[0] lookup into a local
    sqlAdmittime = matching["ADMITTIME"].iloc[0]
    # use whichever came first: the recorded admission time or the record start
    admittime = time if sqlAdmittime > time else sqlAdmittime
    return matching["HADM_ID"].iloc[0], admittime
def getRecordByHADMID(self, hadmid, subjectID=None):
    '''
    Records combined by HADMID to form a cohesive history over the entire
    hospital admission (apparently multiple records can share one hadm_id).

    :param hadmid: hospital admission id to collect records for (str or int)
    :param subjectID: subject owning the admission; looked up from SQL if None
    :return: tuple of (combined dataframe built from the earliest record with
             gaps filled by later records, matching record-metadata dataframe)
    '''
    if subjectID is None:
        subjectID = read_sql(
            "SELECT SUBJECT_ID FROM ADMISSIONS WHERE HADM_ID = "
            + str(hadmid))['SUBJECT_ID'].iloc[0]
    matching = pd.DataFrame(self.traverser.matchWithHADMID(subjectID))
    matching = matching.T
    # find records from specific subjectID that have the same hadmid and order
    # by time of the record.
    # BUG FIX: cast BOTH sides to str -- previously only the column was cast,
    # so an int hadmid could never match and matching came back empty.
    matching = matching[(matching['hadmid'].astype(str) == str(hadmid))]
    matching = matching.sort_values(by=['admittime'])
    # use earliest record as base; later records only fill in missing values
    currentDF, fields = self.getRecord(matching.index[0])
    if matching.shape[0] > 1:
        for recName in matching.index[1:]:
            data, fields = self.getRecord(recName)
            currentDF = currentDF.fillna(data)
    return currentDF, matching
def extract_multiple_hadmids(hadmids):
    '''
    Work-in-progress attempt to replicate and move dependence away from
    extract_subjects.py. Specifically, it calls the database itself, instead of
    using the results of extract_subjects.py.

    TODO: add age checking to explicitly exclude pediatric patients

    :param hadmids: a list of hospital admissions to call the database on
    :postcondition: creates directory structure at new_path that contains
                    events partitioned by hadmid (episode_timeseries.csv each)
    '''
    print("beginning extraction", len(hadmids))
    for hadmid in hadmids:
        print(str(hadmid))
        events = commonDB.read_sql(
            "SELECT * FROM chartevents WHERE HADM_ID=" + str(hadmid),
            uppercase=True)
        events = events.dropna(axis=0, subset=["HADM_ID"], how="any")
        events = preprocessing.map_itemids_to_variables(events, var_map)
        try:
            events = preprocessing.clean_events(events, ranges=ranges)
        except Exception:
            # BUG FIX: was a bare `except:` that raised a brand-new
            # BaseException, discarding the original error and traceback;
            # log the context and re-raise the original instead.
            print(events.columns)
            raise
        for hadm_id in events["HADM_ID"].unique():
            # For every hadm_id we want to separate, clean, find variable
            # ranges, and get key constants out for each hadm episode
            episode = get_events_for_stay(events, hadm_id)
            timeseries = convert_events_to_timeseries(episode, variables=ranges.index)
            timeseries = add_hours_elapsed_to_events(timeseries)
            if timeseries.shape[0] == 0:
                print(' (no data!)')
                continue
            # hard coded fix, TODO: see why this fails without this first condition
            if timeseries["HEART RATE"].isnull().all() or \
               timeseries["MEAN BLOOD PRESSURE"].isnull().all() or \
               timeseries["SYSTOLIC BLOOD PRESSURE"].isnull().all():
                print("missing key values! skipping hadm_id: ", hadm_id)
                continue
            if not os.path.isdir(os.path.join(new_path, str(int(hadm_id)))):
                os.mkdir(os.path.join(new_path, str(int(hadm_id))))
            timeseries.set_index(["HOURS"], inplace=True)
            timeseries.to_csv(
                os.path.join(new_path, str(int(hadm_id)), 'episode_timeseries.csv'),
                index_label='HOURS')
            print("finished:", hadm_id)
def preliminaryCompareTimes(num_days=1):
    '''
    Sanity-check method comparing patient admittimes to waveform start times.
    Computes the difference and returns the hospital admissions that pair with
    each waveform start time.

    Unlike compareAdmitToWF, this DOES NOT PULL WAVEFORM data, so it is much
    faster but misses many possible stats. TLDR: a simpler, faster, less
    comprehensive version of compareAdmitToWf.

    :param num_days: max allowed difference (in days) between the admittime and
                     the wfStartTime for a pair to be kept, default 1
    :return: a dataframe to characterize results (also prints shape and the
             mean time difference)
    '''
    admissions = commonDB.read_sql(
        "SELECT subject_id, admittime FROM admissions", uppercase=False)
    subjects, times = listAllMatchedWFSubjects()
    # times holds the recording start of each waveform file
    waveforms = pd.DataFrame({"subject_id": subjects, "wfStartTime": times})
    waveforms["subject_id"] = waveforms["subject_id"].astype(np.number)
    # convert the odd waveform time format into real timestamps
    waveforms["wfStartTime"] = waveforms["wfStartTime"].apply(preliminaryCompareTimesHelper)
    print(admissions.columns, waveforms.columns)
    merged = pd.merge(waveforms, admissions,
                      left_on="subject_id", right_on="subject_id")
    merged["timeDiff"] = merged["wfStartTime"] - merged["admittime"]
    # keep waveforms starting after admittime and within num_days of it
    window = pd.Timedelta(str(num_days) + " days")
    keep = (merged["timeDiff"] > pd.Timedelta(0)) & (merged["timeDiff"] < window)
    merged = merged[keep]
    merged["rawTimeDiff"] = merged["timeDiff"].astype(np.int64)
    print(merged.shape)
    print(pd.Timedelta(merged["timeDiff"].astype(np.int64).mean()))
    return merged