Example #1
def filter_events(events, indx_date, deliverables_path):
    '''
    1. Join indx_date with events on patient_id
    2. Filter events occurring in the observation window (IndexDate - 2000 days to IndexDate)
    
    '''
    """
    print indx_date

    def date_window(x, no_days=2000):
        return x - timedelta(days=no_days)
    
    print indx_date
    indx_date['pre'] = indx_date['indx_date'].apply(date_window)
    print indx_date['pre']
    filtered_events = pd.merge(events, indx_date, on='patient_id', how='outer')
    filtered_events = filtered_events[(filtered_events.timestamp >= filtered_events.pre) & (filtered_events.timestamp <= filtered_events.indx_date)]
    """
    merged = pd.merge(events, indx_date, how='outer', on='patient_id')

    merged['diff'] = merged.apply(
        lambda x: (x['indx_date'] - date_convert(x['timestamp'])).days, axis=1)
    filtered = merged[(merged['diff'] <= 2000) & (merged['diff'] >= 0)]

    filtered_events = filtered[['patient_id', 'event_id', 'value']]
    filtered_events.to_csv(deliverables_path + 'etl_filtered_events.csv',
                           columns=['patient_id', 'event_id', 'value'],
                           index=False)
    # print(events.index.size, filtered_events.index.size)
    return filtered_events
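All of these snippets lean on the same scaffolding: pandas/numpy/datetime imports plus a pair of date helpers (date_convert, date_offset) that come from the assignment's utils module and are not shown on this page; the examples call them either directly or as utils.date_convert / utils.date_offset. A minimal sketch of what that scaffolding presumably looks like (the 'YYYY-MM-DD' timestamp format is an assumption):

import pandas as pd
import numpy as np
from datetime import datetime, timedelta

def date_convert(date_str):
    # Assumed helper: parse a 'YYYY-MM-DD' timestamp string into a datetime.
    return datetime.strptime(date_str, '%Y-%m-%d')

def date_offset(date_str, offset_days):
    # Assumed helper: shift a timestamp string by offset_days days
    # (Example #7 calls this as utils.date_offset(x, -30)).
    return date_convert(date_str) + timedelta(days=offset_days)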
Example #2
def filter_events(events, indx_date, deliverables_path):
    
    '''
    TODO: This function needs to be completed.

    Refer to instructions in Q3 b

    Suggested steps:
    1. Join indx_date with events on patient_id
    2. Filter events occurring in the observation window (IndexDate - 2000 days to IndexDate)
    
    
    
    IMPORTANT:
    Save filtered_events to a csv file in the deliverables folder named as etl_filtered_events.csv. 
    Use the global variable deliverables_path while specifying the filepath. 
    Each row is of the form patient_id, event_id, value.
    The csv file should have a header 
    For example if you are using Pandas, you could write: 
        filtered_events.to_csv(deliverables_path + 'etl_filtered_events.csv', columns=['patient_id', 'event_id', 'value'], index=False)

    Return filtered_events
    '''
    
    indx_date["lower_bound"] = indx_date.indx_date.apply(lambda x: 
        (x - timedelta(days=2000)))
        
    events["timestamp"] = events.timestamp.apply(lambda x: utils.date_convert(x))
    #events["timestamp"] = events["timestamp"].apply(dateutil.parser.parse, dayfirst=False)
    
    joint_events = pd.merge(events, indx_date, on='patient_id')
    filtered_events = joint_events[(joint_events["timestamp"] <= joint_events["indx_date"]) & (joint_events["timestamp"] >= joint_events["lower_bound"])]
    
    filtered_events.to_csv(deliverables_path + 'etl_filtered_events.csv', columns=['patient_id', 'event_id', 'value'], index=False)
    return filtered_events
Example #3
def filter_events(events, indx_date, deliverables_path):
    
    '''
    TODO: This function needs to be completed.

    Refer to instructions in Q3 a

    Suggested steps:
    1. Join indx_date with events on patient_id
    2. Filter events occurring in the observation window (IndexDate - 2000 days to IndexDate)
    
    
    IMPORTANT:
    Save filtered_events to a csv file in the deliverables folder named as etl_filtered_events.csv. 
    Use the global variable deliverables_path while specifying the filepath. 
    Each row is of the form patient_id, event_id, value.
    The csv file should have a header 
    For example if you are using Pandas, you could write: 
        filtered_events.to_csv(deliverables_path + 'etl_filtered_events.csv', columns=['patient_id', 'event_id', 'value'], index=False)

    Return filtered_events
    '''

    filtered_events = pd.merge(events, indx_date, on='patient_id')
    
    filtered_events['target'] = filtered_events.timestamp.apply(lambda x: utils.date_convert(x))
    filtered_events['lower_bound'] = filtered_events.indx_date.apply(
        lambda x: x - timedelta(days=2000))
    filtered_events = filtered_events[(filtered_events['target'] <= filtered_events['indx_date'])
     & (filtered_events['target'] >= filtered_events['lower_bound'])]
   
    filtered_events.to_csv(deliverables_path + 'etl_filtered_events.csv', 
        columns=['patient_id', 'event_id', 'value'], index=False)
  
    return filtered_events
Example #4
def record_length_metrics(events, mortality):
    '''
    Record length is the duration between the first event and the last event for a given patient. 
    This function needs to be completed.
    '''
    #events.timestamp.apply(dateutil.parser.parse)

    event_dead = events[events['patient_id'].isin(mortality['patient_id'])]
    event_alive = events[~events['patient_id'].isin(mortality['patient_id'])]

    #event_dead['timestamp_y']=pd.datetime.strptime(event_dead['timestamp_y'],'%Y-%m-%d')

    dead_grouped = event_dead.sort_values('timestamp').groupby(['patient_id'])
    dead = dead_grouped.first().reset_index()[['patient_id', 'timestamp']]

    dead['last'] = dead_grouped.last().reset_index()['timestamp']

    dead['diff'] = dead.apply(
        lambda x:
        (date_convert(x['last']) - date_convert(x['timestamp'])).days,
        axis=1)

    alive_grouped = event_alive.sort_values('timestamp').groupby(['patient_id'])
    alive = alive_grouped.first().reset_index()[['patient_id', 'timestamp']]

    alive['last'] = alive_grouped.last().reset_index()['timestamp']

    alive['diff'] = alive.apply(
        lambda x:
        (date_convert(x['last']) - date_convert(x['timestamp'])).days,
        axis=1)

    avg_dead_rec_len = dead['diff'].mean()

    max_dead_rec_len = dead['diff'].max()

    min_dead_rec_len = dead['diff'].min()

    avg_alive_rec_len = alive['diff'].mean()

    max_alive_rec_len = alive['diff'].max()

    min_alive_rec_len = alive['diff'].min()

    return min_dead_rec_len, max_dead_rec_len, avg_dead_rec_len, min_alive_rec_len, max_alive_rec_len, avg_alive_rec_len
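For comparison, the same six statistics can be computed with a single groupby aggregation. This is a hypothetical refactor of the function above, not the graded solution; it assumes the same events/mortality DataFrames and uses pd.to_datetime in place of the manual date_convert calls:

def record_length_metrics_compact(events, mortality):
    events = events.copy()
    # Parse timestamps once so min/max and subtraction work on datetime64 values.
    events['timestamp'] = pd.to_datetime(events['timestamp'])

    # Record length per patient: days between the first and the last event.
    span = events.groupby('patient_id')['timestamp'].agg(['min', 'max'])
    rec_len = (span['max'] - span['min']).dt.days

    # Split by mortality status using the patient ids listed in mortality.
    is_dead = rec_len.index.isin(mortality['patient_id'])
    dead, alive = rec_len[is_dead], rec_len[~is_dead]

    return (dead.min(), dead.max(), dead.mean(),
            alive.min(), alive.max(), alive.mean())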
Example #5
def filter_events(events, indx_date, deliverables_path):
    
    '''
    TODO: This function needs to be completed.

    Refer to instructions in Q3 a

    Suggested steps:
    1. Join indx_date with events on patient_id
    2. Filter events occurring in the observation window (IndexDate - 2000 days to IndexDate)
    
    
    IMPORTANT:
    Save filtered_events to a csv file in the deliverables folder named as etl_filtered_events.csv. 
    Use the global variable deliverables_path while specifying the filepath. 
    Each row is of the form patient_id, event_id, value.
    The csv file should have a header 
    For example if you are using Pandas, you could write: 
        filtered_events.to_csv(deliverables_path + 'etl_filtered_events.csv', columns=['patient_id', 'event_id', 'value'], index=False)

    Return filtered_events
    '''
    
    #indx_date_day=indx_date['indx_date'].apply(lambda x: x.days)
    A = pd.merge(events, indx_date, how='outer', on='patient_id')
    
    #for line in A:
        
    #[x for x in X if P(x)]
    #list=A[A.indx_date for x in X if utils.date_convert(x)>A.timestamp>utils.date_convert(A.indx_date)-timedelta(days=2000)]
   
    #filtered_events = pd.DataFrame()

    #C=[]
    #for r in zip(A['patient_id'], A['timestamp'], A['indx_date'],A['event_id'],A['value']):
    #    if r[2]-timedelta(days=2000)<utils.date_convert(r[1]).date()<r[2]:
    #        filtered_events = filtered_events.append({'patient_id': r[0], 'event_id': r[3], 'value': r[4]},ignore_index=True)
            #C.append((r[0], r[2]))
            
   
    A.timestamp = A.timestamp.apply(lambda x: utils.date_convert(x).date())
    filtered_events = A[(A['indx_date'] > A['timestamp']) & (A['timestamp'] > A['indx_date'] - timedelta(days=2000))]
    
    
    #A["filter"] = A.apply(lambda row: 1 if row['indx_date'] > row['timestamp'] > row['indx_date']-timedelta(days=2000) else np.nan,axis=1)
    
    #A['timestamp']=A.apply(lambda row: utils.date_convert(row['timestamp']).date(),axis=1)
    #A["filter"] = A.apply(lambda row: row['indx_date'] if row['indx_date'] > row['timestamp'] > row['indx_date']-timedelta(days=2000) else np.nan,axis=1)
  
   
    filtered_events = filtered_events.sort_values('patient_id')
    filtered_events.to_csv(deliverables_path + 'etl_filtered_events.csv', columns=['patient_id', 'event_id', 'value'], index=False)
    return filtered_events
Example #6
def calculate_index_date(events, mortality, deliverables_path):
    
    '''
    TODO: This function needs to be completed.

    Refer to instructions in Q3 a

    Suggested steps:
    1. Create list of patients alive ( mortality_events.csv only contains information about patients deceased)
    2. Split events into two groups based on whether the patient is alive or deceased
    3. Calculate index date for each patient
    
    IMPORTANT:
    Save indx_date to a csv file in the deliverables folder named as etl_index_dates.csv. 
    Use the global variable deliverables_path while specifying the filepath. 
    Each row is of the form patient_id, indx_date.
    The csv file should have a header 
    For example if you are using Pandas, you could write: 
        indx_date.to_csv(deliverables_path + 'etl_index_dates.csv', columns=['patient_id', 'indx_date'], index=False)

    Return indx_date
    '''
    
    patients = np.unique(events["patient_id"].values.ravel())
    dead_patients = np.unique(mortality["patient_id"].values.ravel())
    alive_patients = np.setdiff1d(patients, dead_patients)
    
    alive_events = events.loc[events["patient_id"].isin(alive_patients)]
    
    indx_alive_dates = alive_events.groupby(["patient_id"], as_index=False)[["timestamp"]].max()
    
    indx_alive_dates["timestamp"] = indx_alive_dates.timestamp.apply(lambda x: utils.date_convert(x))
    #indx_alive_dates["timestamp"] = indx_alive_dates["timestamp"].apply(dateutil.parser.parse, dayfirst=False)
    indx_alive_dates["timestamp"] = indx_alive_dates["timestamp"].apply(lambda x: x.strftime('%Y-%m-%d'))
    
    indx_dead_dates = pd.DataFrame()
    indx_dead_dates["patient_id"] = mortality["patient_id"]
    indx_dead_dates["timestamp"] = mortality.timestamp.apply(lambda x: 
        (datetime.strptime(x, "%Y-%m-%d") - timedelta(days=30)).strftime("%Y-%m-%d"))
    
    
    indx_date = pd.concat([indx_alive_dates, indx_dead_dates]).reset_index(drop=True)
    indx_date.columns = ["patient_id", "indx_date"]
    indx_date.to_csv(deliverables_path + 'etl_index_dates.csv', index=False)

    indx_date['indx_date'] = indx_date.indx_date.apply(
        lambda x: datetime.strptime(x, "%Y-%m-%d"))
    return indx_date
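Taken together, calculate_index_date and filter_events form a small ETL pipeline: compute one index date per patient, then keep only events inside the 2000-day observation window ending at that date. A hypothetical usage sketch (the raw CSV paths are assumptions; only mortality_events.csv is named in the assignment text):

events = pd.read_csv('data/events.csv')               # patient_id, event_id, timestamp, value
mortality = pd.read_csv('data/mortality_events.csv')  # patient_id, timestamp, label
deliverables_path = 'deliverables/'

indx_date = calculate_index_date(events, mortality, deliverables_path)
filtered_events = filter_events(events, indx_date, deliverables_path)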
Example #7
def calculate_index_date(events, mortality, deliverables_path):
    
    '''
    TODO: This function needs to be completed.
    
    

    Refer to instructions in Q3 a

    Suggested steps:
    1. Create list of patients alive ( mortality_events.csv only contains information about patients deceased)
    2. Split events into two groups based on whether the patient is alive or deceased
    3. Calculate index date for each patient
    
    IMPORTANT:
    Save indx_date to a csv file in the deliverables folder named as etl_index_dates.csv. 
    Use the global variable deliverables_path while specifying the filepath. 
    Each row is of the form patient_id, indx_date.
    The csv file should have a header 
    For example if you are using Pandas, you could write: 
        indx_date.to_csv(deliverables_path + 'etl_index_dates.csv', columns=['patient_id', 'indx_date'], index=False)

    Return indx_date
    '''
    A = pd.merge(events, mortality, how='outer', on='patient_id')
    list_dead = A[A.label == 1]
    list_alive = A[A.label != 1]
    # print(list_alive)
    event_alive = list_alive.groupby('patient_id')['timestamp_x'].apply(lambda x: utils.date_convert(x.max()).date())
    # print(event_alive)
    event_dead = list_dead.groupby('patient_id')['timestamp_y'].apply(lambda x: utils.date_offset(x.max(), -30).date())
    # print(event_dead)
    
    df_alive = pd.DataFrame({'patient_id': event_alive.index, 'indx_date': event_alive.values})
    df_dead = pd.DataFrame({'patient_id': event_dead.index, 'indx_date': event_dead.values})
    indx_date = pd.concat([df_alive, df_dead])
    
    indx_date = indx_date.sort_values(['patient_id', 'indx_date'], ascending=[True, True])
    indx_date.to_csv(deliverables_path + 'etl_index_dates.csv', columns=['patient_id', 'indx_date'], index=False)
    # print(indx_date)
    return indx_date
Example #8
def filter_events(events, indx_date, deliverables_path):
    
    '''
    
    1. Join indx_date with events on patient_id
    2. Filter events occurring in the observation window (IndexDate - 2000 days to IndexDate)
    
    
    IMPORTANT:
    Save filtered_events to a csv file in the deliverables folder named as etl_filtered_events.csv. 
    Use the global variable deliverables_path while specifying the filepath. 
    Each row is of the form patient_id, event_id, value.
    The csv file should have a header 
    For example if you are using Pandas, you could write: 
        filtered_events.to_csv(deliverables_path + 'etl_filtered_events.csv', columns=['patient_id', 'event_id', 'value'], index=False)

    Return filtered_events
    '''
    """
    print indx_date

    def date_window(x, no_days=2000):
        return x - timedelta(days=no_days)
    
    print indx_date
    indx_date['pre'] = indx_date['indx_date'].apply(date_window)
    print indx_date['pre']
    filtered_events = pd.merge(events, indx_date, on='patient_id', how='outer')
    filtered_events = filtered_events[(filtered_events.timestamp >= filtered_events.pre) & (filtered_events.timestamp <= filtered_events.indx_date)]
    """
    merged = pd.merge(events, indx_date, how='outer', on='patient_id')

    merged['diff'] = merged.apply(
        lambda x: (x['indx_date'] - date_convert(x['timestamp'])).days, axis=1)
    filtered = merged[(merged['diff'] <= 2000) & (merged['diff'] >= 0)]

    filtered_events = filtered[['patient_id', 'event_id', 'value']]
    filtered_events.to_csv(deliverables_path + 'etl_filtered_events.csv',
                           columns=['patient_id', 'event_id', 'value'],
                           index=False)
    # print(events.index.size, filtered_events.index.size)
    return filtered_events