# Shared imports for the snippets below. The `utils` module ships with the
# assignment and provides the date helpers (date_convert, date_offset) used
# throughout; the bare date_convert calls below are assumed to come from it.
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

import utils
from utils import date_convert
def filter_events(events, indx_date, deliverables_path):
    '''
    Refer to instructions in Q3 b

    Steps:
    1. Join indx_date with events on patient_id
    2. Filter events occurring in the observation window
       (indx_date - 2000 days to indx_date, inclusive)

    Saves filtered_events to deliverables_path + 'etl_filtered_events.csv'.
    Each row is of the form patient_id, event_id, value and the csv file
    has a header. Returns filtered_events.
    '''
    # The observation window opens 2000 days before each index date.
    indx_date["lower_bound"] = indx_date.indx_date.apply(
        lambda x: x - timedelta(days=2000))

    # Parse raw timestamp strings into datetimes so they compare correctly.
    events["timestamp"] = events.timestamp.apply(lambda x: utils.date_convert(x))

    # Attach each patient's index date and window bound to their events.
    joint_events = pd.merge(events, indx_date, on='patient_id')
    filtered_events = joint_events[
        (joint_events["timestamp"] <= joint_events["indx_date"]) &
        (joint_events["timestamp"] >= joint_events["lower_bound"])]

    filtered_events.to_csv(deliverables_path + 'etl_filtered_events.csv',
                           columns=['patient_id', 'event_id', 'value'],
                           index=False)
    return filtered_events
def filter_events(events, indx_date, deliverables_path):
    '''
    Refer to instructions in Q3 a

    Steps:
    1. Join indx_date with events on patient_id
    2. Filter events occurring in the observation window
       (indx_date - 2000 days to indx_date, inclusive)

    Saves filtered_events to deliverables_path + 'etl_filtered_events.csv'.
    Each row is of the form patient_id, event_id, value and the csv file
    has a header. Returns filtered_events.
    '''
    filtered_events = pd.merge(events, indx_date, on='patient_id')

    # Parse event timestamps and compute the start of each patient's window.
    filtered_events['target'] = filtered_events.timestamp.apply(
        lambda x: utils.date_convert(x))
    filtered_events['lower_bound'] = filtered_events.indx_date.apply(
        lambda x: x - timedelta(days=2000))

    # Keep only events inside [lower_bound, indx_date].
    filtered_events = filtered_events[
        (filtered_events['target'] <= filtered_events['indx_date']) &
        (filtered_events['target'] >= filtered_events['lower_bound'])]

    filtered_events.to_csv(deliverables_path + 'etl_filtered_events.csv',
                           columns=['patient_id', 'event_id', 'value'],
                           index=False)
    return filtered_events
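# The filter_events variants above parse and compare timestamps row by row
# with apply. A sketch of an equivalent vectorized filter, assuming the same
# input frames and ISO-formatted timestamp strings; the name
# filter_events_vectorized is ours, not part of the assignment.
def filter_events_vectorized(events, indx_date, deliverables_path):
    merged = pd.merge(events, indx_date, on='patient_id')
    # Whole-column parses replace the per-row apply calls.
    ts = pd.to_datetime(merged['timestamp'])
    idx = pd.to_datetime(merged['indx_date'])
    mask = (ts >= idx - pd.Timedelta(days=2000)) & (ts <= idx)
    filtered = merged.loc[mask, ['patient_id', 'event_id', 'value']]
    filtered.to_csv(deliverables_path + 'etl_filtered_events.csv', index=False)
    return filtered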
def record_length_metrics(events, mortality):
    '''
    Record length is the duration (in days) between the first event and the
    last event for a given patient. Computes min/max/avg record length
    separately for deceased and alive patients.
    '''
    # Split events by mortality status.
    event_dead = events[events['patient_id'].isin(mortality['patient_id'])]
    event_alive = events[~events['patient_id'].isin(mortality['patient_id'])]

    # First and last event timestamps per deceased patient
    # (ISO date strings sort correctly as text).
    dead_grouped = event_dead.sort_values('timestamp').groupby(['patient_id'])
    dead = dead_grouped.first().reset_index()[['patient_id', 'timestamp']]
    dead['last'] = dead_grouped.last().reset_index()['timestamp']
    dead['diff'] = dead.apply(
        lambda x: (date_convert(x['last']) - date_convert(x['timestamp'])).days,
        axis=1)

    # First and last event timestamps per alive patient.
    alive_grouped = event_alive.sort_values('timestamp').groupby(['patient_id'])
    alive = alive_grouped.first().reset_index()[['patient_id', 'timestamp']]
    alive['last'] = alive_grouped.last().reset_index()['timestamp']
    alive['diff'] = alive.apply(
        lambda x: (date_convert(x['last']) - date_convert(x['timestamp'])).days,
        axis=1)

    avg_dead_rec_len = dead['diff'].mean()
    max_dead_rec_len = dead['diff'].max()
    min_dead_rec_len = dead['diff'].min()
    avg_alive_rec_len = alive['diff'].mean()
    max_alive_rec_len = alive['diff'].max()
    min_alive_rec_len = alive['diff'].min()

    return min_dead_rec_len, max_dead_rec_len, avg_dead_rec_len, \
        min_alive_rec_len, max_alive_rec_len, avg_alive_rec_len
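# A minimal smoke test for record_length_metrics with toy frames. The column
# layout mirrors events.csv / mortality_events.csv, but the values are made
# up for illustration; assumes utils.date_convert is importable.
toy_events = pd.DataFrame({
    'patient_id': [1, 1, 2, 2],
    'event_id':   ['DRUG1', 'LAB1', 'DRUG2', 'LAB2'],
    'value':      [1.0, 1.0, 1.0, 1.0],
    'timestamp':  ['2010-01-01', '2010-03-01', '2011-01-01', '2011-01-11'],
})
toy_mortality = pd.DataFrame({
    'patient_id': [2], 'timestamp': ['2011-02-01'], 'label': [1]})

# Patient 2 (deceased) spans 10 days and patient 1 (alive) spans 59 days,
# so this prints (10, 10, 10.0, 59, 59, 59.0).
print(record_length_metrics(toy_events, toy_mortality))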
def filter_events(events, indx_date, deliverables_path):
    '''
    Refer to instructions in Q3 a

    Steps:
    1. Join indx_date with events on patient_id
    2. Filter events occurring in the observation window
       (indx_date - 2000 days to indx_date, inclusive)

    Saves filtered_events to deliverables_path + 'etl_filtered_events.csv'.
    Each row is of the form patient_id, event_id, value and the csv file
    has a header. Returns filtered_events.
    '''
    # Join each event with its patient's index date.
    A = pd.merge(events, indx_date, how='outer', on='patient_id')

    # Parse timestamp strings into datetimes so they compare against
    # the datetime-valued indx_date column.
    A.timestamp = A.timestamp.apply(lambda x: utils.date_convert(x))

    # Keep events inside the inclusive observation window.
    filtered_events = A[(A['timestamp'] <= A['indx_date']) &
                        (A['timestamp'] >= A['indx_date'] - timedelta(days=2000))]

    # sort_values returns a new frame; the result must be reassigned.
    filtered_events = filtered_events.sort_values('patient_id', ascending=True)

    filtered_events.to_csv(deliverables_path + 'etl_filtered_events.csv',
                           columns=['patient_id', 'event_id', 'value'],
                           index=False)
    return filtered_events
def calculate_index_date(events, mortality, deliverables_path):
    '''
    Refer to instructions in Q3 a

    Steps:
    1. Create the list of alive patients (mortality_events.csv only contains
       information about deceased patients)
    2. Split events into two groups based on whether the patient is alive
       or deceased
    3. Calculate the index date for each patient: the last event date for
       alive patients, the death date minus 30 days for deceased patients

    Saves indx_date to deliverables_path + 'etl_index_dates.csv'.
    Each row is of the form patient_id, indx_date and the csv file has a
    header. Returns indx_date.
    '''
    # Alive patients are those that never appear in the mortality table.
    patients = np.unique(events["patient_id"].values.ravel())
    dead_patients = np.unique(mortality["patient_id"].values.ravel())
    alive_patients = np.setdiff1d(patients, dead_patients)

    # For alive patients, the index date is the date of the last event.
    alive_events = events.loc[events["patient_id"].isin(alive_patients)]
    indx_alive_dates = alive_events.groupby(
        ["patient_id"], as_index=False)[["timestamp"]].max()
    indx_alive_dates["timestamp"] = indx_alive_dates.timestamp.apply(
        lambda x: utils.date_convert(x))
    indx_alive_dates["timestamp"] = indx_alive_dates["timestamp"].apply(
        lambda x: x.strftime('%Y-%m-%d'))

    # For deceased patients, the index date is 30 days before the death date.
    indx_dead_dates = pd.DataFrame()
    indx_dead_dates["patient_id"] = mortality["patient_id"]
    indx_dead_dates["timestamp"] = mortality.timestamp.apply(
        lambda x: (datetime.strptime(x, "%Y-%m-%d")
                   - timedelta(days=30)).strftime("%Y-%m-%d"))

    indx_date = pd.concat([indx_alive_dates, indx_dead_dates]).reset_index(drop=True)
    indx_date.columns = ["patient_id", "indx_date"]

    indx_date.to_csv(deliverables_path + 'etl_index_dates.csv', index=False)

    # Return datetime values so downstream filtering can do date arithmetic.
    indx_date['indx_date'] = indx_date.indx_date.apply(
        lambda x: datetime.strptime(x, "%Y-%m-%d"))
    return indx_date
def calculate_index_date(events, mortality, deliverables_path):
    '''
    Refer to instructions in Q3 a

    Steps:
    1. Create the list of patients alive (mortality_events.csv only contains
       information about deceased patients)
    2. Split events into two groups based on whether the patient is alive
       or deceased
    3. Calculate the index date for each patient

    Saves indx_date to deliverables_path + 'etl_index_dates.csv'.
    Each row is of the form patient_id, indx_date and the csv file has a
    header. Returns indx_date.
    '''
    # The outer join tags each event row with the mortality label (1 = dead).
    A = pd.merge(events, mortality, how='outer', on='patient_id')
    list_dead = A[A.label == 1]
    list_alive = A[A.label != 1]  # label is NaN for alive patients

    # Alive: index date is the last event date (timestamp_x is the event date).
    event_alive = list_alive.groupby('patient_id')['timestamp_x'].apply(
        lambda x: utils.date_convert(x.max()).date())

    # Dead: index date is 30 days before death (timestamp_y is the death date).
    event_dead = list_dead.groupby('patient_id')['timestamp_y'].apply(
        lambda x: utils.date_offset(x.max(), -30).date())

    df_alive = pd.DataFrame({'patient_id': event_alive.index,
                             'indx_date': event_alive.values})
    df_dead = pd.DataFrame({'patient_id': event_dead.index,
                            'indx_date': event_dead.values})
    indx_date = pd.concat([df_alive, df_dead])

    # sort_values returns a new frame; reassign so the csv comes out ordered.
    indx_date = indx_date.sort_values(['patient_id', 'indx_date'],
                                      ascending=[True, True])

    indx_date.to_csv(deliverables_path + 'etl_index_dates.csv',
                     columns=['patient_id', 'indx_date'], index=False)
    return indx_date
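# The snippets call utils.date_convert and utils.date_offset, but the utils
# module itself is not shown in this section. A plausible reconstruction,
# assuming timestamps are '%Y-%m-%d' strings (the format calculate_index_date
# above parses explicitly); treat this as a sketch, not the assignment's code.
def date_convert(date_str):
    # Parse an ISO 'YYYY-MM-DD' string into a datetime.
    return datetime.strptime(date_str, '%Y-%m-%d')

def date_offset(date_str, no_days):
    # Shift a 'YYYY-MM-DD' date by no_days days (negative moves it earlier).
    return date_convert(date_str) + timedelta(days=no_days)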
def filter_events(events, indx_date, deliverables_path):
    '''
    Steps:
    1. Join indx_date with events on patient_id
    2. Filter events occurring in the observation window
       (indx_date - 2000 days to indx_date, inclusive)

    Saves filtered_events to deliverables_path + 'etl_filtered_events.csv'.
    Each row is of the form patient_id, event_id, value and the csv file
    has a header. Returns filtered_events.
    '''
    # Join each event with its patient's index date.
    merged = pd.merge(events, indx_date, how='outer', on='patient_id')

    # Days between the index date and the event; a value in 0..2000 means
    # the event falls inside the observation window.
    merged['diff'] = merged.apply(
        lambda x: (x['indx_date'] - date_convert(x['timestamp'])).days, axis=1)
    filtered = merged[(merged['diff'] <= 2000) & (merged['diff'] >= 0)]

    filtered_events = filtered[['patient_id', 'event_id', 'value']]
    filtered_events.to_csv(deliverables_path + 'etl_filtered_events.csv',
                           columns=['patient_id', 'event_id', 'value'],
                           index=False)
    return filtered_events
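# End-to-end usage sketch: compute index dates, then filter events down to
# each patient's observation window. The csv paths are hypothetical
# stand-ins for the assignment's data layout.
deliverables_path = '../deliverables/'
events = pd.read_csv('../data/train/events.csv')
mortality = pd.read_csv('../data/train/mortality_events.csv')

indx_date = calculate_index_date(events, mortality, deliverables_path)
filtered_events = filter_events(events, indx_date, deliverables_path)
print(filtered_events.head())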