def clean_epic_notes(EPIC, EPIC_cc, notes_cols, data_path, save_path=None, save_index=False):
    '''
    Fill in missing notes by CC and clean notes to fit the BERT format.
    Input :
        EPIC = [DataFrame] Data containing notes.
        EPIC_cc = [DataFrame] Must contain "CC".
        notes_cols = [list] Column names of the notes to be cleaned.
        data_path = [str] Path to the raw EPIC data (must contain notes).
        save_path = [str] Path to save the cleaned notes.
        save_index = [bool] Whether to save the index of the cleaned notes.
    Output:
        EPIC with notes cleaned.
    '''
    print("\nStart cleaning notes ...")
    # Clean each notes column
    for col in notes_cols:
        print("Cleaning {}".format(col))
        EPIC.loc[:, col] = list(map(clean_text, EPIC[col]))
    # Save data
    EPIC.to_csv(save_path + 'EPIC_triage.csv', index=False)
    # Reload the data to convert empty notes "" to NaN
    EPIC = pd.read_csv(save_path + 'EPIC_triage.csv')
    # Fill in missing values
    EPIC = fill_missing_text(EPIC, EPIC_cc, notes_cols)
    # Save imputed text
    EPIC.to_csv(save_path + 'EPIC_triage.csv', index=False)
    # Further preprocessing
    preprocessor = EPICPreprocess.Preprocess(data_path)
    _, _, _, EPIC_arrival = preprocessor.streamline()
    # Remove the obvious outliers
    EPIC = EPIC.loc[EPIC_arrival.index, :]
    # Add time variable
    EPIC = pd.concat([EPIC, EPIC_arrival["Arrived"].astype(int)], axis=1)
    # Get time span
    time_span = EPIC['Arrived'].unique().tolist()
    # Save data
    if save_path is not None:
        pickle.dump(time_span, open(save_path + "time_span", "wb"))
        if not save_index:
            EPIC.to_csv(save_path + 'EPIC.csv', index=False)
        else:
            EPIC.to_csv(save_path + 'EPIC.csv', index=True)
    return EPIC
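# NOTE: clean_text and fill_missing_text are helper functions defined elsewhere
# in the repository. The sketch below is a minimal, assumption-based illustration
# of what they might do (lower-casing and stripping characters so the notes fit
# the BERT input format, and back-filling empty notes from the chief complaint
# column "CC"); the actual implementations may differ.
import re

import pandas as pd


def clean_text_sketch(note):
    # Assumed behaviour: lower-case, keep basic punctuation, collapse whitespace.
    if pd.isna(note):
        return ""
    note = str(note).lower()
    note = re.sub(r"[^a-z0-9.,;:' ]", " ", note)
    return re.sub(r"\s+", " ", note).strip()


def fill_missing_text_sketch(EPIC, EPIC_cc, notes_cols):
    # Assumed behaviour: replace empty or missing notes with the chief complaint.
    for col in notes_cols:
        missing = EPIC[col].isna() | (EPIC[col] == "")
        EPIC.loc[missing, col] = EPIC_cc.loc[missing, "CC"]
    return EPIC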
# Columns to drop (the earlier lists are alternatives; only the last assignment takes effect)
drop_cols = [
    'Distance.To.Sick.Kids', 'Distance.To.Walkin', 'Distance.To.Hospital',
    'Systolic', 'First.ED.Provider', 'Last.ED.Provider',
    'ED.Longest.Attending.ED.Provider'
]
drop_cols = ['Name.Of.Walkin', 'Day.of.Arrival']
drop_cols = [
    'First.ED.Provider', 'Last.ED.Provider',
    'ED.Longest.Attending.ED.Provider', 'Admitting.Provider',
    'Name.Of.Hospital'
]
# Features to remove (only the last assignment takes effect)
rm_features = ['Systolic', 'Day.of.Arrival_Monday', 'Gender_M']
rm_features = []

preprocessor = EPICPreprocess.Preprocess(DATA_PATH, drop_cols=drop_cols)
EPIC, EPIC_enc, EPIC_CUI, EPIC_arrival = preprocessor.streamline()

# Get numerical columns (for later transformation)
num_cols = preprocessor.which_numerical(EPIC)
num_cols.remove("Primary.Dx")

# Get time span
time_span = EPIC_arrival['Arrived'].unique().tolist()


# ----------------------------------------------------
# ========= 2.a. One-month ahead prediction =========
print("====================================")
print("Dynamically evaluate the model ...\n")

j = 0
# Path set-up
FIG_PATH = "../../results/neural_net/"
DATA_PATH = "../../data/EPIC_DATA/preprocessed_EPIC_with_dates_and_notes.csv"
RAW_DATA_PATH = "../../data/EPIC_DATA/EPIC.csv"

# Create folder if not already exist
if not os.path.exists(FIG_PATH):
    os.makedirs(FIG_PATH)


# ----------------------------------------------------
# ========= 1.i. Further preprocessing =========
preprocessor = EPICPreprocess.Preprocess(DATA_PATH)
EPIC, EPIC_enc, EPIC_CUI, EPIC_arrival = preprocessor.streamline()

# Get numerical columns (for later transformation)
num_cols = preprocessor.which_numerical(EPIC)
num_cols.remove("Primary.Dx")

# Get time span
time_span = EPIC_arrival['Arrived'].unique().tolist()


# ----------------------------------------------------
# ========= 1.ii. Append arrival date =========
EPIC_raw = pd.read_csv(RAW_DATA_PATH, encoding="ISO-8859-1")
date = pd.to_datetime(EPIC_raw["Arrived"]).loc[EPIC_arrival.index]
# Change name to differentiate from Arrived
RAW_TEXT_PATH = "../../data/EPIC_DATA/EPIC.csv"
RAW_SAVE_DIR = FIG_PATH + "Raw_Notes/"
# This is where BERT will look for pre-trained models to load parameters from.
CACHE_DIR = '../../ClinicalBert/pretrained_bert_tf/biobert_pretrain_output_all_notes_150000/'


# ----------------------------------------------------
# Create folder to save raw text data if not exist
if not os.path.exists(RAW_SAVE_DIR):
    os.makedirs(RAW_SAVE_DIR)


# ----------------------------------------------------
# Prepare train and test sets
# Load file
EPIC_original = pd.read_csv(RAW_TEXT_PATH, encoding="ISO-8859-1")
preprocessor = EPICPreprocess.Preprocess(path=RAW_TEXT_PATH)
EPIC_original = preprocessor.BinarizeSepsis(EPIC_original)

# Only keep text columns and label
notes_cols = ["Note.Data_ED.Triage.Notes"]
EPIC = EPIC_original[["Primary.Dx"] + notes_cols]


# ----------------------------------------------------
# ========= 1. Further preprocessing =========
# Clean the file if not already done
if not os.path.exists(RAW_SAVE_DIR + "EPIC.csv"):
    _ = clean_epic_notes(EPIC=EPIC,
                         EPIC_cc=EPIC_original,
                         notes_cols=notes_cols,
                         data_path=DATA_PATH,
                         save_path=RAW_SAVE_DIR)
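# NOTE: the excerpts shown here do not include the code that consumes CACHE_DIR.
# As a hedged illustration only: if the TensorFlow checkpoint above has been
# converted to the Hugging Face PyTorch format (pytorch_model.bin, config.json,
# vocab.txt), it could be loaded roughly as below. The class names and
# num_labels value are assumptions, not taken from the original code.
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained(CACHE_DIR)
model = BertForSequenceClassification.from_pretrained(CACHE_DIR, num_labels=2)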
# parser = setup_parser()
# args = parser.parse_args()


# ----------------------------------------------------
# Path to save figures
fig_path = "/".join(os.getcwd().split("/")[:3]) + "/Pictures/logistic_regression/"
data_path = "/home/xingliu/Documents/ED/data/EPIC_DATA/preprocessed_EPIC_with_dates_and_notes.csv"

# Create folder if not already exist
if not os.path.exists(fig_path):
    os.makedirs(fig_path)


# ----------------------------------------------------
# ========= 1. Further preprocessing =========
preprocessor = EPICPreprocess.Preprocess(data_path)
EPIC, EPIC_enc, EPIC_CUI, EPIC_arrival = preprocessor.streamline()

# Get numerical columns (for later transformation)
num_cols = preprocessor.which_numerical(EPIC)
num_cols.remove("Primary.Dx")
num_cols.remove("Will.Return")

# Get time span
time_span = EPIC_arrival['Arrived'].unique().tolist()


# ----------------------------------------------------
# ========= 2.a. One-month ahead prediction =========
print("====================================")
print("Dynamically evaluate the model ...\n")
def prepare_data(data_path, mode, random_seed, validation_size=0, clean_notes=True):
    RANDOM_SEED = random_seed
    MODE = mode  # MODE = "a"
    VALID_SIZE = validation_size

    # Path set-up
    # FIG_PATH = "../../../results/stacked_model/"
    FIG_PATH = data_path
    RAW_TEXT_PATH = "../../../data/EPIC_DATA/EPIC.csv"
    DATA_PATH = "../../../data/EPIC_DATA/preprocessed_EPIC_with_dates_and_notes.csv"
    FIG_ROOT_PATH = FIG_PATH + f"dynamic_{MODE}/"
    RAW_SAVE_DIR = FIG_PATH + "Raw_Notes/"
    CLEAN_NOTES = clean_notes

    # Create folder if not already exist
    if not os.path.exists(FIG_PATH):
        os.makedirs(FIG_PATH)

    # ----------------------------------------------------
    # ========= 1.i. Further preprocessing =========
    # Create folder if not already exist
    if not os.path.exists(RAW_SAVE_DIR):
        os.makedirs(RAW_SAVE_DIR)

    preprocessor = EPICPreprocess.Preprocess(DATA_PATH)
    EPIC, EPIC_enc, EPIC_CUI, EPIC_arrival = preprocessor.streamline()

    # Get numerical columns (for later transformation)
    num_cols = preprocessor.which_numerical(EPIC)
    num_cols.remove("Primary.Dx")

    # Get time span
    time_span = EPIC_arrival['Arrived'].unique().tolist()

    # ========= 1.ii. Clean text data =========
    # Text data
    EPIC_original = pd.read_csv(RAW_TEXT_PATH, encoding='ISO-8859-1')
    preprocessor = EPICPreprocess.Preprocess(path=RAW_TEXT_PATH)
    EPIC_original = preprocessor.BinarizeSepsis(EPIC_original)

    # Only keep text columns and label
    notes_cols = ['Note.Data_ED.Triage.Notes']
    EPIC = EPIC_original[['Primary.Dx'] + notes_cols]

    if CLEAN_NOTES:
        # Clean texts
        EPIC_text = clean_epic_notes(EPIC=EPIC,
                                     EPIC_cc=EPIC_original,
                                     notes_cols=notes_cols,
                                     data_path=DATA_PATH,
                                     save_path=RAW_SAVE_DIR,
                                     save_index=True)
        print("Cleaned text saved to {}".format(RAW_SAVE_DIR))
    else:
        # Load previously cleaned data
        print("Loading cleaned text from {}".format(RAW_SAVE_DIR))
        EPIC_text = pd.read_csv(RAW_SAVE_DIR + "EPIC.csv")
        # Assign index back
        EPIC_text.index = EPIC_text.iloc[:, 0]
        EPIC_text = EPIC_text.drop(EPIC_text.columns[0], axis=1)
        time_span = pickle.load(open(RAW_SAVE_DIR + "time_span", "rb"))

    discrepancy = (EPIC_enc.index != EPIC_text.index).sum()
    if discrepancy != 0:
        raise Warning(
            "EPIC numerics and text data do not match! Number of unmatched cases: {}"
            .format(discrepancy))

    # ========= 1.iii. Prepare train/test/validation sets =========
    # Splitting data by month
    for j, time in enumerate(time_span[2:-1]):
        # Month to be predicted
        time_pred = time_span[j + 3]

        # Create folders if not already exist
        DYNAMIC_PATH = FIG_ROOT_PATH + f"{time_pred}/"
        NUMERICS_DATA_PATH = DYNAMIC_PATH + "numerical_data/"
        print(NUMERICS_DATA_PATH)
        for path in [DYNAMIC_PATH, NUMERICS_DATA_PATH]:
            if not os.path.exists(path):
                os.makedirs(path)

        # Create BERT folders if not already exist
        # FIG_ROOT_PATH = FIG_ROOT_PATH + f"{TASK_NAME}/"
        OUTPUT_DIR = DYNAMIC_PATH + 'Saved_Checkpoints/'
        REPORTS_DIR = DYNAMIC_PATH + "Reports/"
        PROCESSED_NOTES_DIR = DYNAMIC_PATH + "Processed_Texts/"
        for path in [DYNAMIC_PATH, OUTPUT_DIR, REPORTS_DIR, PROCESSED_NOTES_DIR]:
            if not os.path.exists(path):
                os.makedirs(path)

        # Valid set for the first 3 months
        if j == 0:
            # Prepare train/test/valid sets
            # Skip the validation set if it is not requested
            if VALID_SIZE == 0:
                XTrain, XTest, yTrain, yTest = splitter(EPIC_arrival,
                                                        num_cols,
                                                        MODE,
                                                        time_threshold=time,
                                                        test_size=None,
                                                        EPIC_CUI=EPIC_CUI,
                                                        seed=RANDOM_SEED)
                print("Saving data up to {} ...".format(time))
                print(
                    "Train size: {}. Test size: {}. Sepsis cases in [train, test]: [{}, {}]."
                    .format(yTrain.shape, yTest.shape, yTrain.sum(), yTest.sum()))
            else:
                XTrain, XTest, XValid, yTrain, yTest, yValid = splitter(
                    EPIC_arrival,
                    num_cols,
                    MODE,
                    time_threshold=time,
                    test_size=None,
                    valid_size=VALID_SIZE,
                    EPIC_CUI=EPIC_CUI,
                    seed=RANDOM_SEED)
                print("Saving data up to {} ...".format(time))
                print(
                    "Train size: {}. Test size: {}. Validation size: {}. Sepsis cases in [train, test, valid]: [{}, {}, {}]."
                    .format(yTrain.shape, yTest.shape, len(yValid),
                            yTrain.sum(), yTest.sum(), yValid.sum()))
                # Get validation index
                valid_index = XValid.index
                # Get text data
                XValidText = EPIC_text.loc[valid_index, :]
                valid_bert = create_bert_data(
                    x_data=XValidText["Note.Data_ED.Triage.Notes"],
                    y_data=yValid,
                    save_path=PROCESSED_NOTES_DIR + "valid.tsv")
                # Save numerics data
                XValid.to_csv(NUMERICS_DATA_PATH + "x_valid.csv", index=False)
                yValid.to_csv(NUMERICS_DATA_PATH + "y_valid.csv",
                              index=False, header=True)
            # Labels for the text set
            yTrainText = yTrain
        else:
            XTrain, XTest, yTrain, yTest = splitter(EPIC_arrival,
                                                    num_cols,
                                                    MODE,
                                                    time_threshold=time,
                                                    test_size=None,
                                                    EPIC_CUI=EPIC_CUI,
                                                    seed=RANDOM_SEED)
            print("Saving data up to {} ...".format(time))
            print(
                "Train size: {}. Test size: {}. Sepsis cases in [train, test]: [{}, {}]."
                .format(yTrain.shape, yTest.shape, yTrain.sum(), yTest.sum()))
            # Set text train data to the previous month
            XTrainText = XTrainTextOld
            yTrainText = yTrainTextOld

        # Save train and test sets
        train_index = XTrain.index
        test_index = XTest.index
        XTrainText = EPIC_text.loc[train_index, :]
        XTestText = EPIC_text.loc[test_index, :]

        # Save text data
        train_bert = create_bert_data(
            x_data=XTrainText["Note.Data_ED.Triage.Notes"],
            y_data=yTrainText,
            save_path=PROCESSED_NOTES_DIR + "train.tsv")
        test_bert = create_bert_data(
            x_data=XTestText["Note.Data_ED.Triage.Notes"],
            y_data=yTest,
            save_path=PROCESSED_NOTES_DIR + "dev.tsv")

        # Save numerics data
        XTrain.to_csv(NUMERICS_DATA_PATH + "x_train.csv", index=False)
        yTrain.to_csv(NUMERICS_DATA_PATH + "y_train.csv", index=False, header=True)
        XTest.to_csv(NUMERICS_DATA_PATH + "x_test.csv", index=False)
        yTest.to_csv(NUMERICS_DATA_PATH + "y_test.csv", index=False, header=True)

        # Only store text set of the previous month to save time in training the BERT
        XTrainTextOld = XTestText
        yTrainTextOld = yTest
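# NOTE: create_bert_data and splitter are repository helpers whose definitions
# are not shown in these excerpts. The sketch below is an assumption-based
# illustration of the usual pattern for create_bert_data: writing label/text
# pairs to a .tsv file that BERT fine-tuning scripts can read. The exact column
# layout used by the real helper may differ.
import pandas as pd


def create_bert_data_sketch(x_data, y_data, save_path=None):
    # Assumed layout: guid, label, dummy column, text (run_classifier-style TSV).
    df = pd.DataFrame({
        "id": range(len(x_data)),
        "label": y_data.values,
        "alpha": ["a"] * len(x_data),
        "text": x_data.values,
    })
    if save_path is not None:
        df.to_csv(save_path, sep="\t", index=False, header=False)
    return df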
CLEAN_NOTES = False


# ----------------------------------------------------
# Create folder to save raw text data if not exist
if not os.path.exists(RAW_SAVE_DIR):
    os.makedirs(RAW_SAVE_DIR)


# ----------------------------------------------------
# Prepare train and test sets
EPIC_original = pd.read_csv(TEXT_DATA_PATH, encoding='ISO-8859-1')
preprocessor = EPICPreprocess.Preprocess(path=TEXT_DATA_PATH)
EPIC_original = preprocessor.BinarizeSepsis(EPIC_original)

# Only keep text columns and label
notesCols = ['Note.Data_ED.Triage.Notes']
EPIC = EPIC_original[['Primary.Dx'] + notesCols]


# ----------------------------------------------------
# ========= 1. Further preprocessing =========
if CLEAN_NOTES:
    # Loop over each file and write to a csv
    print("\nStart cleaning notes ...")
    # Clean text