示例#1
0
def clean_epic_notes(EPIC,
                     EPIC_cc,
                     notes_cols,
                     data_path,
                     save_path=None,
                     save_index=False):
    '''
    Clean the notes, fill in missing notes and align them with the
    streamlined (preprocessed) data.

    Input : EPIC = [DataFrame] Data containing notes. The notes columns are
                   overwritten in place with the cleaned text.
            EPIC_cc = [DataFrame] Must contain "CC". Passed to
                      fill_missing_text to impute the missing notes.
            notes_cols = [list] Column name of the notes to be cleaned.
            data_path = [str] path to the raw EPIC data (must contain notes).
            save_path = [str] path (prefix) to save the cleaned notes. If
                        None, nothing is written to disk.
            save_index = [bool] whether to save the index of the cleaned notes.
    Output:
            EPIC with notes cleaned, restricted to the rows kept by
            EPICPreprocess.Preprocess(data_path).streamline(), with an
            integer "Arrived" column appended.
    '''
    # Local import so that the block does not rely on a file-level import.
    import io

    print("\nStart cleaning notes ...")

    # Clean text
    for col in notes_cols:
        print("Cleaning {}".format(col))
        EPIC.loc[:, col] = list(map(clean_text, EPIC[col]))

    # Round-trip through csv nonetheless to convert empty notes "" to nan.
    # Without a save_path the round-trip is done in memory, so the default
    # save_path=None no longer fails on `None + str`.
    if save_path is not None:
        triage_path = save_path + 'EPIC_triage.csv'
        EPIC.to_csv(triage_path, index=False)
        EPIC = pd.read_csv(triage_path)
    else:
        triage_path = None
        buffer = io.StringIO()
        EPIC.to_csv(buffer, index=False)
        buffer.seek(0)
        EPIC = pd.read_csv(buffer)

    # Fill in missing vals
    EPIC = fill_missing_text(EPIC, EPIC_cc, notes_cols)

    # Save imputed text
    if triage_path is not None:
        EPIC.to_csv(triage_path, index=False)

    # Further preprocessing
    preprocessor = EPICPreprocess.Preprocess(data_path)
    _, _, _, EPIC_arrival = preprocessor.streamline()

    # Keep only the rows retained by streamline() (removes the obvious
    # outliers). Relies on the csv round-trip above having reset the index;
    # presumably EPIC_arrival uses the same row labels -- TODO confirm.
    EPIC = EPIC.loc[EPIC_arrival.index, :]

    # Add time variable
    EPIC = pd.concat([EPIC, EPIC_arrival["Arrived"].astype(int)], axis=1)

    # Get time span
    time_span = EPIC['Arrived'].unique().tolist()

    # Save data
    if save_path is not None:
        # Context manager so that the file handle is closed after dumping.
        with open(save_path + "time_span", "wb") as handle:
            pickle.dump(time_span, handle)
        EPIC.to_csv(save_path + 'EPIC.csv', index=bool(save_index))
    return EPIC
示例#2
0
# Columns dropped by the preprocessor. Other column sets (distance columns,
# 'Systolic', 'Name.Of.Walkin', 'Day.of.Arrival') used to be assigned to
# drop_cols first, but they were overwritten before being used; only the
# effective list is kept.
drop_cols = [
    'First.ED.Provider', 'Last.ED.Provider',
    'ED.Longest.Attending.ED.Provider', 'Admitting.Provider',
    'Name.Of.Hospital'
]
# Effective value: an earlier non-empty list
# (['Systolic', 'Day.of.Arrival_Monday', 'Gender_M']) was overwritten by [].
rm_features = []

preprocessor = EPICPreprocess.Preprocess(DATA_PATH, drop_cols=drop_cols)
EPIC, EPIC_enc, EPIC_CUI, EPIC_arrival = preprocessor.streamline()

# Get numerical columns (for later transformation)
num_cols = preprocessor.which_numerical(EPIC)
# "Primary.Dx" is excluded from the columns to be transformed
# (presumably because it is the label -- confirm).
num_cols.remove("Primary.Dx")

# Get time span: the distinct values of the "Arrived" column
time_span = EPIC_arrival['Arrived'].unique().tolist()

# ----------------------------------------------------
# ========= 2.a. One-month ahead prediction =========
print("====================================")
print("Dynamically evaluate the model ...\n")

j = 0
示例#3
0

# Path set-up
# FIG_PATH is the folder created below (presumably where the neural net
# results are written); DATA_PATH is the csv handed to EPICPreprocess;
# RAW_DATA_PATH is the raw csv read in 1.ii for the arrival dates.
FIG_PATH = "../../results/neural_net/"
DATA_PATH = "../../data/EPIC_DATA/preprocessed_EPIC_with_dates_and_notes.csv"
RAW_DATA_PATH = "../../data/EPIC_DATA/EPIC.csv"


# Create folder if not already exist
if not os.path.exists(FIG_PATH):
    os.makedirs(FIG_PATH)


# ----------------------------------------------------
# ========= 1.i. Further preprocessing =========
preprocessor = EPICPreprocess.Preprocess(DATA_PATH)
EPIC, EPIC_enc, EPIC_CUI, EPIC_arrival = preprocessor.streamline()

# Get numerical columns (for later transformation)
num_cols = preprocessor.which_numerical(EPIC)
# "Primary.Dx" is excluded from the columns to be transformed
# (presumably because it is the label -- confirm).
num_cols.remove("Primary.Dx")

# Get time span: the distinct values of the "Arrived" column
time_span = EPIC_arrival['Arrived'].unique().tolist()


# ----------------------------------------------------
# ========= 1.ii. Append arrival date =========
# The raw file is read with ISO-8859-1 encoding.
EPIC_raw = pd.read_csv(RAW_DATA_PATH, encoding = "ISO-8859-1")
# Parse the raw "Arrived" column as datetimes and keep only the rows whose
# labels appear in EPIC_arrival's index. Assumes the default row index of
# the raw csv matches EPIC_arrival's index labels -- TODO confirm.
date = pd.to_datetime(EPIC_raw["Arrived"]).loc[EPIC_arrival.index]
# Change name to differentiate from Arrived
示例#4
0
# Raw csv containing the notes, and the folder where the cleaned notes are
# saved. FIG_PATH and DATA_PATH (used below) are defined outside this excerpt.
RAW_TEXT_PATH = "../../data/EPIC_DATA/EPIC.csv"
RAW_SAVE_DIR = FIG_PATH + "Raw_Notes/"

# This is where BERT will look for pre-trained models to load parameters from.
CACHE_DIR = '../../ClinicalBert/pretrained_bert_tf/biobert_pretrain_output_all_notes_150000/'

# ----------------------------------------------------
# Create folder to save raw text data if not exist
if not os.path.exists(RAW_SAVE_DIR):
    os.makedirs(RAW_SAVE_DIR)

# ----------------------------------------------------
# Prepare train and test sets
# Load file (read with ISO-8859-1 encoding)
EPIC_original = pd.read_csv(RAW_TEXT_PATH, encoding="ISO-8859-1")
preprocessor = EPICPreprocess.Preprocess(path=RAW_TEXT_PATH)
# NOTE(review): BinarizeSepsis presumably turns "Primary.Dx" into a binary
# sepsis label -- confirm in EPICPreprocess.
EPIC_original = preprocessor.BinarizeSepsis(EPIC_original)

# Only keep text columns and label
notes_cols = ["Note.Data_ED.Triage.Notes"]
EPIC = EPIC_original[["Primary.Dx"] + notes_cols]

# ----------------------------------------------------
# ========= 1. Further preprocessing =========
# Clean the file if not already done: the presence of "EPIC.csv" in
# RAW_SAVE_DIR is used as the marker that cleaning has already been run.
# The returned frame is discarded here; only the saved files are needed.
if not os.path.exists(RAW_SAVE_DIR + "EPIC.csv"):
    _ = clean_epic_notes(EPIC=EPIC,
                         EPIC_cc=EPIC_original,
                         notes_cols=notes_cols,
                         data_path=DATA_PATH,
                         save_path=RAW_SAVE_DIR)
示例#5
0
# parser = setup_parser()
# args = parser.parse_args()

# ----------------------------------------------------
# Path to save figures
# The first three "/"-separated components of the working directory are kept
# (e.g. "/home/<user>" when run from below a home directory) and
# "/Pictures/logistic_regression/" is appended.
fig_path = "/".join(
    os.getcwd().split("/")[:3]) + "/Pictures/logistic_regression/"
# NOTE(review): absolute, machine-specific path -- the script only runs
# unchanged where this file exists.
data_path = "/home/xingliu/Documents/ED/data/EPIC_DATA/preprocessed_EPIC_with_dates_and_notes.csv"

# Create folder if not already exist
if not os.path.exists(fig_path):
    os.makedirs(fig_path)

# ----------------------------------------------------
# ========= 1. Further preprocessing =========
preprocessor = EPICPreprocess.Preprocess(data_path)
EPIC, EPIC_enc, EPIC_CUI, EPIC_arrival = preprocessor.streamline()

# Get numerical columns (for later transformation)
num_cols = preprocessor.which_numerical(EPIC)
# "Primary.Dx" and "Will.Return" are excluded from the columns to be
# transformed. Assuming num_cols is a list, remove() raises ValueError if
# either name is absent.
num_cols.remove("Primary.Dx")
num_cols.remove("Will.Return")

# Get time span: the distinct values of the "Arrived" column
time_span = EPIC_arrival['Arrived'].unique().tolist()

# ----------------------------------------------------
# ========= 2.a. One-month ahead prediction =========
print("====================================")
print("Dynamically evaluate the model ...\n")
示例#6
0
def prepare_data(data_path,
                 mode,
                 random_seed,
                 validation_size=0,
                 clean_notes=True):
    '''
    Prepare and save the month-by-month train/test sets (numerics and notes).

    For every month to be predicted, the numerical splits are saved as csv
    files under "numerical_data/" and the triage notes are saved through
    create_bert_data under "Processed_Texts/". For the first month the text
    train set is the whole train split; afterwards it is the test set of the
    previous month only, to save time in training the BERT.

    Input : data_path = [str] root folder (including the trailing "/") under
                        which all outputs are saved.
            mode = passed to splitter; also names the "dynamic_{mode}/"
                   output folder.
            random_seed = seed passed to splitter.
            validation_size = validation size passed to splitter for the
                              first month. 0 means no validation set.
            clean_notes = [bool] if True, clean the raw notes and save them
                          to "Raw_Notes/"; otherwise load the previously
                          cleaned notes and time span from that folder.
    Output:
            None. All data are written to disk.
    Raises:
            Warning if the index of the numerics differs from the index of
            the text data.
    '''
    # Path set-up (the input paths are relative to the working directory)
    RAW_TEXT_PATH = "../../../data/EPIC_DATA/EPIC.csv"
    DATA_PATH = "../../../data/EPIC_DATA/preprocessed_EPIC_with_dates_and_notes.csv"
    FIG_ROOT_PATH = data_path + f"dynamic_{mode}/"
    RAW_SAVE_DIR = data_path + "Raw_Notes/"
    # Create folders if not already exist
    for path in [data_path, RAW_SAVE_DIR]:
        if not os.path.exists(path):
            os.makedirs(path)
    # ----------------------------------------------------
    # ========= 1.i. Further preprocessing =========
    preprocessor = EPICPreprocess.Preprocess(DATA_PATH)
    EPIC, EPIC_enc, EPIC_CUI, EPIC_arrival = preprocessor.streamline()
    # Get numerical columns (for later transformation)
    num_cols = preprocessor.which_numerical(EPIC)
    num_cols.remove("Primary.Dx")
    # Get time span
    time_span = EPIC_arrival['Arrived'].unique().tolist()
    # ========= 1.ii. Clean text data =========
    # Text data
    EPIC_original = pd.read_csv(RAW_TEXT_PATH, encoding='ISO-8859-1')
    preprocessor = EPICPreprocess.Preprocess(path=RAW_TEXT_PATH)
    EPIC_original = preprocessor.BinarizeSepsis(EPIC_original)
    # Only keep text columns and label
    notes_cols = ['Note.Data_ED.Triage.Notes']
    EPIC = EPIC_original[['Primary.Dx'] + notes_cols]
    if clean_notes:
        # Clean texts
        EPIC_text = clean_epic_notes(EPIC=EPIC,
                                     EPIC_cc=EPIC_original,
                                     notes_cols=notes_cols,
                                     data_path=DATA_PATH,
                                     save_path=RAW_SAVE_DIR,
                                     save_index=True)
        print("Cleaned text saved to {}".format(RAW_SAVE_DIR))
    else:
        # Load data
        print("Loading cleaned text from {}".format(RAW_SAVE_DIR))
        EPIC_text = pd.read_csv(RAW_SAVE_DIR + "EPIC.csv")
        # Assign index back (it was saved as the first column)
        EPIC_text.index = EPIC_text.iloc[:, 0]
        EPIC_text = EPIC_text.drop(EPIC_text.columns[0], axis=1)
        # The pickle is the one written by clean_epic_notes; only load files
        # produced by this pipeline. The context manager closes the handle.
        with open(RAW_SAVE_DIR + "time_span", "rb") as handle:
            time_span = pickle.load(handle)
    # The numerics and the text data must describe the same cases
    discrepancy = (EPIC_enc.index != EPIC_text.index).sum()
    if discrepancy != 0:
        raise Warning(
            "EPIC numerics and text data do not match! Number of unmatched cases: {}"
            .format(discrepancy))
    # ========= 1.iii. Prepare train/test/validation sets =========
    # Splitting data by month
    for j, time in enumerate(time_span[2:-1]):
        # Month to be predicted
        time_pred = time_span[j + 3]
        # Output folders for this month
        DYNAMIC_PATH = FIG_ROOT_PATH + f"{time_pred}/"
        NUMERICS_DATA_PATH = DYNAMIC_PATH + "numerical_data/"
        print(NUMERICS_DATA_PATH)
        OUTPUT_DIR = DYNAMIC_PATH + 'Saved_Checkpoints/'
        REPORTS_DIR = DYNAMIC_PATH + "Reports/"
        PROCESSED_NOTES_DIR = DYNAMIC_PATH + "Processed_Texts/"
        # Create folders if not already exist
        for path in [
                DYNAMIC_PATH, NUMERICS_DATA_PATH, OUTPUT_DIR, REPORTS_DIR,
                PROCESSED_NOTES_DIR
        ]:
            if not os.path.exists(path):
                os.makedirs(path)
        # A validation set is only prepared for the first month, if required
        if j == 0 and validation_size != 0:
            XTrain, XTest, XValid, yTrain, yTest, yValid = splitter(
                EPIC_arrival,
                num_cols,
                mode,
                time_threshold=time,
                test_size=None,
                valid_size=validation_size,
                EPIC_CUI=EPIC_CUI,
                seed=random_seed)
            print("Saving data up to {} ...".format(time))
            print(
                "Train size: {}. Test size: {}. Validation size: {}. Sepsis cases in [train, test, valid]: [{}, {}, {}]."
                .format(yTrain.shape, yTest.shape, len(yValid),
                        yTrain.sum(), yTest.sum(), yValid.sum()))
            # Get text data of the validation cases
            XValidText = EPIC_text.loc[XValid.index, :]
            create_bert_data(
                x_data=XValidText["Note.Data_ED.Triage.Notes"],
                y_data=yValid,
                save_path=PROCESSED_NOTES_DIR + "valid.tsv")
            # Save numerics data
            XValid.to_csv(NUMERICS_DATA_PATH + "x_valid.csv", index=False)
            yValid.to_csv(NUMERICS_DATA_PATH + "y_valid.csv",
                          index=False,
                          header=True)
        else:
            XTrain, XTest, yTrain, yTest = splitter(EPIC_arrival,
                                                    num_cols,
                                                    mode,
                                                    time_threshold=time,
                                                    test_size=None,
                                                    EPIC_CUI=EPIC_CUI,
                                                    seed=random_seed)
            print("Saving data up to {} ...".format(time))
            print(
                "Train size: {}. Test size: {}. Sepsis cases in [train, test]: [{}, {}]."
                .format(yTrain.shape, yTest.shape, yTrain.sum(), yTest.sum()))
        # Text train set and its labels must always come from the same
        # cases: the whole train split for the first month (with or without
        # a validation set), the test set of the previous month afterwards.
        if j == 0:
            XTrainText = EPIC_text.loc[XTrain.index, :]
            yTrainText = yTrain
        else:
            XTrainText = XTrainTextOld
            yTrainText = yTrainTextOld
        XTestText = EPIC_text.loc[XTest.index, :]
        # Save text data
        create_bert_data(
            x_data=XTrainText["Note.Data_ED.Triage.Notes"],
            y_data=yTrainText,
            save_path=PROCESSED_NOTES_DIR + "train.tsv")
        create_bert_data(
            x_data=XTestText["Note.Data_ED.Triage.Notes"],
            y_data=yTest,
            save_path=PROCESSED_NOTES_DIR + "dev.tsv")
        # Save numerics data
        XTrain.to_csv(NUMERICS_DATA_PATH + "x_train.csv", index=False)
        yTrain.to_csv(NUMERICS_DATA_PATH + "y_train.csv",
                      index=False,
                      header=True)
        XTest.to_csv(NUMERICS_DATA_PATH + "x_test.csv", index=False)
        yTest.to_csv(NUMERICS_DATA_PATH + "y_test.csv",
                     index=False,
                     header=True)
        # Only store text set of the previous month to save time in training the BERT
        XTrainTextOld = XTestText
        yTrainTextOld = yTest
示例#7
0
# Switch for the cleaning step in section 1 below: the notes are only
# cleaned when this is True.
CLEAN_NOTES = False




# ----------------------------------------------------
# Create folder to save raw text data if not exist
# (RAW_SAVE_DIR and TEXT_DATA_PATH are defined outside this excerpt)
if not os.path.exists(RAW_SAVE_DIR):
    os.makedirs(RAW_SAVE_DIR)


# ----------------------------------------------------
# Prepare train and test sets
# The raw file is read with ISO-8859-1 encoding.
EPIC_original = pd.read_csv(TEXT_DATA_PATH, encoding = 'ISO-8859-1')
preprocessor = EPICPreprocess.Preprocess(path = TEXT_DATA_PATH)
# NOTE(review): BinarizeSepsis presumably turns "Primary.Dx" into a binary
# sepsis label -- confirm in EPICPreprocess.
EPIC_original = preprocessor.BinarizeSepsis(EPIC_original)


# Only keep text columns and label
notesCols = ['Note.Data_ED.Triage.Notes']
EPIC = EPIC_original[['Primary.Dx'] + notesCols]


# ----------------------------------------------------
# ========= 1. Further preprocessing =========

if CLEAN_NOTES:
    # Clean the notes (the remainder of this step is outside this excerpt)
    print("\nStart cleaning notes ...")
    # Clean text