Example #1
def build_meds_frame(ta, frame_name, in_path, overwrite = True):

    print "*** CREATING DRUG HISTORY FRAME at " + frame_name
    if (frame_name in ta.get_frame_names()):
        if (not overwrite):
            return ta.get_frame(frame_name)
        else:
            ta.drop_frames([frame_name])

    add_udf_files(ta, [clean_drugs, derived_features, history_utilities, uphs_fields, uphs_schema])

    data_fields = [ PATID, VISID, ADM_DATE,  MED_ORDER_NAMEs, DISCHARGE_MED_ORDER_NAMEs ]
    data_frame = loader.load_frame(ta, in_path, frame_name, data_fields)

    print "Filtering out rows with missing or junk patient ids and/or visit ids"

    data_frame.filter(lambda row: row[PATID] is not None and row[VISID] is not None and row[PATID] != 'None' and row[VISID] != 'None')
    data_defaults = {MED_ORDER_NAMEs : "", DISCHARGE_MED_ORDER_NAMEs: ""}

    print "Imputing missing drug fields with empty lists"
    impute_with_constants(data_frame, data_defaults)


    data_frame.add_columns(lambda row: row[MED_ORDER_NAMEs] + ", " + row[DISCHARGE_MED_ORDER_NAMEs], (UNCLEAN_COMBINED_MEDLIST, str))
    data_frame.drop_columns([MED_ORDER_NAMEs, DISCHARGE_MED_ORDER_NAMEs])

    data_frame.add_columns(lambda row: drug_cleaner.to_clean_doc(row[UNCLEAN_COMBINED_MEDLIST], medlist_delimiter), (COMBINED_MEDLIST, str))
    data_frame.drop_columns([UNCLEAN_COMBINED_MEDLIST])

    return data_frame
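
A minimal usage sketch for this example, assuming ta is the same connected analytics client used throughout these examples; the frame name and input path are placeholders chosen for illustration:

# Hypothetical call with placeholder arguments.
meds_frame = build_meds_frame(ta, 'meds_history', 'hdfs://namenode/data/uphs/raw_visits.json')
# The returned frame keeps PATID, VISID and ADM_DATE, plus the cleaned,
# comma-joined COMBINED_MEDLIST column built above.
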
Example #2
def create_crossval_labeled_patids(ta, frame_name, in_path, overwrite = True):
    fields = [PATID]

    print "****** Assigning train/test labels for cross validation... loading data"
    if (frame_name in ta.get_frame_names()):
        if (not overwrite):
            return ta.get_frame(frame_name)
        else:
            ta.drop_frames([frame_name])


    frame = load_frame(ta, in_path, frame_name, fields)

    print "****** Assigning train/test labels for cross validation... identiyfing patient population"
    add_udf_files(ta, [uphs_fields, filters])
    frame.filter(lambda row: patid_filter(row))
    frame.drop_duplicates()

    print "****** Assigning train/test labels for cross validation... assigning labels"
    frame.assign_sample(sample_percentages = [0.9, 0.1],
                        sample_labels = [TRAIN_LABEL, TEST_LABEL],
                        random_seed = 1776,
                        output_column = CROSS_VALIDATION_CLASS)

    return frame
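
As above, a hedged usage sketch, assuming ta is the same connected analytics client; the frame name and path are placeholders:

# Hypothetical call with placeholder arguments.
patid_labels = create_crossval_labeled_patids(ta, 'crossval_patids', 'hdfs://namenode/data/uphs/raw_visits.json')
# Each distinct, valid PATID now carries a CROSS_VALIDATION_CLASS column holding
# TRAIN_LABEL (90% of patients) or TEST_LABEL (10%), drawn with random_seed = 1776.
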
Example #3
def create_ground_truth(ta, frame_name, in_path, overwrite = False):

    add_udf_files(ta, [uphs_fields, history_utilities,  ground_truth_utilities,  derived_features, filters])

    if (frame_name in ta.get_frame_names()):
        if (not overwrite):
            return ta.get_frame(frame_name)
        else:
            ta.drop_frames([frame_name])

    fields = [PATID, VISID, ADMIT_TYPE, ADM_DATE, DISCHARGE_DATE]

    base_frame_name =  '__' + frame_name + '_uphs_demo_visit_types'
    jsonized_frame_name = '__' + frame_name + '_jsonized_visits'
    json_col_name = 'visit_history'

    print "*** CREATE GROUND TRUTH FRAME AT FRAME: " + frame_name

    print "******  loading data from raw json"
    base_frame = load_frame(ta, in_path, base_frame_name, fields)

    print "****** filtering out bad rows with missing information"
    base_frame.filter(lambda row: patid_filter(row) and visid_filter(row) and adm_date_filter(row) and emergency_admit_filter(row))


    print "****** creating visit history atomic records"
    jsonized_frame = history_features.jsonize(ta, base_frame, jsonized_frame_name, json_col_name, key_col_name = PATID, overwrite = overwrite)

    print "****** collecting visit history"
    # historized_frame is given the final output frame name, since all subsequent
    # changes are in-place; it is the frame that is returned as the ground truth frame

    ground_truth_history_delimiter = '|'
    historized_frame = history_features.historize(ta, jsonized_frame, frame_name, json_col_name,
                                                  PATID, delimiter = ground_truth_history_delimiter)


    print "****** calculating READMIT_30 and READMIT_90 scores"

    LABELED_VISIT_JSON = 'LABELED_VISIT_JSON'
    # each patient gets a list of visits and their readmit scores
    historized_frame.add_columns(lambda row: per_visit_readmit_scores(row[json_col_name]), (LABELED_VISIT_JSON, str))

    historized_frame.drop_columns([json_col_name])

    # per patient list of visit/label readmit scores collapsed to per patient-visit readmit records
    historized_frame.flatten_column(LABELED_VISIT_JSON, delimiter = ground_truth_history_delimiter)

    print "******  placing READMIT_30 and READMIT_90 data into desired tabular format"
    # expanding json records per visit into multiple columns
    historized_frame.add_columns(lambda row: labeled_visit_to_columns(row[LABELED_VISIT_JSON]), labeled_visit_schema)
    historized_frame.drop_columns([LABELED_VISIT_JSON])

    ta.drop_frames([base_frame, jsonized_frame])

    return historized_frame
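
A usage sketch under the same assumptions (connected analytics client ta, placeholder frame name and path):

# Hypothetical call with placeholder arguments; note that overwrite defaults to False here.
ground_truth = create_ground_truth(ta, 'readmit_ground_truth', 'hdfs://namenode/data/uphs/raw_visits.json')
# The result has one row per qualifying patient visit (emergency admissions with
# valid ids and dates), expanded into the columns of labeled_visit_schema,
# including the READMIT_30 and READMIT_90 labels.
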
Example #4
def get_basic_features(ta, frame_name, in_path, overwrite = True):

    if (frame_name in ta.get_frame_names()):
        if (not overwrite):
            return ta.get_frame(frame_name)
        else:
            ta.drop_frames([frame_name])

    frame = loader.load_frame(ta, in_path, frame_name, data_columns)

    add_udf_files(ta, [uphs_fields, filters])

    frame.filter(lambda row: patid_filter(row) and visid_filter(row))

    impute_with_constants(frame, default_values)

    return frame
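
Finally, a usage sketch for this loader, with the same assumptions as the earlier examples:

# Hypothetical call with placeholder arguments.
basic_features = get_basic_features(ta, 'basic_features', 'hdfs://namenode/data/uphs/raw_visits.json')
# The frame contains the columns listed in data_columns, restricted to rows with
# valid PATID/VISID values and with defaults from default_values imputed.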