else: patientEncounters[row[1]] = 1 if(addRow and int(row[7]) in [13,14,19,20,21]): #feature index 7 is discharge_disposition #patient was sent to a hospice or died addRow = False if(addRow): row = [row[j] for j in sub_set_indexes] if(row[-1]=='Yes'): row[-1] = 1 readmitted.append(row) else: row[-1] = 0 no_readmitted.append(row) print 'number of readmissions:', len(readmitted) sub_set = random.sample(no_readmitted, len(readmitted)) + readmitted random.shuffle(sub_set) data_writer.writerows(sub_set) repitedEncounters = {k:v for (k,v) in patientEncounters.items() if(v>1)} print len(repitedEncounters) print DatabaseManager.get_sub_feature_indexes() print DatabaseManager.get_left_out_feature_indexes() print DatabaseManager.get_indexes_to_scale() print DatabaseManager.get_indexes_to_encode() print DatabaseManager.get_indexes_to_hot_encode()
import csv
import sys

import numpy as np

# DatabaseManager is imported from the dataset directory, so that directory
# has to be on the import path first.
sys.path.append("../data/dataset_diabetes")
import DatabaseManager

print("Using diabetes dataset")

# Read the header row and all data rows, then release the file handle right
# away. "rb" is the mode the Python 2 csv module expects.
with open("../data/dataset_diabetes/subset_features_data.csv", "rb") as data_file:
    data_reader = csv.reader(data_file)
    headers = next(data_reader)
    data_list = list(data_reader)
temp_data_mat = np.array(data_list)

# We need to convert categorical data to ints/floats so we can use one hot
# encoding.
# The set of columns to encode does not change while looping, so ask for it
# once instead of once per column.
indexes_to_encode = DatabaseManager.get_indexes_to_encode()
data_mat = []
for (index, col) in enumerate(temp_data_mat.T):
    if index in indexes_to_encode:
        # Maps each distinct value to the order in which it was first seen.
        # "?" takes a slot too (as it did in the list-based version), so the
        # codes of the other values are unchanged.
        codes = {}
        for (ii, item) in enumerate(col):
            code = codes.setdefault(item, len(codes))
            # NOTE(review): col is a view into temp_data_mat, so these writes
            # are stored using that array's dtype; with a fixed-width string
            # dtype, "NaN" or a multi-digit code would be truncated if it is
            # longer than the widest cell in the file -- confirm.
            if item == "?":
                # Missing value: mark it so it can be imputed later.
                col[ii] = "NaN"
            else:
                col[ii] = code
    data_mat.append(col)
# convert out of the column format
data_mat = np.array(data_mat).T
# Imputer converts missing values (?'s) to the mean of the column