else:
		patientEncounters[row[1]] = 1

	if(addRow and int(row[7]) in [13,14,19,20,21]):
	#feature index 7 is discharge_disposition
	#patient was sent to a hospice or died
		addRow = False
	
	if(addRow):
		row = [row[j] for j in sub_set_indexes]
		if(row[-1]=='Yes'):
			row[-1] = 1
			readmitted.append(row)
		else:
			row[-1] = 0
			no_readmitted.append(row)

print 'number of readmissions:', len(readmitted)
sub_set = random.sample(no_readmitted, len(readmitted)) + readmitted
random.shuffle(sub_set)
data_writer.writerows(sub_set)


# Keep only the `patientEncounters` entries whose value is greater than one
# and print how many of them there are.
# NOTE(review): presumably the keys are patient ids and the values are
# per-patient encounter counts - confirm against the loop that fills the dict.
repitedEncounters = dict(
	(patient, encounters)
	for (patient, encounters) in patientEncounters.items()
	if encounters > 1
)
print(len(repitedEncounters))

# Print the result of each DatabaseManager index getter, one per line.
# Each getter is looked up and called only when its turn comes, so the
# call/print order is the same as writing the five prints out by hand.
for getter_name in (
	"get_sub_feature_indexes",
	"get_left_out_feature_indexes",
	"get_indexes_to_scale",
	"get_indexes_to_encode",
	"get_indexes_to_hot_encode",
):
	print(getattr(DatabaseManager, getter_name)())
# Example #2
0
import csv
import sys

import numpy as np

sys.path.append("../data/dataset_diabetes")
import DatabaseManager

print("Using diabetes dataset")

# Load the whole CSV into memory: the first row goes to `headers`, every
# remaining row goes to `data_list` (each row is a list of strings).
# The `with` block closes the file as soon as reading is done; previously the
# handle was opened inline and never closed.
# "rb" is the mode the Python 2 csv module expects for its input files.
with open("../data/dataset_diabetes/subset_features_data.csv", "rb") as data_file:
    data_reader = csv.reader(data_file)
    # Built-in next() replaces the Python 2-only `.next()` method; like the
    # method, it raises StopIteration if the file has no header row.
    headers = next(data_reader)
    data_list = list(data_reader)

# Turn the list of CSV rows into an array so it can be walked column by column.
# NOTE(review): assumes every row has the same number of fields, so that this
# is a 2-D array of strings - confirm against the CSV that is loaded.
temp_data_mat = np.array(data_list)

# We need to convert categorical data to ints/floats so we can use one hot encoding.
# Fetch the set of column indexes to encode once instead of on every column.
# NOTE(review): assumes get_indexes_to_encode() returns the same collection of
# hashable indexes on every call - confirm in DatabaseManager.
encode_indexes = set(DatabaseManager.get_indexes_to_encode())

data_mat = []
for (index, col) in enumerate(temp_data_mat.T):
    if index in encode_indexes:
        # Maps each distinct raw value to its code. Codes are handed out in
        # order of first appearance; "?" also takes a code when first seen
        # (even though it is never written), which keeps the numbering of the
        # other values exactly as it was with the list-based lookup.
        value_codes = {}
        for (ii, item) in enumerate(col):
            code = value_codes.setdefault(item, len(value_codes))

            if item == "?":
                # Missing value marker: store "NaN" in its place.
                col[ii] = "NaN"
            else:
                # `col` is a view into temp_data_mat, so this writes in place.
                # NOTE(review): if the array has a fixed-width string dtype,
                # a code (or "NaN") longer than that width would be silently
                # truncated - confirm the widest cell is wide enough.
                col[ii] = code
    data_mat.append(col)

# convert out of the column format
data_mat = np.array(data_mat).T

# Imputer converts missing values (?'s) to the mean of the column