Пример #1
0
def classify(k, sample, training_data, att_names = None):
	"""Predict the class of `sample` by majority vote of its k nearest training rows.

	Distances come from `euclid_distance_squared`, restricted to the columns
	selected by `att_names`.

	Args:
		k: number of nearest neighbours that vote. If it exceeds the number
			of training rows, every training row votes.
		sample: row to classify. It is also read at the "class" column so the
			prediction can be checked against the true label.
		training_data: iterable of rows indexed the same way as `sample`.
		att_names: attribute names used for the distance; defaults to the
			full header from `data_preprocessing.get_header()`.

	Returns:
		A tuple `(predicted, correct)`: `predicted` is 0 or 1, and `correct`
		is whether the sample's own label is "class0" / "class1" respectively.
		A tied vote (including no votes at all) predicts 1.
	"""
	if att_names is None:
		att_names = data_preprocessing.get_header()

	attribute_indexes = convert_att_names_to_indexes(att_names)
	# The label column is looked up in the full header rather than in
	# att_names, so it is found even when att_names omits "class".
	class_index = data_preprocessing.get_header().index("class")

	# Min-heap of (distance, row); when distances tie, tuple comparison
	# falls back to comparing the rows themselves.
	distances = []
	for training_sample in training_data:
		dist = euclid_distance_squared(sample, training_sample, attribute_indexes)
		heappush(distances, (dist, training_sample))

	class0_count = 0
	class1_count = 0
	# Never pop more entries than the heap holds: heappop on an empty heap
	# raises IndexError, which used to happen whenever k > len(training_data).
	for _ in range(min(k, len(distances))):
		label = heappop(distances)[1][class_index]
		if label == "class0":
			class0_count += 1
		elif label == "class1":
			class1_count += 1

	if class0_count > class1_count:
		return 0, sample[class_index] == 'class0'
	return 1, sample[class_index] == 'class1'
Пример #2
0
def convert_att_names_to_indexes(attributes):
	"""Translate attribute names into their positions in the data header.

	The "class" entry is skipped, so the result only holds the positions of
	the remaining names, in the order they were given.

	Args:
		attributes: iterable of attribute names.

	Returns:
		List of integer positions within `data_preprocessing.get_header()`.

	Raises:
		ValueError: if a name is missing from the header (raised by `.index`,
			assuming the header is a list).
	"""
	# Fetch the header once instead of once per attribute.
	header = data_preprocessing.get_header()
	# `.index` raises for a missing name and never returns -1, so no
	# "found" check is needed on its result.
	return [header.index(name) for name in attributes if name != "class"]
Пример #3
0
def calculate_PDF(inputArray,class_zero,class_one,attr_names=False):
	"""Evaluate `PDF_math` for every attribute under both class models.

	Args:
		inputArray: mapping from attribute name to the observed value.
		class_zero: mapping from attribute name to a dict with 'mean' and 'sd'.
		class_one: same layout as `class_zero`, for the other class.
		attr_names: header names to use; the default `False` falls back to
			`data_preprocessing.get_header()`.

	Returns:
		{'class_zero': {...}, 'class_one': {...}}, each inner dict mapping an
		attribute name to the value returned by `PDF_math`.
	"""
	headers = data_preprocessing.get_header() if attr_names == False else attr_names
	zero_pdfs = {}
	one_pdfs = {}
	# The last header entry is left out (presumably the class label - confirm).
	for name in headers[:-1]:
		observed = inputArray[name]
		zero_stats = class_zero[name]
		zero_pdfs[name] = PDF_math(observed, zero_stats['mean'], zero_stats['sd'])
		one_stats = class_one[name]
		one_pdfs[name] = PDF_math(observed, one_stats['mean'], one_stats['sd'])
	return {'class_zero': zero_pdfs, 'class_one': one_pdfs}
Пример #4
0
def convert_array_to_dict(inputArray,custom_headers=False):
	"""Turn a positional data row into a dict keyed by header name.

	Args:
		inputArray: row whose values are indexed in the same order as the
			header returned by `data_preprocessing.get_header()`.
		custom_headers: optional iterable of header names. When given, only
			the names that exist in the header are kept; unknown names are
			ignored. The default `False` keeps every column.

	Returns:
		Dict mapping header name to the row's value at that column. Every
		header entry is included unless `custom_headers` filters it out.

	Raises:
		IndexError: if the row is shorter than the header.
	"""
	headers = data_preprocessing.get_header()
	outputArray = {}
	# enumerate yields each column position directly, avoiding a linear
	# headers.index() search for every column.
	for position, header in enumerate(headers):
		outputArray[header] = inputArray[position]
	if custom_headers != False: # subset of headers given
		subset = {}
		for key in custom_headers:
			if key in outputArray:
				subset[key] = outputArray[key]
		return subset
	return outputArray
Пример #5
0
def calculate_mean_sd(inputData,attr_names=False):
	"""Compute the per-class mean and standard deviation of every attribute.

	Rows labelled "class1" are counted in `class_one`; every other row is
	counted in `class_zero`. The standard deviation is the population form
	(the squared deviations are divided by the class size N).

	Args:
		inputData: iterable of positional rows accepted by
			`convert_array_to_dict`.
		attr_names: header names to use; the default `False` falls back to
			`data_preprocessing.get_header()`. The last entry is not treated
			as an attribute, and "class" must be among the names.

	Returns:
		A tuple `(class_zero, class_one)`. Each is a dict mapping an attribute
		name to `{'mean': ..., 'sd': ...}`, plus a 'size' key holding the
		number of rows in that class.

	Raises:
		ZeroDivisionError: if there is at least one attribute and either
			class has no rows.
	"""
	if attr_names == False:
		headers = data_preprocessing.get_header() # headers to data
	else:
		headers = attr_names
	# The last header entry is left out (presumably the class label - confirm).
	attribute_names = headers[:-1]

	class_one = {} 	# People with Diabetes
	class_zero = {} 	# People without Diabetes
	for name in attribute_names:
		class_one[name] = {'mean':0,"sd":0}
		class_zero[name] = {'mean':0,"sd":0}
	class_one['size'] = 0
	class_zero['size'] = 0

	# Convert each row once and remember which class it belongs to, so the
	# deviation pass below can reuse the result instead of converting again.
	labelled_rows = []
	for row in inputData:
		values = convert_array_to_dict(row, attr_names)
		label = values.pop("class")
		stats = class_one if label == "class1" else class_zero
		for key in values:
			stats[key]['mean'] += values[key] # running sum for now
		stats['size'] += 1
		labelled_rows.append((stats, values))

	# Turn the sums into means. float() keeps the division from truncating
	# when the sums are integers under Python 2.
	for stats in (class_zero, class_one):
		size = float(stats['size'])
		for name in attribute_names:
			stats[name]['mean'] = stats[name]['mean'] / size

	# Accumulate squared deviations from the class mean: (xi - mean)^2
	for stats, values in labelled_rows:
		for key in values:
			stats[key]['sd'] += math.pow(values[key] - stats[key]['mean'], 2)

	# (total_sum / N)^(1/2)
	for stats in (class_zero, class_one):
		size = float(stats['size'])
		for name in attribute_names:
			stats[name]['sd'] = math.sqrt(stats[name]['sd'] / size)

	return class_zero, class_one
Пример #6
0
def classify(inputArray,class_zero,class_one,attr_names=False):
	"""Classify one row by comparing class priors multiplied by attribute PDFs.

	Args:
		inputArray: positional row accepted by `convert_array_to_dict`; the
			converted row must contain a "class" entry holding the true label.
		class_zero: statistics for class 0: per-attribute 'mean'/'sd' dicts
			plus a 'size' count.
		class_one: same layout as `class_zero`, for class 1.
		attr_names: header names to use; the default `False` falls back to
			`data_preprocessing.get_header()`.

	Returns:
		A tuple `(predicted, correct)`: `predicted` is 1 when the class-one
		score is at least the class-zero score, else 0; `correct` is whether
		the row's own label is "class1" / "class0" respectively.
	"""
	headers = attr_names
	if attr_names == False:
		headers = data_preprocessing.get_header()

	features = convert_array_to_dict(inputArray, attr_names)
	pdf_array = calculate_PDF(features, class_zero, class_one, attr_names)

	# Class priors: each class's share of all training rows.
	total_rows = float(class_one['size'] + class_zero['size'])
	score_one = float(class_one['size']) / total_rows
	score_zero = float(class_zero['size']) / total_rows

	# Multiply each prior by the PDF of every attribute (last header entry excluded).
	one_pdfs = pdf_array['class_one']
	zero_pdfs = pdf_array['class_zero']
	for name in headers[:-1]:
		score_one = score_one * one_pdfs[name]
		score_zero = score_zero * zero_pdfs[name]

	actual = features['class']
	if (score_one - score_zero) >= 0:
		return 1, actual == 'class1' # for Diabetic
	return 0, actual == 'class0' # for Non-Diabetic