def output_graph(self, geography="nyc", filename=None):
    """Plot the grades over time for one geography and save or show the figure.

    The figure has two stacked panels: the per-year grade percentages
    (drawn by ``percentage_graph``) on top and the per-year grade counts
    (drawn by ``bar_graph``) below.

    Args:
        geography: 'nyc' for the whole city, or one borough ('bronx',
            'queens', 'brooklyn', 'manhattan', 'staten'). It is passed
            straight to ``self.cut_to_geography``.
        filename: Path the figure is written to with ``fig.savefig``
            (e.g. a PDF). When None the figure is shown interactively
            with ``plt.show()`` instead.
    """
    print("Outputting figure...%s" % filename)
    # Flush so the progress message is visible before the plotting work starts.
    sys.stdout.flush()

    # Get counts and percents; missing values are plotted as zero.
    counts, pcts = self.get_grade_counts_by_year(self.cut_to_geography(geography))
    counts.fillna(value=0, inplace=True)
    pcts.fillna(value=0, inplace=True)

    # Create axes and figure; the right margin is narrowed to 80% of the width.
    fig, axes = plt.subplots(2, 1, sharex=False)
    fig.set_size_inches(7, 11)
    fig.subplots_adjust(right=.8)

    percentage_graph(pcts, axes[0])
    bar_graph(counts, axes[1])

    if filename is None:
        plt.show()
        return

    try:
        fig.savefig(filename)
    finally:
        # pyplot keeps every figure alive until it is closed; release this one
        # so repeated calls (one per geography) do not accumulate figures.
        plt.close(fig)
# Example #2
# 0
def select_best_model(training_X,training_Y,data_title=None):
    """Tune several classifiers, score them on held-out data and plot the results.

    The samples are dealt round-robin into ten folds (every 10th item):
    folds 2-7 are used for the hyper-parameter sweep, fold 8 validates the
    sweep, folds 2-8 together refit the chosen setting, and folds 1, 9 and 10
    form the test set.

    The decision tree, random forest and AdaBoost models are tuned with
    ``optimize_alpha`` and scored with the ``scikit_*`` helpers. The "knn" and
    "log_reg" entries are fixed 0.5 placeholders in this version.

    Args:
        training_X: Sliceable sequence of samples; the slices must be
            accepted by ``numpy.concatenate``.
        training_Y: Labels, sliced exactly like ``training_X``.
        data_title: Title forwarded to the ``plotting`` helpers.

    Returns:
        A tuple ``(final_model, alpha, beta)``: the model name chosen by
        ``argmax`` over the test scores, the alpha recorded for that model,
        and the random-forest beta (returned whichever model was chosen).
    """
    # Placeholder score used for the models that are not wired in here.
    stub_accuracy = 0.5

    def to_error_rates(score_by_key):
        """Return a copy of the mapping with every score replaced by 1 - score."""
        error_by_key = score_by_key.copy()
        for key in error_by_key:
            error_by_key[key] = 1.0 - error_by_key[key]
        return error_by_key

    # Split the data into ten folds by taking every 10th item.
    folds_X = [training_X[offset::10] for offset in range(10)]
    folds_Y = [training_Y[offset::10] for offset in range(10)]

    train_set_X = numpy.concatenate(folds_X[1:7])
    train_set_Y = numpy.concatenate(folds_Y[1:7])
    validate_set_X = folds_X[7]
    validate_set_Y = folds_Y[7]

    full_train_set_X = numpy.concatenate(folds_X[1:8])
    full_train_set_Y = numpy.concatenate(folds_Y[1:8])
    test_set_X = numpy.concatenate((folds_X[0], folds_X[8], folds_X[9]))
    test_set_Y = numpy.concatenate((folds_Y[0], folds_Y[8], folds_Y[9]))

    ## Evaluate and find the best alpha for each model.
    # NOTE(review): argmax is a project helper; it is indexed so that [0] is
    # the alpha (and [1] the beta for the forest) -- confirm against its source.

    # Decision tree.
    start_time = time.clock()
    dict_alpha0 = optimize_alpha("dc_tree",train_set_X,train_set_Y,validate_set_X,validate_set_Y)
    best_alpha0 = int(argmax(dict_alpha0)[0])
    print("\talpha: "+str(best_alpha0))
    train_time0 = time.clock() - start_time

    # The "prediction" timer covers the whole scikit_dc_tree call.
    start_time = time.clock()
    y_test0 = scikit_dc_tree(full_train_set_X, full_train_set_Y, test_set_X,test_set_Y,alpha = best_alpha0)
    predict0 = time.clock() - start_time
    print("")

    # k-nearest neighbours: placeholder values only.
    best_alpha1 = 0.5
    y_test1 = stub_accuracy

    # Logistic regression: placeholder sweep with a single point.
    start_time = time.clock()
    dict_alpha2 = {"5":stub_accuracy}
    best_alpha2 = float(argmax(dict_alpha2)[0])
    print("\talpha: "+str(best_alpha2))
    train_time2 = time.clock() - start_time

    start_time = time.clock()
    y_test2 = stub_accuracy
    predict2 = time.clock() - start_time
    print("")

    ## Ensemble classifiers.

    # Random forest: one argmax call yields the alpha/beta pair, so the two
    # values always come from the same sweep entry.
    dict_alpha3 = optimize_alpha("ran_forest",train_set_X,train_set_Y,validate_set_X,validate_set_Y)
    best_params3 = argmax(dict_alpha3)
    best_alpha3 = int(best_params3[0])
    best_beta3 = int(best_params3[1])
    print("\talpha: "+str(best_alpha3)+" beta: "+str(best_beta3))
    y_test3 = scikit_ran_forest(full_train_set_X, full_train_set_Y, test_set_X,test_set_Y,alpha=best_alpha3,beta=best_beta3)
    print("")

    # AdaBoost.
    dict_alpha4 = optimize_alpha("ada_boost",train_set_X,train_set_Y,validate_set_X,validate_set_Y)
    best_alpha4 = int(argmax(dict_alpha4)[0])
    print("\talpha: "+str(best_alpha4))
    y_test4 = scikit_ada_boost(full_train_set_X, full_train_set_Y, test_set_X,test_set_Y,alpha = best_alpha4)
    print("")

    # An SVM sweep (optimize_alpha("svm", ...) + scikit_onevsrest) used to live
    # here; it was disabled because it was too slow.

    ## Consolidate and output the final result.
    dict_choices = {"dc_tree":y_test0,"knn":y_test1,"log_reg":y_test2,"ran_forest":y_test3,"ada_boost":y_test4}
    dict_alpha = {"dc_tree":best_alpha0,"knn":best_alpha1,"log_reg":best_alpha2,"ran_forest":best_alpha3,"ada_boost":best_alpha4}
    print(dict_choices)
    final_model = argmax(dict_choices)[0]

    ## Prep for bar graphs (decision tree vs. logistic regression only).
    # Work on a copy so the score table above is not overwritten with errors.
    dict_errors = to_error_rates(dict_choices)
    errors = [dict_errors["dc_tree"],dict_errors["log_reg"]]
    traintimes = [train_time0,train_time2]
    predicttimes = [predict0,predict2]

    ## Plot error rate.
    plotting.bar_graph(data_title,"decisiontree","log_reg",[1,2],errors,maxy=max(errors)*1.1,ylabel="errors")
    ## Plot training time.
    plotting.bar_graph(data_title,"decisiontree","log_reg",[1,2],traintimes,maxy=max(traintimes)*1.1,ylabel="training times(s)")
    ## Plot prediction time.
    plotting.bar_graph(data_title,"decisiontree","log_reg",[1,2],predicttimes,maxy=max(predicttimes)*1.1,ylabel="prediction times(s)")

    # Prep for line graphs: the alpha grid is 30, 60, 90.
    alpha_X_ints = []
    for exponent in range(1, 2, 1):
        for factor in range(3,10,3):
            alpha = 10**exponent*factor
            if alpha > 1:
                alpha_X_ints.append(alpha)
    # The sweep dictionaries are keyed by the alpha rendered as a string.
    alpha_X = [str(alpha) for alpha in alpha_X_ints]

    dict_error0 = to_error_rates(dict_alpha0)
    dict_error2 = to_error_rates(dict_alpha2)

    alpha_Y_dc = [dict_error0[xvalue] for xvalue in alpha_X]
    # The log_reg sweep is a one-point placeholder keyed "5", so it has no
    # entry for the grid above; a plain lookup would raise KeyError. Fall back
    # to the placeholder error, the same value the bar chart shows for log_reg.
    stub_error = 1.0 - stub_accuracy
    alpha_Y_log_reg = [dict_error2.get(xvalue, stub_error) for xvalue in alpha_X]

    alpha_Y = alpha_Y_dc,alpha_Y_log_reg

    # Plot line graph.
    plotting.line_graph_alpha_error(data_title,"dc_tree","log_reg",alpha_X_ints,alpha_Y)

    return final_model,dict_alpha[final_model],best_beta3
def select_best_model(training_X,training_Y,data_title=None):
    """Tune a decision tree, k-NN and random forest, score them and plot the results.

    The samples are dealt round-robin into ten folds (every 10th item):
    folds 2-7 are used for the hyper-parameter sweep, fold 8 validates the
    sweep, folds 2-8 together refit the chosen setting, and folds 1, 9 and 10
    form the test set.

    Each model is tuned with ``optimize_alpha`` and then scored with its
    ``scikit_*`` helper. Bar charts compare error, tuning time and
    scoring time; a line chart shows error against alpha.

    Args:
        training_X: Sliceable sequence of samples; the slices must be
            accepted by ``numpy.concatenate``.
        training_Y: Labels, sliced exactly like ``training_X``.
        data_title: Title forwarded to the ``plotting`` helpers.

    Returns:
        A tuple ``(final_model, alpha, beta)``: the model name chosen by
        ``argmax`` over the test scores, the alpha recorded for that model,
        and the random-forest beta (returned whichever model was chosen).
    """
    def to_error_rates(score_by_key):
        """Return a copy of the mapping with every score replaced by 1 - score."""
        # .copy() keeps the mapping type, so a defaultdict stays a defaultdict.
        error_by_key = score_by_key.copy()
        for key in error_by_key:
            error_by_key[key] = 1.0 - error_by_key[key]
        return error_by_key

    # Split the data into ten folds by taking every 10th item.
    folds_X = [training_X[offset::10] for offset in range(10)]
    folds_Y = [training_Y[offset::10] for offset in range(10)]

    train_set_X = numpy.concatenate(folds_X[1:7])
    train_set_Y = numpy.concatenate(folds_Y[1:7])
    validate_set_X = folds_X[7]
    validate_set_Y = folds_Y[7]

    full_train_set_X = numpy.concatenate(folds_X[1:8])
    full_train_set_Y = numpy.concatenate(folds_Y[1:8])
    test_set_X = numpy.concatenate((folds_X[0], folds_X[8], folds_X[9]))
    test_set_Y = numpy.concatenate((folds_Y[0], folds_Y[8], folds_Y[9]))

    # Result tables, keyed by model name.
    dict_choices = defaultdict(float)
    dict_alpha = defaultdict(float)
    traintimes = defaultdict(float)
    predicttime = defaultdict(float)

    ## Evaluate and find the best alpha for each model.
    # NOTE(review): argmax is a project helper; it is indexed so that [0] is
    # the alpha (and [1] the beta for the forest) -- confirm against its source.

    # Decision tree.
    start_time = time.clock()
    dict_alpha0 = optimize_alpha("dc_tree",train_set_X,train_set_Y,validate_set_X,validate_set_Y)
    best_alpha0 = int(argmax(dict_alpha0)[0])
    dict_alpha["dc_tree"] = best_alpha0
    print("\talpha: "+str(best_alpha0))
    traintimes["dc_tree"] = time.clock() - start_time

    # The "prediction" timers cover the whole scikit_* call.
    start_time = time.clock()
    dict_choices["dc_tree"] = scikit_dc_tree(full_train_set_X, full_train_set_Y, test_set_X,test_set_Y,alpha = best_alpha0)
    predicttime["dc_tree"] = time.clock() - start_time
    print("")

    # k-nearest neighbours.
    start_time = time.clock()
    dict_alpha1 = optimize_alpha("knn",train_set_X,train_set_Y,validate_set_X,validate_set_Y)
    best_alpha1 = int(argmax(dict_alpha1)[0])
    dict_alpha["knn"] = best_alpha1
    print("\talpha: "+str(best_alpha1))
    traintimes["knn"] = time.clock() - start_time

    start_time = time.clock()
    dict_choices["knn"] = scikit_knn_model(full_train_set_X, full_train_set_Y, test_set_X,test_set_Y,alpha = best_alpha1)
    predicttime["knn"] = time.clock() - start_time
    print("")

    # A logistic-regression sweep (optimize_alpha("log_reg", ...) +
    # scikit_log_reg) used to live here; it is disabled in this version.

    ## Ensemble classifiers.

    # Random forest: one argmax call yields the alpha/beta pair, so the two
    # values always come from the same sweep entry.
    start_time = time.clock()
    dict_alpha3 = optimize_alpha("ran_forest",train_set_X,train_set_Y,validate_set_X,validate_set_Y)
    best_params3 = argmax(dict_alpha3)
    best_alpha3 = int(best_params3[0])
    best_beta3 = int(best_params3[1])
    dict_alpha["ran_forest"] = best_alpha3
    print("\talpha: "+str(best_alpha3)+" beta: "+str(best_beta3))
    traintimes["ran_forest"] = time.clock() - start_time

    start_time = time.clock()
    dict_choices["ran_forest"] = scikit_ran_forest(full_train_set_X, full_train_set_Y, test_set_X,test_set_Y,alpha=best_alpha3,beta=best_beta3)
    predicttime["ran_forest"] = time.clock() - start_time
    print("")

    # An AdaBoost sweep (optimize_alpha("ada_boost", ...) + scikit_ada_boost)
    # and an SVM sweep (optimize_alpha("svm", ...) + scikit_onevsrest) used to
    # live here; both are disabled, the SVM one because it was too slow.

    ## Consolidate and output the final result.
    print(dict_choices)
    final_model = argmax(dict_choices)[0]

    ## Prep for bar graphs.
    # Work on a copy so the score table above is not overwritten with errors.
    dict_errors = to_error_rates(dict_choices)
    model_keys = ["dc_tree", "knn", "ran_forest"]
    errors = [dict_errors[model] for model in model_keys]
    traintime_bars = [traintimes[model] for model in model_keys]
    predicttime_bars = [predicttime[model] for model in model_keys]

    ## Plot error rate.
    plotting.bar_graph(data_title,"decisiontree","knn","ran_forest",[1,2,3],errors,maxy=max(errors)*1.1,ylabel="errors")
    ## Plot training time.
    plotting.bar_graph(data_title,"decisiontree","knn","ran_forest",[1,2,3],traintime_bars,maxy=max(traintime_bars)*1.1,ylabel="training times(s)")
    ## Plot prediction time.
    plotting.bar_graph(data_title,"decisiontree","knn","ran_forest",[1,2,3],predicttime_bars,maxy=max(predicttime_bars)*1.1,ylabel="prediction times(s)")

    # Prep for line graphs: the alpha grid is 3, 6, 9, 30, 60, 90.
    alpha_X_ints = []
    for exponent in range(0, 2, 1):
        for factor in range(3,10,3):
            alpha = 10**exponent*factor
            if alpha > 1:
                alpha_X_ints.append(alpha)

    dict_error0 = to_error_rates(dict_alpha0)
    dict_error1 = to_error_rates(dict_alpha1)
    dict_error3 = to_error_rates(dict_alpha3)

    alpha_Y_dc = []
    alpha_Y_knn = []
    alpha_Y_ran_forest = []
    for alpha in alpha_X_ints:
        # The tree and k-NN sweeps are keyed by the alpha rendered as a string.
        xvalue = str(alpha)
        alpha_Y_dc.append(dict_error0[xvalue])
        alpha_Y_knn.append(dict_error1[xvalue])
        # The forest sweep is keyed "alpha,beta"; its curve is read at three
        # times the grid alpha with beta fixed at 50.
        alpha_Y_ran_forest.append(dict_error3[str(alpha*3)+","+str(50)])

    alpha_Y = alpha_Y_dc,alpha_Y_knn,alpha_Y_ran_forest

    # Plot line graph.
    plotting.line_graph_alpha_error(data_title,"dc_tree","knn","ran_forest",alpha_X_ints,alpha_Y)

    return final_model,dict_alpha[final_model],best_beta3