Пример #1
0
def recursiveFeatureElimination():
	"""Plot cross-validated RFECV scores for every point of interest.

	For each POI returned by getPointsOfInterest(), the data is loaded with
	loadData() using generateAllFeatures, an RFECV wrapped around a
	linear-kernel SVR is fitted (step=1, 2-fold StratifiedKFold,
	scoring='accuracy'), the selected number of features is printed, and the
	per-feature-count scores are drawn in that POI's subplot.

	All subplots share one figure, which is shown after the DB context has
	been exited. Returns None.
	"""
	with DB() as db:
		POIs = getPointsOfInterest()
		# Keep the historical rows x (rows + 1) grid, but add one row when that
		# grid has fewer cells than there are POIs (e.g. 3, 7 or 8 POIs).
		# Without this, plt.subplot() is asked for an index beyond the grid.
		numRows = int(math.sqrt(len(POIs)))
		numCols = numRows + 1
		if numRows * numCols < len(POIs):
			numRows += 1

		plt.figure()
		plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.5, hspace=0.5)
		fignum = 1
		for POI in POIs:
			x, y = loadData(db, POI['LAT'], POI['LONG'], generateAllFeatures)
			x, y = np.array(x), np.array(y)

			# Create the RFE object and compute a cross-validated score.
			svr = SVR(kernel="linear")
			rfecv = RFECV(estimator=svr, step=1, cv=StratifiedKFold(y, 2), scoring='accuracy')
			rfecv.fit(x, y)

			print("Optimal number of features : %d" % rfecv.n_features_)

			# Plot number of features VS. cross-validation scores
			plt.subplot(numRows, numCols, fignum)
			plt.title(POI['NAME'])
			plt.xlabel("Number of features selected")
			plt.ylabel("Cross validation score (nb of misclassifications)")
			plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)

			fignum += 1
	plt.show()
Пример #2
0
def plot(generateX, xLabel='x', yLabel='Taxi Pickups', includeFunc=None):
	"""Draw one scatter subplot of loaded x/y data per point of interest.

	Args:
		generateX: passed through to loadData() as its fourth positional
			argument.
		xLabel: x-axis label applied to every subplot.
		yLabel: y-axis label applied to every subplot.
		includeFunc: passed through to loadData() as ``includeFunc``.

	All subplots share one figure, which is shown after the DB context has
	been exited. Returns None.
	"""
	with DB() as db:
		POIs = getPointsOfInterest()
		# Keep the historical rows x (rows + 1) grid, but add one row when that
		# grid has fewer cells than there are POIs (e.g. 3, 7 or 8 POIs).
		# Without this, plt.subplot() is asked for an index beyond the grid.
		numRows = int(math.sqrt(len(POIs)))
		numCols = numRows + 1
		if numRows * numCols < len(POIs):
			numRows += 1

		plt.figure()
		plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.5, hspace=0.5)
		fignum = 1
		for POI in POIs:
			# Single-argument parenthesised print: the output is the same
			# whether print is a statement or a function.
			print('POI %s' % (POI,))
			x, y = loadData(db, POI['LAT'], POI['LONG'], generateX, includeFunc=includeFunc)

			plt.subplot(numRows, numCols, fignum)
			plt.scatter(x, y)
			plt.title(POI['NAME'])
			plt.xlabel(xLabel)
			plt.ylabel(yLabel)

			fignum += 1
	plt.show()
Пример #3
0
def featureSelection():
	"""Plot univariate F-test feature scores for every point of interest.

	For each POI returned by getPointsOfInterest(), the data is loaded with
	loadData() using generateAllFeaturesExceptWeather, a
	SelectPercentile(f_regression, percentile=10) selector is fitted, and
	-log10 of the selector's p-values is drawn as a bar chart with one bar
	per feature index.

	All subplots share one figure, which is shown after the DB context has
	been exited. Returns None.
	"""
	with DB() as db:
		POIs = getPointsOfInterest()
		# Keep the historical rows x (rows + 1) grid, but add one row when that
		# grid has fewer cells than there are POIs (e.g. 3, 7 or 8 POIs).
		# Without this, plt.subplot() is asked for an index beyond the grid.
		numRows = int(math.sqrt(len(POIs)))
		numCols = numRows + 1
		if numRows * numCols < len(POIs):
			numRows += 1

		plt.figure()
		plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.5, hspace=0.5)
		fignum = 1
		for POI in POIs:
			# Parenthesised single-argument print: the output is the same
			# whether print is a statement or a function.
			print(POI)
			x, y = loadData(db, POI['LAT'], POI['LONG'], generateAllFeaturesExceptWeather)
			x, y = np.array(x), np.array(y)

			# Bar width; each bar's x position is shifted left by half of it.
			width = 0.6

			# One bar position per entry along the last axis of x.
			x_indices = np.arange(x.shape[-1])

			# Univariate feature selection with F-test for feature scoring.
			# Only the fitted p-values are used below; the selector is never
			# used to transform the data.
			selector = SelectPercentile(f_regression, percentile=10)
			selector.fit(x, y)
			scores = -np.log10(selector.pvalues_)

			plt.subplot(numRows, numCols, fignum)

			plt.bar(x_indices-(width/2), scores, width=width, color='g')
			plt.title(POI['NAME'])
			plt.xlabel('Feature number')
			plt.ylabel('Univariate score ($-Log(p_{value})$)')
			plt.xticks(x_indices)
			plt.axis('tight')
			# NOTE(review): no artist in this block is given a label, so this
			# legend has nothing to list - confirm whether it is still wanted.
			plt.legend(loc='upper right')

			fignum += 1
	plt.show()
Пример #4
0
		elif prediction < 0:
			numNegatives += 1
		finalPredictions.append(finalPrediction)

	return zip(inputVectors.keys(), finalPredictions), numNegatives


# Script entry point: fit one pipeline per point of interest, predict on the
# test dataset, report timing and negative-prediction counts, then start
# assembling the output rows.
if __name__ == '__main__':
	# The FINAL flag selects which test dataset file is loaded.
	test_dataset_filename = TEST_DATASET_FINAL_FILENAME if FINAL else TEST_DATASET_INITIAL_FILENAME
	testDataset = loadTestDataset(test_dataset_filename)

	predictions = []
	# NOTE(review): time.clock() was removed in Python 3.8; fine for the
	# Python 2 interpreter this file's print statements require.
	start = time.clock()
	numNegatives = 0
	with DB() as db:
		for POI in getPointsOfInterest():
			print 'POI', POI
			# A fresh pipeline is fitted for every POI, then used to predict
			# against the shared test dataset.
			pipeline = fitPipeline(db, POI['LAT'], POI['LONG'], GENERATE_PIPELINE)
			# predict() is unpacked as (iterable of predictions, negative count);
			# the items are later unpacked as (locID, prediction) pairs.
			POIPredictions, POINegatives = predict(db, pipeline, POI['LAT'], POI['LONG'], testDataset)
			predictions.extend(POIPredictions)
			numNegatives += POINegatives

	print 'Predicted a negative number of taxi pickups %i times' % numNegatives

	print 'All predictions took %s seconds' % (time.clock() - start)
	print 'Writing output'
	# One flag per test-dataset entry, set to True once a prediction for that
	# index has been seen.
	# NOTE(review): indexing idList by locID assumes locIDs are integers in
	# range(len(testDataset)) - confirm against loadTestDataset().
	idList = [False] * len(testDataset)
	outputList = []
	for locID, prediction in predictions:
		# '%i' formats the prediction as an integer in the output row.
		outputList.append((locID, '%i %i' % (locID, prediction)))
		idList[locID] = True