def recursiveFeatureElimination():
    """Run cross-validated recursive feature elimination (RFECV) for every
    point of interest and plot, per POI, the CV score as a function of the
    number of features kept.

    Side effects: reads from the DB, prints the optimal feature count per
    POI, and shows a grid of matplotlib subplots (one per POI).
    """
    with DB() as db:
        POIs = getPointsOfInterest()
        # Near-square subplot grid big enough for len(POIs) panels.
        numRows, numCols = int(math.sqrt(len(POIs))), int(math.sqrt(len(POIs))) + 1
        plt.figure()
        plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.5, hspace=0.5)
        fignum = 1
        for POI in POIs:
            x, y = loadData(db, POI['LAT'], POI['LONG'], generateAllFeatures)
            x, y = np.array(x), np.array(y)
            # Create the RFE object and compute a cross-validated score.
            svr = SVR(kernel="linear")
            # BUG FIX: the targets are continuous pickup counts, so the old
            # StratifiedKFold(y, 2) (class-balanced splits) plus the
            # classification metric 'accuracy' are invalid here and raise at
            # fit time.  Use plain 2-fold CV (RFECV builds a KFold when given
            # an int and a regressor) with the regression metric R^2.
            rfecv = RFECV(estimator=svr, step=1, cv=2, scoring='r2')
            rfecv.fit(x, y)
            print("Optimal number of features : %d" % rfecv.n_features_)
            # Plot number of features VS. cross-validation scores
            plt.subplot(numRows, numCols, fignum)
            plt.title(POI['NAME'])
            plt.xlabel("Number of features selected")
            plt.ylabel("Cross validation score (R^2)")
            plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
            fignum += 1
        plt.show()
def fitPipeline(db, latitude, longitude, generatePipeline): print 'Loading Data' x, y = loadData(db, latitude, longitude, generateAllFeatures) print 'Generating pipeline' pipeline = generatePipeline(x) print 'Training SVR' start = time.clock() pipeline.fit(x, y) print 'Total Training time:', time.clock() - start return pipeline
def plot(generateX, xLabel='x', yLabel='Taxi Pickups', includeFunc=None): with DB() as db: POIs = getPointsOfInterest() numRows, numCols = int(math.sqrt(len(POIs))), int(math.sqrt(len(POIs))) + 1 # for hour in xrange(24): plt.figure() plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.5, hspace=0.5) fignum = 1 for POI in POIs: print 'POI', POI x, y = loadData(db, POI['LAT'], POI['LONG'], generateX, includeFunc=includeFunc) plt.subplot(numRows, numCols, fignum) plt.scatter(x, y) plt.title(POI['NAME']) plt.xlabel(xLabel) plt.ylabel(yLabel) fignum += 1 plt.show()
def featureSelection(): with DB() as db: POIs = getPointsOfInterest() numRows, numCols = int(math.sqrt(len(POIs))), int(math.sqrt(len(POIs))) + 1 # for hour in xrange(24): plt.figure() plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.5, hspace=0.5) fignum = 1 for POI in POIs: print POI x, y = loadData(db, POI['LAT'], POI['LONG'], generateAllFeaturesExceptWeather) x, y = np.array(x), np.array(y) ############################################################################### width = 0.6 x_indices = np.arange(x.shape[-1]) ############################################################################### # Univariate feature selection with F-test for feature scoring # We use the default selection function: the 10% most significant features selector = SelectPercentile(f_regression, percentile=10) selector.fit(x, y) scores = -np.log10(selector.pvalues_) # scores /= scores.max() plt.subplot(numRows, numCols, fignum) plt.bar(x_indices-(width/2), scores, width=width, color='g') plt.title(POI['NAME']) plt.xlabel('Feature number') plt.ylabel('Univariate score ($-Log(p_{value})$)') plt.xticks(x_indices) plt.axis('tight') plt.legend(loc='upper right') fignum += 1 plt.show()