def performPrediction(allFeatures, allLabels, featureNames, foldCount=10):
    '''
    Reduce the feature matrix with PCA, report on it, then run the modeling step.

    Steps:
      1. Fit a PCA using the module-level ``pca_comp`` component count and
         print the individual and running-total explained-variance ratio of
         every component.
      2. Set the same PCA object to ``for_feature_selection`` components and
         call ``fit_transform`` on ``allFeatures`` to get the reduced dataset.
      3. Print component insights through ``printPCAInsights`` and pass the
         reduced dataset to ``sklearn_models.performModeling``.

    Parameters:
      allFeatures  -- feature matrix handed to ``PCA.fit`` / ``PCA.fit_transform``
      allLabels    -- labels, forwarded unchanged to ``performModeling``
      featureNames -- forwarded unchanged to ``printPCAInsights``
      foldCount    -- third argument of ``performModeling``; defaults to 10,
                      the value that used to be hard-coded here
                      (presumably the cross-validation fold count -- confirm
                      against sklearn_models)

    This function has no return statement; its own output is printed.
    '''
    separator = "-" * 50

    # First pass: fit with the exploratory component count so that the
    # explained-variance ratio of each principal component can be reported.
    pcaObj = decomposition.PCA(n_components=pca_comp)
    pcaObj.fit(allFeatures)

    totalvarExplained = float(0)
    for component_no, var_exp_ in enumerate(pcaObj.explained_variance_ratio_, 1):
        totalvarExplained = totalvarExplained + var_exp_
        print("Prin. comp#{}, ( indi) explained variance:{}, total explained variance:{}".format(
            component_no, var_exp_, totalvarExplained))

    no_features_to_use = for_feature_selection
    # Single-argument print(...) keeps the emitted text the same on
    # Python 2 and Python 3 (label, one space, then the value).
    print("Of all the features, we will use: {}".format(no_features_to_use))
    print(separator)

    # Second pass: fit_transform fits again, now with the final component
    # count, and returns the projected data.
    pcaObj.n_components = no_features_to_use
    selected_features = pcaObj.fit_transform(allFeatures)
    print("Selected feature dataset size: {}".format(np.shape(selected_features)))
    print(separator)

    printPCAInsights(pcaObj, topComponentCount, featureNames)
    print(separator)

    # Feature selection is out of the way; start the prediction step.
    sklearn_models.performModeling(selected_features, allLabels, foldCount)
all_features, all_labels) ### use randomized logi. regression to get the features ::: as this performs worse then l1-penalized , it wil not be used # selected_indices_for_features = sklearn_models.getElgiibleFeatures(all_features, all_labels) print "Total selected feature count:", len(selected_indices_for_features) print "The selected feature names: ", Utility.printFeatureName( selected_indices_for_features, True) ##True for enbaling steroid headers print "-" * 50 ### select the features based on feature indicies selected_features = Utility.createSelectedFeatures( all_features, selected_indices_for_features) print "Selected feature dataset size:", np.shape(selected_features) print "Glimpse at selected features (11th entry in label list): \n", selected_features[ glimpseIndex] print "-" * 50 fold2Use = 10 ''' Single iteration zone : turn off 'performIterativeModeling()' while running this ''' # this method runs the classifiers once sklearn_models.performModeling(selected_features, all_labels, fold2Use) print "-" * 50 ''' Multiple iteration zone : turn off 'performModeling()' while running this ''' # this method runs the classifiers 'iteration' number of times # iteration=1000 # sklearn_models.performIterativeModeling(selected_features, all_labels, fold2Use, iteration) # print "-"*50 print "Ended at:", Utility.giveTimeStamp()