def CrossValidationWithSampling():
    """Run the classifiers on each of 5 cross-validation folds of sampled data.

    Loads the data with ``loadData``, passes the training part through
    ``RS.randomSampling().getRandomSample`` and ``runObj.randomShuffleData``,
    resets the index, and calls ``runObj.runClassifier`` once per fold.
    """
    runObj = run_util.run()

    ## Load data
    ((X_train, Y_train), (X_test)) = loadData(runObj)

    ## Sample train data
    RSObj = RS.randomSampling()
    (X_train, Y_train) = RSObj.getRandomSample(X_train, Y_train)
    (X_train, Y_train) = runObj.randomShuffleData(X_train, Y_train)

    # Format inside the print call: ``print(...).format(...)`` only works as a
    # Python 2 print statement and raises AttributeError on Python 3.
    print('\n Size of X_train : {0}'.format(X_train.shape))
    print('\n Size of Y_train : {0}'.format(Y_train.shape))

    X_train.reset_index(drop=True, inplace=True)
    Y_train.reset_index(drop=True, inplace=True)

    ## Cross Validation
    # NOTE(review): this call shape (n, n_folds=...) and iterating the object
    # directly look like the legacy sklearn.cross_validation.KFold API --
    # confirm which KFold the file imports before upgrading scikit-learn.
    kf = KFold(X_train.shape[0], n_folds=5)
    for train_index, test_index in kf:
        # The index was just reset to 0..n-1, so positional ``iloc`` picks
        # exactly the rows the removed label-based ``ix`` indexer picked.
        X_train_CV = X_train.iloc[train_index]
        X_test_CV = X_train.iloc[test_index]
        Y_train_CV = Y_train.iloc[train_index]
        Y_test_CV = Y_train.iloc[test_index]

        print('\n Size of X_train_CV : {0}'.format(X_train_CV.shape))
        print('\n Size of Y_train_CV : {0}'.format(Y_train_CV.shape))
        print('\n Size of X_test_CV : {0}'.format(X_test_CV.shape))
        print('\n Size of Y_test_CV : {0}'.format(Y_test_CV.shape))

        ## Run classifiers
        runObj.runClassifier(X_train_CV, Y_train_CV, X_test_CV, Y_test_CV)
def RFEWithSampling():
    """Run the classifiers on sampled data restricted to RFE-selected columns.

    Loads the data with ``loadData``, samples and shuffles the training part,
    asks ``AF.RFE_featureSelection`` for the columns to keep, splits the
    reduced training data with ``runObj.splitDataset`` and passes the
    resulting pieces to ``runObj.runClassifier``.
    """
    runObj = run_util.run()

    ## Load data
    ((X_train, Y_train), (X_test)) = loadData(runObj)

    ## Sample train data
    RSObj = RS.randomSampling()
    (X_train, Y_train) = RSObj.getRandomSample(X_train, Y_train)
    (X_train, Y_train) = runObj.randomShuffleData(X_train, Y_train)

    # Format inside the print call: ``print(...).format(...)`` only works as a
    # Python 2 print statement and raises AttributeError on Python 3.
    print('\n Size of Sample X_train : {0}'.format(X_train.shape))
    print('\n Size of Sample Y_train : {0}'.format(Y_train.shape))

    X_train.reset_index(drop=True, inplace=True)
    Y_train.reset_index(drop=True, inplace=True)

    ## RFE Feature Selection
    print('\n RFE Feature Selection starts...')
    selected_columns = AF.RFE_featureSelection(X_train, Y_train)
    print('\n RFE Feature Selection ends...')

    X_train = X_train[selected_columns]
    # Project the test set onto the same columns. The original indexed
    # X_train here, which silently replaced the test set with training rows.
    # NOTE(review): assumes X_test carries the same column labels as X_train.
    X_test = X_test[selected_columns]

    ## Split data into train and test parts (ratio is decided by splitDataset)
    ((X_train_S, Y_train_S), (X_test_S, Y_test_S)) = runObj.splitDataset(X_train, Y_train)

    print('\n Size of X_train_S : {0}'.format(X_train_S.shape))
    print('\n Size of Y_train_S : {0}'.format(Y_train_S.shape))
    print('\n Size of X_test_S : {0}'.format(X_test_S.shape))
    print('\n Size of Y_test_S : {0}'.format(Y_test_S.shape))

    ## Run classifiers
    runObj.runClassifier(X_train_S, Y_train_S, X_test_S, Y_test_S)
def costValuesBasedPrediction():
    """Run the classifiers using only the two cost-related columns.

    Restricts the loaded training data to ``MMRAcquisitionAuctionAveragePrice``
    and ``VehBCost``, samples and shuffles it, splits it with
    ``runObj.splitDataset`` and passes the pieces to ``runObj.runClassifier``.
    """
    runObj = run_util.run()

    ## Load data
    ((X_train, Y_train), (X_test)) = loadData(runObj)

    ## Select feature related to cost
    costFeatures = ['MMRAcquisitionAuctionAveragePrice', 'VehBCost']
    X_train = X_train[costFeatures]

    # Format inside the print call: ``print(...).format(...)`` only works as a
    # Python 2 print statement and raises AttributeError on Python 3.
    print('\n Size of X_train only cost features : {0}'.format(X_train.shape))

    ## Sample train data
    RSObj = RS.randomSampling()
    (X_train, Y_train) = RSObj.getRandomSample(X_train, Y_train)
    (X_train, Y_train) = runObj.randomShuffleData(X_train, Y_train)

    print('\n Size of X_train : {0}'.format(X_train.shape))
    print('\n Size of Y_train : {0}'.format(Y_train.shape))

    X_train.reset_index(drop=True, inplace=True)
    Y_train.reset_index(drop=True, inplace=True)

    ## Split data into train and test parts (ratio is decided by splitDataset)
    ((X_train_S, Y_train_S), (X_test_S, Y_test_S)) = runObj.splitDataset(X_train, Y_train)

    print('\n Size of X_train_S : {0}'.format(X_train_S.shape))
    print('\n Size of Y_train_S : {0}'.format(Y_train_S.shape))
    print('\n Size of X_test_S : {0}'.format(X_test_S.shape))
    print('\n Size of Y_test_S : {0}'.format(Y_test_S.shape))

    ## Run classifiers
    runObj.runClassifier(X_train_S, Y_train_S, X_test_S, Y_test_S)
def RFE_featureSelection(X_train, Y_train):
    """Pick feature columns with cross-validated recursive feature elimination.

    Fits ``RFECV`` (5 folds, scored by recall, one feature removed per step)
    around an L1-penalised ``LogisticRegression`` on a sample of the data,
    plots the cross-validation score against the number of features, and
    returns the column labels that ``rfecv.support_`` marks as kept.

    Args:
        X_train: feature table exposing ``columns`` and ``reset_index``.
        Y_train: labels matching ``X_train`` row for row.

    Returns:
        The entries of the sampled table's ``columns`` selected by RFECV.
    """
    ## Sampling
    RSObj = randomSampling.randomSampling()
    (X_train, Y_train) = RSObj.getRandomSample(X_train, Y_train)
    X_train.reset_index(drop=True, inplace=True)
    Y_train.reset_index(drop=True, inplace=True)

    ## Select classifier and parameters
    logistic = linear_model.LogisticRegression(
        C=10, class_weight=None, dual=False, fit_intercept=True,
        intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
        penalty='l1', random_state=None, solver='liblinear', tol=0.01,
        verbose=0, warm_start=False)

    ## Initialiaze RFE
    rfecv = RFECV(estimator=logistic, step=1, cv=5, scoring='recall')

    ## Fit data
    rfecv.fit(X_train, Y_train)

    ## Selected Features
    print("Optimal number of features : %d" % rfecv.n_features_)

    ## Plot importance
    # Newer scikit-learn releases dropped ``grid_scores_`` in favour of
    # ``cv_results_``; use whichever attribute the installed version offers.
    cv_scores = getattr(rfecv, 'grid_scores_', None)
    if cv_scores is None:
        cv_scores = rfecv.cv_results_['mean_test_score']

    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score")
    plt.plot(range(1, len(cv_scores) + 1), cv_scores)
    plt.show()

    selected_columns = X_train.columns[list(rfecv.support_)]
    # Format inside the print call: ``print(...).format(...)`` only works as a
    # Python 2 print statement and raises AttributeError on Python 3.
    print('\n Selectd Columns : {0}'.format(selected_columns))

    return selected_columns
def RFE_featureSelection(X_train, Y_train):
    """Choose feature columns via RFECV around a logistic regression.

    The data is first passed through ``randomSampling.getRandomSample`` and
    re-indexed. ``RFECV`` is then fitted with ``cv=5``, ``step=1`` and
    ``scoring='recall'``; the per-feature-count cross-validation scores are
    plotted and the selected column labels are printed and returned.

    Args:
        X_train: feature table exposing ``columns`` and ``reset_index``.
        Y_train: labels matching ``X_train`` row for row.

    Returns:
        The subset of the sampled table's ``columns`` flagged by
        ``rfecv.support_``.
    """
    ## Sampling
    RSObj = randomSampling.randomSampling()
    (X_train, Y_train) = RSObj.getRandomSample(X_train, Y_train)
    X_train.reset_index(drop=True, inplace=True)
    Y_train.reset_index(drop=True, inplace=True)

    ## Select classifier and parameters
    logistic = linear_model.LogisticRegression(
        C=10,
        class_weight=None,
        dual=False,
        fit_intercept=True,
        intercept_scaling=1,
        max_iter=100,
        multi_class='ovr',
        n_jobs=1,
        penalty='l1',
        random_state=None,
        solver='liblinear',
        tol=0.01,
        verbose=0,
        warm_start=False)

    ## Initialiaze RFE
    rfecv = RFECV(estimator=logistic, step=1, cv=5, scoring='recall')

    ## Fit data
    rfecv.fit(X_train, Y_train)

    ## Selected Features
    print("Optimal number of features : %d" % rfecv.n_features_)

    ## Plot importance
    # ``grid_scores_`` no longer exists in newer scikit-learn releases; fall
    # back to the mean test scores stored in ``cv_results_`` when it is absent.
    if hasattr(rfecv, 'grid_scores_'):
        cv_scores = rfecv.grid_scores_
    else:
        cv_scores = rfecv.cv_results_['mean_test_score']

    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score")
    plt.plot(range(1, len(cv_scores) + 1), cv_scores)
    plt.show()

    # Compute the selection once and reuse it for both the log line and the
    # return value.
    selected_columns = X_train.columns[list(rfecv.support_)]
    # ``print(...).format(...)`` only works as a Python 2 print statement; the
    # format call has to happen on the string, inside the parentheses.
    print('\n Selectd Columns : {0}'.format(selected_columns))
    return selected_columns
def Boosting_featureSelection(X_train, Y_train):
    """Rank features by the importances of an ``ExtraTreesClassifier``.

    Fits a 100-tree ``ExtraTreesClassifier`` on a sample of the data, prints
    and plots the features in decreasing order of importance (error bars are
    the standard deviation across the individual trees), and returns the
    importances keyed by column name.

    Args:
        X_train: feature table exposing ``columns``, ``shape`` and
            ``reset_index``.
        Y_train: labels matching ``X_train`` row for row.

    Returns:
        dict mapping each column name to its importance in the fitted forest.
    """
    ## Sampling
    RSObj = randomSampling.randomSampling()
    (X_train, Y_train) = RSObj.getRandomSample(X_train, Y_train)
    X_train.reset_index(drop=True, inplace=True)
    Y_train.reset_index(drop=True, inplace=True)

    ## Build a forest and compute the feature importances
    forest = ExtraTreesClassifier(n_estimators=100)

    ## Fit Forest
    forest.fit(X_train, Y_train)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    # Feature positions ordered from most to least important.
    indices = np.argsort(importances)[::-1]

    cols = list(X_train.columns)
    ranked_cols = [cols[i] for i in indices]
    ranked_importances = importances[indices]

    ## Print the feature ranking
    print("Feature ranking:")
    for rank, (col, importance) in enumerate(
            zip(ranked_cols, ranked_importances), start=1):
        print("%d. feature %s (%f)" % (rank, col, importance))

    ## Plot the feature importances of the forest
    n_features = X_train.shape[1]
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(n_features), ranked_importances,
            color="r", yerr=std[indices], align="center")
    plt.xticks(range(n_features), ranked_cols)
    plt.xlim([-1, n_features])
    plt.show()

    ## Generate dictionary of column importance
    # Pair the ranked names with the equally ranked importances. The original
    # zipped ranked names against the unranked array, so every column was
    # assigned some other column's score.
    return dict(zip(ranked_cols, ranked_importances))
def Boosting_featureSelection(X_train, Y_train):
    """Compute per-column feature importances with an extra-trees forest.

    The data is passed through ``randomSampling.getRandomSample`` and
    re-indexed, a 100-estimator ``ExtraTreesClassifier`` is fitted, and the
    ranking is printed and drawn as a bar chart before being returned.

    Args:
        X_train: feature table exposing ``columns``, ``shape`` and
            ``reset_index``.
        Y_train: labels matching ``X_train`` row for row.

    Returns:
        dict of ``{column name: importance}``, inserted from most to least
        important.
    """
    ## Sampling
    RSObj = randomSampling.randomSampling()
    (X_train, Y_train) = RSObj.getRandomSample(X_train, Y_train)
    X_train.reset_index(drop=True, inplace=True)
    Y_train.reset_index(drop=True, inplace=True)

    ## Build a forest and compute the feature importances
    forest = ExtraTreesClassifier(n_estimators=100)

    ## Fit Forest
    forest.fit(X_train, Y_train)
    importances = forest.feature_importances_
    # Spread of each feature's importance across the individual trees; used
    # as the error bar in the plot below.
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    ## Print the feature ranking
    print("Feature ranking:")
    cols = list(X_train.columns)
    for position, feature_idx in enumerate(indices):
        print("%d. feature %s (%f)" % (position + 1, cols[feature_idx],
                                       importances[feature_idx]))

    ## Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(X_train.shape[1]), importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(range(X_train.shape[1]), [cols[i] for i in indices])
    plt.xlim([-1, X_train.shape[1]])
    plt.show()

    ## Generate dictionary of column importance
    # Look the name and the score up through the same index so they stay
    # paired. The original zipped re-ordered names with the un-ordered
    # importance array, which attached the wrong score to every column.
    dictionary = dict((cols[i], importances[i]) for i in indices)
    return dictionary
def BoostingWithSampling():
    """Run the classifiers on columns whose forest importance exceeds 0.005.

    Loads the data with ``loadData``, samples and shuffles the training part,
    obtains a ``{column: importance}`` mapping from
    ``AF.Boosting_featureSelection``, keeps the columns above the threshold,
    splits the reduced data with ``runObj.splitDataset`` and passes the
    pieces to ``runObj.runClassifier``.
    """
    runObj = run_util.run()

    ## Load data
    ((X_train, Y_train), (X_test)) = loadData(runObj)

    ## Sample train data
    RSObj = RS.randomSampling()
    (X_train, Y_train) = RSObj.getRandomSample(X_train, Y_train)
    (X_train, Y_train) = runObj.randomShuffleData(X_train, Y_train)

    # Format inside the print call: ``print(...).format(...)`` only works as a
    # Python 2 print statement and raises AttributeError on Python 3.
    print('\n Size of X_train : {0}'.format(X_train.shape))
    print('\n Size of Y_train : {0}'.format(Y_train.shape))

    X_train.reset_index(drop=True, inplace=True)
    Y_train.reset_index(drop=True, inplace=True)

    ## Boosting Feature Selection
    print('\n Boosting Feature Selection starts...')
    selected_columns_dict = AF.Boosting_featureSelection(X_train, Y_train)
    print('\n Boosting Feature Selection ends...')

    # ``items()`` exists on both Python 2 and 3 dicts; ``iteritems()`` is
    # Python 2 only.
    selected_columns = [col for col, imp in selected_columns_dict.items()
                        if imp > 0.005]
    print('\n Selected Cols : {0}'.format(len(selected_columns)))

    X_train = X_train[selected_columns]
    # Project the test set onto the same columns. The original indexed
    # X_train here, which silently replaced the test set with training rows.
    # NOTE(review): assumes X_test carries the same column labels as X_train.
    X_test = X_test[selected_columns]

    ## Split data into train and test parts (ratio is decided by splitDataset)
    ((X_train_S, Y_train_S), (X_test_S, Y_test_S)) = runObj.splitDataset(X_train, Y_train)

    print('\n Size of X_train_S : {0}'.format(X_train_S.shape))
    print('\n Size of Y_train_S : {0}'.format(Y_train_S.shape))
    print('\n Size of X_test_S : {0}'.format(X_test_S.shape))
    print('\n Size of Y_test_S : {0}'.format(Y_test_S.shape))

    ## Run classifiers
    runObj.runClassifier(X_train_S, Y_train_S, X_test_S, Y_test_S)
def handpickWithSampling():
    """Run the classifiers on sampled data with the columns as loaded.

    Loads the data with ``loadData``, samples and shuffles the training part,
    splits it with ``runObj.splitDataset`` and passes the pieces to
    ``runObj.runClassifier``. No feature selection happens in this function.
    """
    runObj = run_util.run()

    ## Load data
    ((X_train, Y_train), (X_test)) = loadData(runObj)

    ## Sample train data
    RSObj = RS.randomSampling()
    (X_train, Y_train) = RSObj.getRandomSample(X_train, Y_train)
    (X_train, Y_train) = runObj.randomShuffleData(X_train, Y_train)

    # Format inside the print call: ``print(...).format(...)`` only works as a
    # Python 2 print statement and raises AttributeError on Python 3.
    print('\n Size of X_train : {0}'.format(X_train.shape))
    print('\n Size of Y_train : {0}'.format(Y_train.shape))

    X_train.reset_index(drop=True, inplace=True)
    Y_train.reset_index(drop=True, inplace=True)

    ## Split data into train and test parts (ratio is decided by splitDataset)
    ((X_train_S, Y_train_S), (X_test_S, Y_test_S)) = runObj.splitDataset(X_train, Y_train)

    print('\n Size of X_train_S : {0}'.format(X_train_S.shape))
    print('\n Size of Y_train_S : {0}'.format(Y_train_S.shape))
    print('\n Size of X_test_S : {0}'.format(X_test_S.shape))
    print('\n Size of Y_test_S : {0}'.format(Y_test_S.shape))

    ## Run classifiers
    runObj.runClassifier(X_train_S, Y_train_S, X_test_S, Y_test_S)
print('\n predictions : {0}').format(predictions.shape[0]) for i in range(RefId.shape[0]): data.append([RefId[i],predictions[i]]) print('\n Data : \n {0}').format(data) with open(fileName, 'wb') as fp: a = csv.writer(fp, delimiter=',') a.writerows(data) if __name__=="__main__": fileName='updatedTraining.csv' dataLoadObj=dataLoad() Traindataset=dataLoadObj.loadData(fileName,'train') ############ Random Sampling ################### RSObj=RS.randomSampling() Traindataset=RSObj.getRandomSample(Traindataset) print('\n Train : {0}').format(Traindataset.shape) ''' fileName='updatetest.csv' testDataset=dataLoadObj.loadData(fileName,'test') testDataset=testDataset[0:-6] print('\n Test : {0}').format(testDataset.shape) dataLoadObj.runClassifiers(Traindataset,testDataset) ''' print('\n Cross Validation : ') randindices = np.random.randint(0,Traindataset.shape[0],10000) Traindataset=Traindataset[randindices,:] Kobj=Kfold.Kfold(No_of_Folds=10) kf=Kobj.getFoldIndices(Traindataset)