def plot_feature_importances_this(X_train, Y_repeater, Y_numRepeats, Y_quantiles, num_features):
    """Plot feature importances for the three targets and return the top fields of each.

    @param X_train: training feature DataFrame (its ``.columns`` are used as labels)
    @param Y_repeater: target vector for the 'Repeater' plot
    @param Y_numRepeats: target vector for the 'Number of Repeats' plot
    @param Y_quantiles: target vector for the 'Quantiles of Number of Repeats' plot
    @param num_features: number of top features for [repeater, numRepeats, quantiles]
    @return: tuple (fields_repeater, fields_numRepeats, fields_quantiles)
    """
    # Hoist the shared conversion/labels out of the three calls
    # (the original recomputed np.array(X_train) once per call).
    x = np.array(X_train)
    labels = X_train.columns

    # One (target, top-feature count, plot title) triple per model.
    specs = [
        (Y_repeater, num_features[0], 'Repeater'),
        (Y_numRepeats, num_features[1], 'Number of Repeats'),
        (Y_quantiles, num_features[2], 'Quantiles of Number of Repeats'),
    ]

    # Keep element [1] of each result, exactly as the original did
    # (presumably the list of top field names -- plot_feature_importances is project-local).
    fields = [
        plot_feature_importances(x, y, labels=labels, numTopFeatures=n, title=title)[1]
        for y, n, title in specs
    ]
    return tuple(fields)
if __name__ == '__main__':
    # Script entry point: load and impute the Fire training data, plot feature
    # importances, then set up the estimators and column subsets used further down.
    # NOTE(review): process_data, fieldNamesToInd and the FIELDS_* constants are
    # project-local -- their exact semantics are not visible from this chunk.
    x_train, y_train, _, columns_train, weights, y_class = \
        process_data('/home/jj/code/Kaggle/Fire/Data/train.csv',
                     impute=True,
                     imputeDataDir='/home/jj/code/Kaggle/Fire/intermediateOutput',
                     imputeStrategy='median',
                     fieldsToUse=['var11', 'var8', 'var13'])
                     # fieldsToUse=FIELDS_CLASS_GBC_TOP100[:30])
                     # fieldsToUse=FIELDS_CDF_CORR_TOP99[:19])

    # y_cdfs = np.array(pandas.read_csv('/home/jj/code/Kaggle/Fire/Data/y_pcdfs.csv')).reshape(NUM_TRAIN_SAMPLES,)[:len(y_train)]    # in case smallTrain is used
    # clf = GradientBoostingRegressor(loss='quantile', learning_rate=0.02, n_estimators=100, subsample=0.9)
    # clf = LogisticRegression()

    # Visualize the top-3 most important features against the raw target.
    plot_feature_importances(x_train, np.array(y_train), columns_train, numTopFeatures=3, numEstimators=50)

    # NOTE(review): SVR is a regressor despite being bound to 'classifier' -- confirm intent.
    classifier = SVR(kernel='rbf')
    regressor = Ridge(alpha=1)

    # Map the project-level field-name lists onto column indices of this dataset.
    classFields = fieldNamesToInd(columns_train, FIELDS_CLASS_GBC_TOP100[:20])
    regFields = fieldNamesToInd(columns_train, FIELDS_CORR_ORDERED_TOP99[:20])

    # clf = GroupThenRegress(list(columns_train).index('var8'),
    #                        Ridge(alpha=1, normalize=False),
    #                        verbose=1)
    # clf = SVR()

    # ================== CORRELATION ==================
    # print '================== CORRELATION =================='
    # print x_train.shape