import timeit

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# split_data and get_abspath are project helper functions (defined elsewhere
# in this repo) for train/test splitting and path resolution


def create_timing_curve(estimator, dataset, data_name, clf_name):
    """Generates a timing curve for the specified estimator, saves the
    tabular results to CSV and saves a plot of the timing curve.

    Args:
        estimator (object): Target classifier.
        dataset (pandas.DataFrame): Source data set.
        data_name (str): Name of data set being tested.
        clf_name (str): Type of algorithm.

    """
    # set training sizes and intervals
    train_sizes = np.arange(0.1, 1.0, 0.05)

    # initialise variables
    train_time = []
    predict_time = []
    df_final = []

    # iterate through training sizes and capture training and predict times
    for i, train_data in enumerate(train_sizes):
        X_train, X_test, y_train, y_test = split_data(
            dataset, test_size=1 - train_data)
        start_train = timeit.default_timer()
        estimator.fit(X_train, y_train)
        end_train = timeit.default_timer()
        estimator.predict(X_test)
        end_predict = timeit.default_timer()
        train_time.append(end_train - start_train)
        predict_time.append(end_predict - end_train)
        df_final.append([train_data, train_time[i], predict_time[i]])

    # save timing results to CSV
    timedata = pd.DataFrame(
        data=df_final,
        columns=['Training Data Percentage', 'Train Time', 'Test Time'],
    )
    resdir = 'results'
    res_tgt = '{}/{}'.format(resdir, clf_name)
    timefile = get_abspath('{}_timing_curve.csv'.format(data_name), res_tgt)
    timedata.to_csv(timefile, index=False)

    # generate timing curve plot
    plt.figure(2)
    plt.plot(train_sizes, train_time, marker='.', color='b', label='Train')
    plt.plot(train_sizes, predict_time, marker='.', color='g', label='Predict')
    plt.legend(loc='best')
    plt.grid(linestyle='dotted')
    plt.xlabel('Samples used for training as a percentage of total')
    plt.ylabel('Elapsed user time in seconds')

    # save timing curve plot as PNG
    plotdir = 'plots'
    plt.title('Timing Curve with {} on {}'.format(clf_name, data_name))
    plot_tgt = '{}/{}'.format(plotdir, clf_name)
    plotpath = get_abspath('{}_TC.png'.format(data_name), plot_tgt)
    plt.savefig(plotpath)
    plt.close()
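# A minimal usage sketch (illustrative assumptions: DecisionTreeClassifier as
# the estimator and the abalone CSV as the data set; any estimator exposing
# fit()/predict() and any DataFrame accepted by split_data() can be passed in
# the same way):
#
#     from sklearn.tree import DecisionTreeClassifier
#
#     abalone = pd.read_csv(get_abspath('abalone.csv', 'data/experiments'))
#     dt = DecisionTreeClassifier(max_depth=10)
#     create_timing_curve(dt, abalone, data_name='abalone', clf_name='DT')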
# estimators with iteration param
iterators = {'Boosting': 'ADA__n_estimators', 'ANN': 'MLP__max_iter'}

# validation curve parameter names and ranges
vc_params = {
    'DT': ('DT__max_depth', np.arange(1, 50, 1)),
    'KNN': ('KNN__n_neighbors', np.arange(1, 50, 1)),
    'ANN': ('MLP__hidden_layer_sizes', np.arange(1, 50, 5)),
    'SVM_RBF': ('SVMR__gamma', np.logspace(-9, -1, 15)),
    'SVM_PLY': ('SVMP__gamma', np.logspace(-9, -1, 20)),
    'Boosting': ('ADA__n_estimators', np.arange(20, 200, 10)),
}

# start model evaluation loop (dnames, mnames, dfs and estimators are
# populated earlier in this script)
for df in dnames:
    X_train, X_test, y_train, y_test = split_data(dfs[df])

    # load pickled models into estimators dict (a forward slash keeps the
    # pickle path portable across platforms)
    for m in mnames:
        mfile = '{}/{}_grid.pkl'.format(m, df)
        model = load_pickled_model(get_abspath(mfile, filepath='models'))
        estimators[m] = model

    log_cols2 = [
        'Train Set Accuracy', 'Cross-Validation Score', 'Test Set Accuracy'
    ]
    log2 = pd.DataFrame(columns=log_cols2)

    # generate validation, learning, and timing curves
    for name, estimator in estimators.items():  # items() for Python 3 compat
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import os
import pandas as pd
import seaborn as sns

# Importing the dataset
p_abalone = get_abspath('abalone.csv', 'data/experiments')
p_abalone2 = get_abspath('abalone-2.csv', 'data/experiments')
p_seismic = get_abspath('seismic-bumps.csv', 'data/experiments')
df_abalone = pd.read_csv(p_abalone)
df_abalone2 = pd.read_csv(p_abalone2)
df_seismic = pd.read_csv(p_seismic)

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = split_data(df_abalone2)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting Decision Tree Classification to the Training set
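# A minimal sketch of the fitting and prediction steps, assuming
# scikit-learn's DecisionTreeClassifier; the criterion and random_state
# values below are illustrative assumptions, not values taken from the
# original experiments.
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)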