def main():
    """Plot the cross-validation score of a random forest per feature count.

    Loads the 'TIMESNOW' data with ``get_small_data``, runs
    ``do_PCA_and_cv`` with a 10-tree ``RandomForestClassifier`` for the
    feature counts 10, 20, ..., 200 and 228, prints the elapsed time and
    shows a plot of the scores together with dashed score +/- std curves.
    """
    X, y = get_small_data('TIMESNOW')

    # Candidate values of N: multiples of 10 up to 200, then 228.
    n_features = [i * 10 for i in range(1, 21)]
    n_features.append(228)

    clf = RandomForestClassifier(n_estimators=10)

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    t0 = time.perf_counter()
    result = do_PCA_and_cv(clf, X.toarray(), y, n_features)
    testTime = time.perf_counter() - t0
    print('Total time: ' + str(testTime))

    scores = np.array(result[0])
    scores_std = np.array(result[1])

    plt.clf()
    plt.plot(n_features, scores)
    plt.plot(n_features, scores + scores_std, 'b--')
    plt.plot(n_features, scores - scores_std, 'b--')
    # Re-label the y ticks in compact "%g" notation.
    locs, labels = plt.yticks()
    plt.yticks(locs, ["%g" % loc for loc in locs])
    plt.ylabel('CV score')
    plt.xlabel('Parameter N')
    plt.show()
def main():
    """Run the PCA cross-validation sweep on 'TIMESNOW' and show the plot.

    The data comes from ``get_small_data('TIMESNOW')``. A
    ``RandomForestClassifier`` with 10 estimators is passed to
    ``do_PCA_and_cv`` together with the feature counts 10, 20, ..., 200
    and 228. The elapsed time is printed, and the returned scores are
    plotted with dashed curves one ``scores_std`` above and below.
    """
    X, y = get_small_data('TIMESNOW')

    # Sweep N over the multiples of 10 up to 200, plus 228.
    n_features = list(range(10, 201, 10))
    n_features.append(228)

    clf = RandomForestClassifier(n_estimators=10)

    # time.clock() no longer exists since Python 3.8, so elapsed time is
    # measured with time.perf_counter() instead.
    started = time.perf_counter()
    result = do_PCA_and_cv(clf, X.toarray(), y, n_features)
    testTime = time.perf_counter() - started
    print('Total time: ' + str(testTime))

    scores = result[0]
    scores_std = result[1]
    upper = np.array(scores) + np.array(scores_std)
    lower = np.array(scores) - np.array(scores_std)

    plt.clf()
    plt.plot(n_features, scores)
    plt.plot(n_features, upper, 'b--')
    plt.plot(n_features, lower, 'b--')
    # Keep the tick positions but format their labels with "%g".
    locs, labels = plt.yticks()
    plt.yticks(locs, ["%g" % loc for loc in locs])
    plt.ylabel('CV score')
    plt.xlabel('Parameter N')
    plt.show()
def test_method(clf, name):
    """Evaluate ``clf`` with random-forest feature selection on every channel.

    For each of the five channels the data is loaded with
    ``get_small_data``, a 60-tree ``RandomForestClassifier`` is fitted to
    obtain feature importances, and ``do_randfor_selection`` is run for
    the feature counts 10, 20, ..., 200 plus the full feature count
    ``X.shape[1]``. Results are appended to ``RFS-<name>-<channel>.log``
    and two figures are saved: ``<name>_<channel>_r_selection.png``
    (scores) and ``<name>_<channel>_r_selection_time.png`` (times).

    Args:
        clf: Classifier forwarded to ``do_randfor_selection``.
        name: Label used in console output and in the output file names.
    """
    channels = ('NDTV', 'TIMESNOW', 'CNNIBN', 'CNN', 'BBC')

    for channel in channels:
        print('\nProcessing channel {} with method {}'.format(channel, name))
        X, y = get_small_data(channel)

        # Multiples of 10 up to 200, then every available feature.
        n_features = [i * 10 for i in range(1, 21)]
        n_features.append(X.shape[1])

        selector = RandomForestClassifier(n_estimators=60)
        selector.fit(X, y)

        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring elapsed time.
        t0 = time.perf_counter()
        result = do_randfor_selection(clf, X.toarray(), y, n_features,
                                      selector.feature_importances_)
        testTime = time.perf_counter() - t0
        print('Total time: ' + str(testTime))

        scores = result[0]
        scores_std = result[1]
        times = result[2]

        # Append mode keeps results from earlier runs; the context manager
        # guarantees the file is closed even if a write fails.
        with open('RFS-{}-{}.log'.format(name, channel), 'at') as f:
            for i, n in enumerate(n_features):
                f.write('{}; {}; {}; {}\n'.format(n, scores[i],
                                                  scores_std[i], times[i]))

        # Score curve with dashed score +/- std curves.
        plt.clf()
        plt.plot(n_features, scores)
        plt.plot(n_features, np.array(scores) + np.array(scores_std), 'b--')
        plt.plot(n_features, np.array(scores) - np.array(scores_std), 'b--')
        locs, labels = plt.yticks()
        plt.yticks(locs, ["%g" % loc for loc in locs])
        plt.ylabel('CV score')
        plt.xlabel('Parameter N')
        plt.savefig('{}_{}_r_selection.png'.format(name, channel))

        # Timing curve.
        plt.clf()
        plt.plot(n_features, times)
        locs, labels = plt.yticks()
        plt.yticks(locs, ["%g" % loc for loc in locs])
        plt.ylabel('Mean learning time')
        plt.xlabel('Parameter N')
        plt.savefig('{}_{}_r_selection_time.png'.format(name, channel))
def main():
    """Command-line entry point: optimize one method on one channel.

    Expects ``sys.argv[1]`` to be a method name (knn, lda, svm, randfor,
    gtb) and ``sys.argv[2]`` to be a channel name. The channel name is
    upper-cased before being passed to ``get_small_data``; the selected
    optimizer is instantiated, run with ``optimize(X, y)`` and asked to
    ``log_results(channel_name)``.

    Raises:
        SystemExit: With status 2 when too few arguments are given or the
            method name is not one of the supported methods.
    """
    usage = (
        'Invalid params. USAGE: param_optimization.py <method> <channel>',
        'Available methods:\n\tknn, lda, svm, randfor, gtb',
        'Available channels:\n\tcnn, bbc, ndtv, timesnow, cnnibn',
    )

    if len(sys.argv) < 3:
        for line in usage:
            print(line)
        # Non-zero status so that shells and scripts can detect the failure.
        sys.exit(2)

    method_name, channel_name = sys.argv[1], sys.argv[2]
    optimizers = {
        'knn': KNNOptimizer,
        'lda': LDAOptimizer,
        'svm': SVMOptimizer,
        'randfor': RandForestOptimizer,
        'gtb': GTBOptimizer,
    }

    # Reject unknown methods with the usage text instead of a raw KeyError,
    # and do so before the data set is loaded.
    if method_name not in optimizers:
        for line in usage:
            print(line)
        sys.exit(2)

    X, y = get_small_data(channel_name.upper())
    optimizer = optimizers[method_name]()
    optimizer.optimize(X, y)
    optimizer.log_results(channel_name)
def test_method(clf, name):
    """Evaluate ``clf`` through ``do_PCA_and_cv`` on every channel.

    For each of the five channels the data is loaded with
    ``get_small_data`` and ``do_PCA_and_cv`` is run for the feature counts
    10, 20, ..., 200 and 227. All results are written to
    ``PCA_results_<name>.txt`` (overwritten on each call) and two figures
    per channel are saved: ``<name>_<channel>_PCA.png`` (scores) and
    ``<name>_<channel>_PCA_time.png`` (times).

    Args:
        clf: Classifier forwarded to ``do_PCA_and_cv``.
        name: Label used in the output file names.
    """
    channels = ('NDTV', 'TIMESNOW', 'CNNIBN', 'CNN', 'BBC')

    # The context manager closes the results file even when processing a
    # channel raises part-way through the loop.
    with open('PCA_results_{}.txt'.format(name), 'w') as f:
        for channel in channels:
            print('\nProcessing channel {}'.format(channel))
            f.write('\nProcessing channel {}\n'.format(channel))
            X, y = get_small_data(channel)

            # Multiples of 10 up to 200, then 227.
            n_features = [i * 10 for i in range(1, 21)]
            n_features.append(227)

            # time.clock() was removed in Python 3.8; perf_counter() is
            # the documented replacement for measuring elapsed time.
            t0 = time.perf_counter()
            result = do_PCA_and_cv(clf, X.toarray(), y, n_features)
            testTime = time.perf_counter() - t0
            print('Total time: ' + str(testTime))

            scores = result[0]
            scores_std = result[1]
            times = result[2]

            f.write('N_features; mean_score; std error; mean training time\n')
            for i, n in enumerate(n_features):
                f.write('{}; {}; {}; {}\n'.format(n, scores[i],
                                                  scores_std[i], times[i]))

            # Score curve with dashed score +/- std curves.
            plt.clf()
            plt.plot(n_features, scores)
            plt.plot(n_features,
                     np.array(scores) + np.array(scores_std), 'b--')
            plt.plot(n_features,
                     np.array(scores) - np.array(scores_std), 'b--')
            locs, labels = plt.yticks()
            plt.yticks(locs, ["%g" % loc for loc in locs])
            plt.ylabel('CV score')
            plt.xlabel('Parameter N')
            plt.savefig('{}_{}_PCA.png'.format(name, channel))

            # Timing curve.
            plt.clf()
            plt.plot(n_features, times)
            locs, labels = plt.yticks()
            plt.yticks(locs, ["%g" % loc for loc in locs])
            plt.ylabel('Mean learning time')
            plt.xlabel('Parameter N')
            plt.savefig('{}_{}_PCA_time.png'.format(name, channel))
def main():
    """Run parameter optimization for the method and channel given in argv.

    ``sys.argv[1]`` selects the optimizer class (knn, lda, svm, randfor or
    gtb) and ``sys.argv[2]`` names the channel, which is upper-cased for
    ``get_small_data``. The optimizer is created with no arguments, run
    via ``optimize(X, y)`` and then ``log_results(channel_name)`` is
    called with the channel name exactly as typed.

    Raises:
        SystemExit: With status 2 if fewer than two arguments were passed
            or the requested method is not supported.
    """
    def exit_with_usage():
        # Print the help text and stop with a non-zero status so callers
        # can tell the run did not happen.
        print(
            'Invalid params. USAGE: param_optimization.py <method> <channel>')
        print('Available methods:\n\tknn, lda, svm, randfor, gtb')
        print('Available channels:\n\tcnn, bbc, ndtv, timesnow, cnnibn')
        sys.exit(2)

    if len(sys.argv) < 3:
        exit_with_usage()

    method_name, channel_name = sys.argv[1], sys.argv[2]
    optimizers = {
        'knn': KNNOptimizer,
        'lda': LDAOptimizer,
        'svm': SVMOptimizer,
        'randfor': RandForestOptimizer,
        'gtb': GTBOptimizer,
    }

    # An unsupported method used to surface as a bare KeyError after the
    # data had already been loaded; fail early with the usage text instead.
    optimizer_cls = optimizers.get(method_name)
    if optimizer_cls is None:
        exit_with_usage()

    X, y = get_small_data(channel_name.upper())
    optimizer = optimizer_cls()
    optimizer.optimize(X, y)
    optimizer.log_results(channel_name)