def main():
    """Plot the CV score of a random forest against the PCA parameter N.

    Loads the 'TIMESNOW' data with ``get_small_data``, calls
    ``do_PCA_and_cv`` with a 10-tree ``RandomForestClassifier`` for
    N = 10, 20, ..., 200 and 228, prints the elapsed time, and shows a plot
    of ``result[0]`` with dashed ``result[0] +/- result[1]`` bands.
    """
    X, y = get_small_data('TIMESNOW')

    # Values of N to evaluate: 10, 20, ..., 200, then 228.
    # NOTE(review): 228 looks like the full feature count of X -- confirm.
    n_features = list(range(10, 201, 10))
    n_features.append(228)

    clf = RandomForestClassifier(n_estimators=10)
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for timing an interval.
    t0 = time.perf_counter()
    result = do_PCA_and_cv(clf, X.toarray(), y, n_features)
    testTime = time.perf_counter() - t0
    print('Total time: ' + str(testTime))
    scores = np.array(result[0])
    scores_std = np.array(result[1])

    plt.clf()
    plt.plot(n_features, scores)
    # Dashed lines mark one std above and below the score curve.
    plt.plot(n_features, scores + scores_std, 'b--')
    plt.plot(n_features, scores - scores_std, 'b--')
    locs, labels = plt.yticks()
    # Re-label the y ticks using compact "%g" formatting.
    plt.yticks(locs, ["%g" % loc for loc in locs])
    plt.ylabel('CV score')
    plt.xlabel('Parameter N')
    plt.show()
# Example #2
# 0
def main():
    """Plot the CV score of a random forest against the PCA parameter N.

    Loads the 'TIMESNOW' data with ``get_small_data``, calls
    ``do_PCA_and_cv`` with a 10-tree ``RandomForestClassifier`` for
    N = 10, 20, ..., 200 and 228, prints the elapsed time, and shows a plot
    of ``result[0]`` with dashed ``result[0] +/- result[1]`` bands.
    """
    X, y = get_small_data('TIMESNOW')

    # Values of N to evaluate: 10, 20, ..., 200, then 228.
    # NOTE(review): 228 looks like the full feature count of X -- confirm.
    n_features = list(range(10, 201, 10))
    n_features.append(228)

    clf = RandomForestClassifier(n_estimators=10)
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for timing an interval.
    t0 = time.perf_counter()
    result = do_PCA_and_cv(clf, X.toarray(), y, n_features)
    testTime = time.perf_counter() - t0
    print('Total time: ' + str(testTime))
    scores = np.array(result[0])
    scores_std = np.array(result[1])

    plt.clf()
    plt.plot(n_features, scores)
    # Dashed lines mark one std above and below the score curve.
    plt.plot(n_features, scores + scores_std, 'b--')
    plt.plot(n_features, scores - scores_std, 'b--')
    locs, labels = plt.yticks()
    # Re-label the y ticks using compact "%g" formatting.
    plt.yticks(locs, ["%g" % loc for loc in locs])
    plt.ylabel('CV score')
    plt.xlabel('Parameter N')
    plt.show()
# Example #3
# 0
def test_method(clf, name):
    """Run random-forest-based feature selection for ``clf`` on every channel.

    For each channel, a 60-tree ``RandomForestClassifier`` is fitted to get
    feature importances, which are passed to ``do_randfor_selection``
    together with N = 10, 20, ..., 200 and the full column count of ``X``.
    The per-N results are appended to ``RFS-<name>-<channel>.log`` and two
    figures (score and mean learning time versus N) are saved as PNG files.

    Args:
        clf: Classifier handed to ``do_randfor_selection``.
        name: Label for ``clf``; used in progress messages and file names.
    """
    channels = ('NDTV', 'TIMESNOW', 'CNNIBN', 'CNN', 'BBC')

    for channel in channels:
        print('\nProcessing channel {} with method {}'.format(channel, name))
        X, y = get_small_data(channel)

        # Values of N to evaluate: 10, 20, ..., 200, then every column of X.
        n_features = list(range(10, 201, 10))
        n_features.append(X.shape[1])

        selector = RandomForestClassifier(n_estimators=60)
        selector.fit(X, y)

        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for timing an interval.
        t0 = time.perf_counter()
        result = do_randfor_selection(clf, X.toarray(), y, n_features,
                                      selector.feature_importances_)
        testTime = time.perf_counter() - t0
        print('Total time: ' + str(testTime))
        scores = result[0]
        scores_std = result[1]
        times = result[2]

        # Append mode keeps rows from earlier runs. The context manager
        # closes the file even if a write fails.
        # Row layout: N_features; mean_score; std error; mean training time
        with open('RFS-{}-{}.log'.format(name, channel), 'at') as f:
            for row in zip(n_features, scores, scores_std, times):
                f.write('{}; {}; {}; {}\n'.format(*row))

        score_arr = np.array(scores)
        std_arr = np.array(scores_std)

        plt.clf()
        plt.plot(n_features, scores)
        # Dashed lines mark one std above and below the score curve.
        plt.plot(n_features, score_arr + std_arr, 'b--')
        plt.plot(n_features, score_arr - std_arr, 'b--')
        locs, labels = plt.yticks()
        plt.yticks(locs, ["%g" % loc for loc in locs])
        plt.ylabel('CV score')
        plt.xlabel('Parameter N')
        plt.savefig('{}_{}_r_selection.png'.format(name, channel))

        plt.clf()
        plt.plot(n_features, times)
        locs, labels = plt.yticks()
        plt.yticks(locs, ["%g" % loc for loc in locs])
        plt.ylabel('Mean learning time')
        plt.xlabel('Parameter N')
        plt.savefig('{}_{}_r_selection_time.png'.format(name, channel))
def main():
    """Command-line entry point: optimize one method on one channel.

    Reads the method key from ``sys.argv[1]`` and the channel name from
    ``sys.argv[2]`` (upper-cased before being passed to ``get_small_data``),
    then runs the matching optimizer and logs its results.

    Prints the usage text and exits with status 1 when arguments are missing
    or the method key is unknown.
    """
    def usage_exit():
        # A usage error must give a non-zero status so calling scripts can
        # detect it; a bare sys.exit() would report success.
        print('Invalid params. USAGE: param_optimization.py <method> <channel>')
        print('Available methods:\n\tknn, lda, svm, randfor, gtb')
        print('Available channels:\n\tcnn, bbc, ndtv, timesnow, cnnibn')
        sys.exit(1)

    if len(sys.argv) < 3:
        usage_exit()

    method_name, channel_name = sys.argv[1], sys.argv[2]

    optimizers = {
        'knn': KNNOptimizer,
        'lda': LDAOptimizer,
        'svm': SVMOptimizer,
        'randfor': RandForestOptimizer,
        'gtb': GTBOptimizer,
    }
    # Reject an unknown method before loading data, rather than failing
    # later with a bare KeyError.
    if method_name not in optimizers:
        usage_exit()

    X, y = get_small_data(channel_name.upper())
    optimizer = optimizers[method_name]()
    optimizer.optimize(X, y)
    optimizer.log_results(channel_name)
def test_method(clf, name):
    """Run the PCA sweep for ``clf`` on every channel and record the results.

    For each channel, ``do_PCA_and_cv`` is called with N = 10, 20, ..., 200
    and 227. The per-N results are written to ``PCA_results_<name>.txt``
    (overwritten on each call) and two figures (score and mean learning time
    versus N) are saved as PNG files.

    Args:
        clf: Classifier handed to ``do_PCA_and_cv``.
        name: Label for ``clf``; used in the output file names.
    """
    channels = ('NDTV', 'TIMESNOW', 'CNNIBN', 'CNN', 'BBC')

    # The context manager closes the results file even if a channel fails
    # part-way through the loop.
    with open('PCA_results_{}.txt'.format(name), 'w') as f:
        for channel in channels:
            print('\nProcessing channel {}'.format(channel))
            f.write('\nProcessing channel {}\n'.format(channel))
            X, y = get_small_data(channel)

            # Values of N to evaluate: 10, 20, ..., 200, then 227.
            # NOTE(review): 227 is hard-coded; confirm it suits every channel.
            n_features = list(range(10, 201, 10))
            n_features.append(227)

            # time.clock() was removed in Python 3.8; perf_counter() is the
            # documented replacement for timing an interval.
            t0 = time.perf_counter()
            result = do_PCA_and_cv(clf, X.toarray(), y, n_features)
            testTime = time.perf_counter() - t0
            print('Total time: ' + str(testTime))
            scores = result[0]
            scores_std = result[1]
            times = result[2]

            f.write('N_features; mean_score; std error; mean training time\n')
            for row in zip(n_features, scores, scores_std, times):
                f.write('{}; {}; {}; {}\n'.format(*row))

            score_arr = np.array(scores)
            std_arr = np.array(scores_std)

            plt.clf()
            plt.plot(n_features, scores)
            # Dashed lines mark one std above and below the score curve.
            plt.plot(n_features, score_arr + std_arr, 'b--')
            plt.plot(n_features, score_arr - std_arr, 'b--')
            locs, labels = plt.yticks()
            plt.yticks(locs, ["%g" % loc for loc in locs])
            plt.ylabel('CV score')
            plt.xlabel('Parameter N')
            plt.savefig('{}_{}_PCA.png'.format(name, channel))

            plt.clf()
            plt.plot(n_features, times)
            locs, labels = plt.yticks()
            plt.yticks(locs, ["%g" % loc for loc in locs])
            plt.ylabel('Mean learning time')
            plt.xlabel('Parameter N')
            plt.savefig('{}_{}_PCA_time.png'.format(name, channel))
def main():
    """Command-line entry point: optimize one method on one channel.

    Reads the method key from ``sys.argv[1]`` and the channel name from
    ``sys.argv[2]`` (upper-cased before being passed to ``get_small_data``),
    then runs the matching optimizer and logs its results.

    Prints the usage text and exits with status 1 when arguments are missing
    or the method key is unknown.
    """
    def usage_exit():
        # A usage error must give a non-zero status so calling scripts can
        # detect it; a bare sys.exit() would report success.
        print(
            'Invalid params. USAGE: param_optimization.py <method> <channel>')
        print('Available methods:\n\tknn, lda, svm, randfor, gtb')
        print('Available channels:\n\tcnn, bbc, ndtv, timesnow, cnnibn')
        sys.exit(1)

    if len(sys.argv) < 3:
        usage_exit()

    method_name, channel_name = sys.argv[1], sys.argv[2]

    optimizers = {
        'knn': KNNOptimizer,
        'lda': LDAOptimizer,
        'svm': SVMOptimizer,
        'randfor': RandForestOptimizer,
        'gtb': GTBOptimizer
    }
    # Reject an unknown method before loading data, rather than failing
    # later with a bare KeyError.
    if method_name not in optimizers:
        usage_exit()

    X, y = get_small_data(channel_name.upper())
    optimizer = optimizers[method_name]()
    optimizer.optimize(X, y)
    optimizer.log_results(channel_name)