예제 #1
0
def single_features():
    """Benchmark every single-feature model and log its confidence interval.

    For each balancing mode, classifier, and feature name, the model is
    trained 15 times; the 95% confidence interval of the resulting
    accuracies is printed and appended to a per-feature results file under
    out/conf_int/base/.
    """
    train = get_train_dev_data()
    test = get_test_data()

    for balanced in (False, True):
        for classifier in classifier_names:
            # Some classifiers have no class-weight support; skip those.
            if balanced and classifier in classifiers_not_supporting_balancing:
                continue
            for feature_name in all_feature_names:
                print(feature_name)
                scores = []
                for run in range(15):
                    run_params = SingleBaseParams(feature_name,
                                                  classifier,
                                                  dirname=sklearn_master_name,
                                                  params={'balanced': balanced},
                                                  pca=False)
                    acc = generate_probabilities(train,
                                                 test,
                                                 run_params,
                                                 test_only=True)
                    print('\t', run, feature_name, classifier, acc)
                    scores.append(acc * 100)

                low, high = confidence_intervals(scores)
                midpoint = float(np.mean([low, high]))
                bal_label = "balanced" if balanced else "not_balanced"
                results = "%s & %s & %s & %.3f & %.1f-%.1f" % (
                    bal_label, feature_name, classifier, midpoint, low, high)
                print(results)
                out_path = "out/conf_int/base/" + feature_name + ".txt"
                with open(out_path, 'a') as f:
                    f.write("%s\n" % results)
예제 #2
0
def multiple_features(dirname):
    """Evaluate classifiers over the most promising feature subsets.

    For each classifier and each candidate feature subset, trains the model
    10 times, computes the 95% confidence interval of the accuracies, and
    appends the formatted result to out/conf_int/<dirname>/all.txt.

    Parameters
    ----------
    dirname : str
        Key used both to select the candidate subsets and to name the
        output directory.
    """
    train = get_train_dev_data()
    test = get_test_data()

    all_subsets = get_feature_name_sets_with_highest_probable_accuracy(dirname)

    n_features = len(all_subsets)
    n_classifiers = len(classifier_names)

    # Only the unbalanced mode is evaluated here; the total below adapts
    # automatically if more modes are added to this list.
    balance_modes = [False]
    total_iterations = len(balance_modes) * n_classifiers * n_features

    print(n_features)
    max_a = 0.
    for i_bal, balanced in enumerate(balance_modes):
        for ic, classifier in enumerate(classifier_names):
            if balanced and classifier in classifiers_not_supporting_balancing:
                continue
            for ifnl, feature_names_list in enumerate(all_subsets):
                print('Balanced', balanced)

                print('Classifier: ', ic, ' out of ', n_classifiers,
                      classifier)
                print('Features ', ifnl, ' out of ', n_features)
                # BUG FIX: the previous formula multiplied i_bal by 2
                # instead of n_classifiers and divided by an extra factor
                # of 2, so the reported progress could never reach 100%.
                completed = (i_bal * n_classifiers + ic) * n_features + ifnl
                print('Progress ', completed / total_iterations * 100., '%')
                accs = []
                for i in range(10):
                    params = MultiBaseParams(features=feature_names_list,
                                             classifier=classifier,
                                             dirname=dirname,
                                             pca=False,
                                             params={'balanced': balanced})
                    a = generate_probabilities(train,
                                               test,
                                               params,
                                               test_only=True)
                    accs.append(a * 100)

                l, r = confidence_intervals(accs)
                m = float(np.mean([l, r]))
                balanced_str = "balanced" if balanced else "not_balanced"
                results = "%s & %s & %s & %.3f & %.1f-%.1f" % (
                    balanced_str, ' '.join(feature_names_list), classifier, m,
                    l, r)
                print(results)
                with open("out/conf_int/" + dirname + "/all.txt", 'a') as f:
                    f.write("%s\n" % results)

                # Track the best midpoint accuracy seen so far.
                max_a = max(max_a, m)
                print("max accuracy: %0.4f current accuracy %0.4f\n" %
                      (max_a, m))
예제 #3
0
def multiple_features(test_only=False):
    """Generate probabilities for every (feature subset, classifier) pair.

    Each combination is run under dirname "new" and its parameters are
    serialized for later reuse.
    """
    train = get_train_dev_data()
    test = get_test_data()

    feature_subsets = get_all_feature_subsets()
    for clf_name in classifier_names:
        for feature_subset in feature_subsets:
            print(feature_subset)
            base_params = MultiBaseParams(features=feature_subset,
                                          classifier=clf_name,
                                          dirname="new",
                                          params={},
                                          pca=False)
            generate_probabilities(train, test, base_params,
                                   test_only=test_only)
            base_params.serialize()
예제 #4
0
def single_features():
    """Grid-search every configured classifier over each single feature."""
    train = get_train_dev_data()
    test = get_test_data()

    # Iterate items() so each classifier's parameter grid comes along with
    # its name — same iteration order as keys().
    for clf_name, clf_grid in classifiers.items():
        for feature in all_feature_names:
            print(feature)
            search_params = SingleBaseParams(feature_name=feature,
                                             classifier=clf_name,
                                             dirname="new",
                                             params={},
                                             pca=False)
            run_grid_search(train=train,
                            test=test,
                            params=search_params,
                            parameters=clf_grid)
예제 #5
0
def single_features(test_only=False):
    """Generate and serialize probabilities for the publication-type feature.

    Runs every classifier on the single hard-coded feature under dirname
    "new"; pass test_only=True to skip training.
    """
    train = get_train_dev_data()
    test = get_test_data()

    # Currently restricted to one feature.
    features_to_run = [publication_type_key]
    for clf_name in classifier_names:
        for feature in features_to_run:
            print(feature)
            run_params = SingleBaseParams(feature_name=feature,
                                          classifier=clf_name,
                                          dirname="new",
                                          params={},
                                          pca=False)
            generate_probabilities(train=train,
                                   test=test,
                                   params=run_params,
                                   test_only=test_only)
            run_params.serialize()
예제 #6
0
def multiple_features():
    """Grid-search every classifier on a hand-picked two-feature subset."""
    train = get_train_dev_data()
    test = get_test_data()

    # Restricted to one subset; swap in get_all_feature_subsets() here to
    # sweep every combination instead.
    subsets_to_run = [[publication_type_key, publication_year_key]]
    for clf_name, clf_grid in classifiers.items():
        for feature_subset in subsets_to_run:
            print(feature_subset)
            grid_params = MultiBaseParams(features=feature_subset,
                                          classifier=clf_name,
                                          dirname="new",
                                          params={},
                                          pca=False)
            run_grid_search(train,
                            test,
                            grid_params,
                            parameters=clf_grid)