Example #1
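# Imports used by this snippet; pos, neg, totals, features, FDATA_FILE, MI,
# classify and get_relevant_features are assumed to be defined elsewhere in
# the module this example was taken from.
import os
import cPickle
import pylab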
def feature_selection_trials():
    """
    Select top k features. Vary k and plot data
    """
    global pos, neg, totals, features
    retrain = True

    if not retrain and os.path.isfile(FDATA_FILE):
        pos, neg, totals = cPickle.load(open(FDATA_FILE))
        return

    words = list(set(pos.keys() + neg.keys()))
    print "Total no of features:", len(words)
    words.sort(key=lambda w: -MI(w))
    num_features, accuracy = [], []
    bestk = 0
    limit = 500
    path = "./aclImdb/test/"
    step = 500
    start = 20000
    best_accuracy = 0.0
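    # Seed the feature set with the `start` highest-MI words, then grow it
    # `step` words at a time, measuring test accuracy after each increment.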
    for w in words[:start]:
        features.add(w)
    for k in xrange(start, 40000, step):
        for w in words[k:k+step]:
            features.add(w)
        correct = 0
        size = 0

        # Evaluate on up to `limit` positive test reviews.
        for fname in os.listdir(path + "pos")[:limit]:
            correct += classify(open(path + "pos/" + fname).read()) == True
            size += 1

        # Evaluate on up to `limit` negative test reviews.
        for fname in os.listdir(path + "neg")[:limit]:
            correct += classify(open(path + "neg/" + fname).read()) == False
            size += 1

        acc = float(correct) / size  # float() avoids Python 2 integer division truncating to 0
        num_features.append(k + step)
        accuracy.append(acc)
        if acc > best_accuracy:
            best_accuracy = acc  # track the best accuracy seen so far
            bestk = k + step     # features tested at this point are words[:k+step]
        print k + step, acc

    features = set(words[:bestk])
    cPickle.dump(get_relevant_features(), open(FDATA_FILE, 'w'))

    pylab.plot(num_features, accuracy)
    pylab.show()
Example #2
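# Imports used by this snippet; positive, negative, mutual_info, classify,
# reduce_features and negate_sequence are assumed to be defined elsewhere in
# the module this example was taken from.
from functools import partial

import pylab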
def feature_selection_experiment(test_set):
    """
    Select top k features. Vary k from 1000 to 50000 and plot data
    """
    keys = positive.keys() + negative.keys()
    sorted_keys = sorted(keys, key=mutual_info, reverse=True)  # Sort descending by mutual info (a boolean cmp would not sort correctly)
    features = set()
    num_features, accuracy = [], []
    print sorted_keys[-100:]  # Debug: the 100 keys with the lowest mutual info

    for k in xrange(0, 50000, 1000):
        features |= set(sorted_keys[k:k+1000])
        preprocessor = partial(reduce_features, features)  # built here but not passed to classify() below
        correct = 0
        for text, label in test_set:
            correct += classify(text) == label
        num_features.append(k+1000)
        accuracy.append(float(correct) / len(test_set))  # float() avoids Python 2 integer division truncating to 0
    print negate_sequence("Is this a good idea")
    print reduce_features(features, "Is this a good idea")

    pylab.plot(num_features, accuracy)
    pylab.show()