예제 #1
0
def qda_classifier(train_XY, priors=None):
    """Fit a quadratic discriminant analysis model and return a predictor.

    train_XY -- iterable of (x, y) pairs, materialized via save_XY.
    priors   -- optional class prior probabilities, forwarded to sklearn.
    Returns a closure mapping test_X to an array of predicted labels.
    """
    features, labels = save_XY(train_XY)

    model = QuadraticDiscriminantAnalysis(priors=priors)
    model = model.fit(features, labels)

    def classify(test_X):
        return model.predict(to_ndarray(test_X))

    return classify
예제 #2
0
def random_forest_classifier(train_XY, priors=None):
    """Fit a random forest and return a predictor.

    train_XY -- iterable of (x, y) pairs, materialized via save_XY.
    priors   -- accepted for interface parity with the other classifier
                factories; not used by the forest.
    Returns a closure mapping test_X to an array of predicted labels.
    """
    features, labels = save_XY(train_XY)

    model = RandomForestClassifier(n_jobs=-1)
    model = model.fit(features, labels)

    def classify(test_X):
        return model.predict(to_ndarray(test_X))

    return classify
예제 #3
0
def run_classification(config):
    """Run one train/test classification experiment described by `config`.

    config -- 3-tuple (embeddingconfig, gramconfig, runconfig).
    Returns a Result(traintime, testtime, priors, histogram).
    """
    embeddingconfig, gramconfig, runconfig = config
    # One label per skipword plus one extra class.
    label_num = len(gramconfig.skipwords) + 1

    XY = iter_XY(config)

    # Label-occurrence counters, updated as a side effect each time an item
    # is drawn from the (lazy) XY stream below.  The counts therefore only
    # reflect the portion of the stream consumed so far.
    occurence_nums = [0 for label in range(0, label_num)]
    def occurence(y):
        occurence_nums[y] += 1
    XY = on_the_side(lambda xy: occurence(xy[1]), XY)
    def priors():
        # Empirical label distribution over everything consumed so far.
        total = sum(occurence_nums)
        return [occurence_num / total for occurence_num in occurence_nums]

    # per_label_test_size = 5000
    test_XY = islice(XY, 5000)
    # test_XY = n_from_each_group(XY, per_label_test_size, key=lambda xy: xy[1], \
    #                             group_labels=[lbl for lbl in range(label_num)])
    # save_XY materializes test_XY, which is what actually advances the XY
    # stream and populates occurence_nums.
    test_X, test_Y = save_XY(test_XY)

    train_XY = None
    train_priors = None
    # NOTE(review): at this point only the 5000 test items have been drawn
    # from XY, so this prior estimate is based on the test sample alone —
    # confirm that estimating priors from the test slice is intended.
    priors_ = priors()
    if runconfig.train_with_priors:
        train_priors = priors_
        train_size_per_label = runconfig.train_size // label_num
        # Balanced training set: an equal number of items per label.
        train_XY = n_from_each_group(XY, train_size_per_label, key=lambda xy: xy[1])
    else:
        train_XY = islice(XY, runconfig.train_size)

    classifier = classifiers[runconfig.classifier]
    start_time = time()
    # The classifier factory consumes train_XY; training time includes
    # drawing the training items from the stream.
    classify = classifier(train_XY, priors=train_priors)
    traintime = time() - start_time
    
    start_time = time()
    response_Y = classify(test_X)
    testtime = time() - start_time

    # Map numeric labels back to their words for reporting.
    from_label = partial(label_to_word, gramconfig.skipwords)
    test_Y = list(map(from_label, test_Y))
    response_Y = list(map(from_label, response_Y))
    
    histogram = classification_histogram(test_Y, response_Y)

    result_priors = {from_label(lbl): priors_[lbl] for lbl in range(label_num)}

    result = Result(traintime, testtime, result_priors, histogram)

    return result
예제 #4
0
def logistic_regression_classifier(train_XY, multi_class, priors=None):
    """Fit a logistic-regression model and return a predictor.

    train_XY    -- iterable of (x, y) pairs, materialized via save_XY.
    multi_class -- sklearn multi_class scheme; the solver is chosen to be
                   compatible with it ('newton-cg' for 'multinomial',
                   'liblinear' otherwise).
    priors      -- custom priors are not supported here; must be None.
    Returns a closure mapping test_X to an array of predicted labels.
    """
    assert priors is None

    features, labels = save_XY(train_XY)

    chosen_solver = 'newton-cg' if multi_class == 'multinomial' else 'liblinear'

    model = LogisticRegression(multi_class=multi_class, solver=chosen_solver, n_jobs=-1)
    model = model.fit(features, labels)

    def classify(test_X):
        return model.predict(to_ndarray(test_X))

    return classify
예제 #5
0
def nn_classifier(train_XY, k=1, priors=None):
    """k-nearest-neighbour classifier factory (cosine distance, brute force).

    train_XY -- iterable of (x, y) pairs, materialized via save_XY.
    k        -- number of neighbours taking part in the majority vote.
    priors   -- ignored; accepted for interface parity with the other
                classifier factories.
    Returns a closure mapping test_X to an int ndarray of predicted labels.
    """
    from collections import Counter

    train_X, train_Y = save_XY(train_XY)

    nn = NearestNeighbors(n_neighbors=k, metric='cosine', algorithm='brute', n_jobs=-1)

    nn = nn.fit(train_X)

    def classify(test_X):
        test_X = to_ndarray(test_X)

        response_Y = []
        for neighbor_indices in nn.kneighbors(test_X, return_distance=False):
            neighbor_labels = [train_Y[index] for index in neighbor_indices]
            # Majority vote over the k neighbour labels.  Counter is O(k)
            # per query, instead of rebuilding set(train_Y) and rescanning
            # neighbor_labels once per distinct training label for every
            # query point as before.  Ties are now broken by first-seen
            # neighbour order (previously by hash-dependent set order,
            # which was unspecified anyway).
            best_label = Counter(neighbor_labels).most_common(1)[0][0]
            response_Y.append(best_label)
        return to_ndarray(response_Y, dtype=int)

    return classify