def qda_classifier(train_XY, priors=None):
    """Fit quadratic discriminant analysis on train_XY.

    Returns a closure that maps a test feature matrix to predicted labels.
    priors is forwarded to QuadraticDiscriminantAnalysis.
    """
    features, labels = save_XY(train_XY)
    model = QuadraticDiscriminantAnalysis(priors=priors)
    model.fit(features, labels)

    def classify(test_X):
        # Coerce to ndarray so the sklearn predictor accepts any iterable input.
        return model.predict(to_ndarray(test_X))

    return classify
def random_forest_classifier(train_XY, priors=None):
    """Fit a random forest on train_XY and return a prediction closure.

    NOTE(review): `priors` is accepted only so this factory matches the
    shared classifier-factory signature; RandomForestClassifier takes no
    priors, so the argument is silently unused.
    """
    features, labels = save_XY(train_XY)
    model = RandomForestClassifier(n_jobs=-1)
    model.fit(features, labels)

    def classify(test_X):
        # Coerce to ndarray before handing off to the fitted forest.
        return model.predict(to_ndarray(test_X))

    return classify
def run_classification(config):
    """Run one train/test experiment described by `config`.

    config unpacks to (embeddingconfig, gramconfig, runconfig). Streams
    labelled examples from iter_XY, splits them lazily into a test prefix
    and a training slice, fits the configured classifier, and returns a
    Result(traintime, testtime, priors, histogram).
    """
    embeddingconfig, gramconfig, runconfig = config
    # One label per skip-word plus one extra class.
    label_num = len(gramconfig.skipwords) + 1
    XY = iter_XY(config)
    occurence_nums = [0 for label in range(0, label_num)]

    def occurence(y):
        # Side-effect counter: bumps the per-label tally as XY is consumed.
        occurence_nums[y] += 1

    # Wrap the stream so every item drawn from XY updates occurence_nums.
    XY = on_the_side(lambda xy: occurence(xy[1]), XY)

    def priors():
        # Empirical label distribution over whatever has been drained so far.
        total = sum(occurence_nums)
        return [occurence_num / total for occurence_num in occurence_nums]

    # per_label_test_size = 5000
    # Test set is the first 5000 items of the stream.
    test_XY = islice(XY, 5000)
    # test_XY = n_from_each_group(XY, per_label_test_size, key=lambda xy: xy[1], \
    #                             group_labels=[lbl for lbl in range(label_num)])
    test_X, test_Y = save_XY(test_XY)
    train_XY = None
    train_priors = None
    # NOTE(review): at this point only the 5000 test items have been drained,
    # so priors_ is estimated from the test prefix alone — presumably
    # intentional (the stream is assumed homogeneous), but worth confirming.
    priors_ = priors()
    if runconfig.train_with_priors:
        train_priors = priors_
        # Balanced training set: equal share of train_size per label.
        train_size_per_label = runconfig.train_size // label_num
        train_XY = n_from_each_group(XY, train_size_per_label, key=lambda xy: xy[1])
    else:
        # Unbalanced: just take the next train_size items in stream order.
        train_XY = islice(XY, runconfig.train_size)
    # Look up the classifier factory by name; it returns a predict closure.
    classifier = classifiers[runconfig.classifier]
    start_time = time()
    classify = classifier(train_XY, priors=train_priors)
    traintime = time() - start_time
    start_time = time()
    response_Y = classify(test_X)
    testtime = time() - start_time
    # Translate integer labels back to their skip-word strings for reporting.
    from_label = partial(label_to_word, gramconfig.skipwords)
    test_Y = list(map(from_label, test_Y))
    response_Y = list(map(from_label, response_Y))
    histogram = classification_histogram(test_Y, response_Y)
    result_priors = {from_label(lbl): priors_[lbl] for lbl in range(label_num)}
    result = Result(traintime, testtime, result_priors, histogram)
    return result
def logistic_regression_classifier(train_XY, multi_class, priors=None):
    """Fit logistic regression on train_XY and return a prediction closure.

    multi_class selects the sklearn scheme; 'multinomial' requires a solver
    that supports it (newton-cg), everything else uses liblinear. priors is
    unsupported here and must be None.
    """
    assert priors is None
    features, labels = save_XY(train_XY)
    # liblinear cannot fit a true multinomial model, so switch solver.
    solver = 'newton-cg' if multi_class == 'multinomial' else 'liblinear'
    model = LogisticRegression(multi_class=multi_class, solver=solver, n_jobs=-1)
    model.fit(features, labels)

    def classify(test_X):
        # Coerce to ndarray so predict accepts any iterable input.
        return model.predict(to_ndarray(test_X))

    return classify
def nn_classifier(train_XY, k=1, priors=None):  # ignore priors
    """k-nearest-neighbour classifier over cosine distance.

    Fits a brute-force NearestNeighbors index on train_XY and returns a
    closure that labels each test row by majority vote among its k nearest
    training neighbours. `priors` is accepted for interface compatibility
    only and is ignored.
    """
    train_X, train_Y = save_XY(train_XY)
    nn = NearestNeighbors(n_neighbors=k, metric='cosine', algorithm='brute', n_jobs=-1)
    nn = nn.fit(train_X)
    # Fix: the candidate label set never changes, but it was previously
    # rebuilt with set(train_Y) inside the per-sample loop, making every
    # single prediction O(len(train_Y)). Hoist it to fit time.
    candidate_labels = set(train_Y)

    def classify(test_X):
        test_X = to_ndarray(test_X)
        response_Y = []
        for neighbor_indices in nn.kneighbors(test_X, return_distance=False):
            neighbor_labels = [train_Y[index] for index in neighbor_indices]
            # Majority vote; same tie-breaking as before (max over the set).
            best_label = max(candidate_labels,
                             key=lambda label: neighbor_labels.count(label))
            response_Y.append(best_label)
        return to_ndarray(response_Y, dtype=int)

    return classify