示例#1
0
class VotingEnsemble(BaseClassifier):
    """
    Hard-voting ensemble of eight scikit-learn classifiers (gradient boosting,
    k-nearest neighbours, nearest centroid, Gaussian naive Bayes, random forest
    and three SVMs with linear / polynomial / RBF kernels).

    Labels are converted to integer ids before fitting, so the wrapped model
    is trained on, and predicts, those ids rather than the original labels.
    """

    def __init__(self, feature_length, num_classes, x=10):
        """
        Builds the (unfitted) voting ensemble.
        :param feature_length: Forwarded to the BaseClassifier constructor
        :param num_classes: Forwarded to the BaseClassifier constructor and stored on the instance
        :param x: Unused by this class; kept so existing callers keep working
        """

        super().__init__(feature_length, num_classes)

        self.model = VotingClassifier(estimators=[
            ('gba',
             GradientBoostingClassifier(n_estimators=100,
                                        learning_rate=1.0,
                                        max_depth=1,
                                        random_state=0)),
            ('knn',
             KNeighborsClassifier(metric='manhattan',
                                  weights='distance',
                                  n_neighbors=3)),
            ('Nc', NearestCentroid(metric='manhattan')), ('nvb', GaussianNB()),
            ('rf', RandomForestClassifier(n_estimators=10,
                                          criterion='entropy')),
            ('svmlin', svm.SVC(kernel='linear')),
            ('svmpol', svm.SVC(kernel='poly')),
            ('svmrbf', svm.SVC(kernel='rbf'))
        ],
                                      voting='hard')

        self.num_classes = num_classes

        # Unique labels seen by the last successful train() call. predict()
        # uses them so evaluation labels get the same ids as training labels.
        self._classes = None

    def train(self, features, labels):
        """
        Using a set of features and labels, trains the classifier and returns the training accuracy.
        :param features: An MxN matrix of features to use in prediction
        :param labels: An M row list of labels to train to predict
        :return: Prediction accuracy, as a float between 0 and 1
        """

        encoded = self.labels_to_categorical(labels)
        self.model.fit(features, encoded)
        # Only remember the vocabulary once fitting succeeded, so it always
        # describes the model that is actually stored in self.model.
        self._classes = unique(labels)
        return self.model.score(features, encoded)

    # make sure you save model using the same library as we used in machine learning price-predictor

    def predict(self, features, labels):
        """
        Using a set of features and labels, predicts the labels from the features,
        and returns the accuracy of predicted vs actual labels.
        :param features: An MxN matrix of features to use in prediction
        :param labels: An M row list of labels to test prediction accuracy on
        :return: Prediction accuracy, as a float between 0 and 1
        """
        # Encode against the training vocabulary: re-running unique() on the
        # evaluation labels alone assigns different ids whenever a class is
        # missing from (or new in) this batch, which corrupts the accuracy.
        encoded = self._encode_with_training_classes(labels)
        # score() runs the prediction itself, so no separate predict() call
        # is needed here.
        return self.model.score(features, encoded)

    def get_prediction(self, features):
        """
        Returns the raw output of the wrapped model for the given features.
        Because the model is fitted on the integer ids produced in train(),
        the result contains those ids, not the original label values.
        :param features: An MxN matrix of features to use in prediction
        :return: The wrapped model's predictions
        """
        return self.model.predict(features)

    def reset(self):
        """
        Resets the trained weights / parameters to initial state.
        Currently a no-op: nothing is cleared and the fitted model is kept.
        :return: None
        """

    def labels_to_categorical(self, labels):
        """
        Converts labels to integer ids using only the values present in `labels`.
        :param labels: A list/array of labels
        :return: The inverse indices returned by unique(labels, return_inverse=True)
                 (presumably numpy.unique, i.e. positions in the sorted unique values)
        """
        _, IDs = unique(labels, return_inverse=True)
        return IDs

    def _encode_with_training_classes(self, labels):
        """
        Converts labels to the integer ids that were assigned during train().
        Labels that were not seen during training are mapped to -1, which no
        prediction can equal, so they count as misclassified.
        Falls back to labels_to_categorical() when no training vocabulary is
        stored (e.g. train() has not been called on this instance).
        :param labels: A list/array of labels
        :return: A flat integer array with one id per label
        """
        import numpy as np

        classes = getattr(self, '_classes', None)
        if classes is None or len(classes) == 0:
            return self.labels_to_categorical(labels)

        classes = np.asarray(classes)
        flat = np.asarray(labels).ravel()
        # classes is sorted, so searchsorted yields the candidate position of
        # each label; clip keeps out-of-range candidates indexable.
        slots = np.clip(np.searchsorted(classes, flat), 0, len(classes) - 1)
        return np.where(classes[slots] == flat, slots, -1)
                                 max_depth=4,
                                 random_state=2018)
# Random forest member: 1000 gini-criterion trees, all cores, fixed seed.
rf = RandomForestClassifier(
    n_estimators=1000,
    criterion='gini',
    n_jobs=-1,
    random_state=2018,
)

# Multinomial logistic regression member, fitted with the newton-cg solver.
lor = LogisticRegression(
    max_iter=1000,
    multi_class='multinomial',
    solver='newton-cg',
)

# Soft-voting ensemble over the three members, weighted 3:2:1 in the order
# gbt, rf, lor.
clf = VotingClassifier(
    estimators=[('gbt', gbt), ('rf', rf), ('lor', lor)],
    voting='soft',
    weights=[3, 2, 1],
    n_jobs=-1,
)

if args.s:
    clf.fit(X_train, y_train)
    joblib.dump(clf, 'checkpoint/voting.pkl')
    X_test = pd.read_csv("data/test_python.csv", encoding='utf-8')
    pred = clf.predict_proba(X_test)
    np.savetxt('submission/submission.csv',
               np.c_[X_test['listing_id'], pred[:, [2, 1, 0]]],
               delimiter=',',
               header='listing_id,high,medium,low',
               fmt='%d,%.16f,%.16f,%.16f',
               comments='')
else:
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=2018)
    scores = cross_val_score(clf,
                             X_train,
                             y_train,
                             scoring='neg_log_loss',
# One pipeline per column subset: select the subset's columns, then fit a
# LinearSVC on them.
pipe1, pipe2, pipe3, pipe4, pipe5 = (
    make_pipeline(ColumnSelector(cols=subset), LinearSVC())
    for subset in (split1, split2, split3, split4, split5)
)

# Combine the five pipelines into a single voting ensemble and fit it on the
# training split.
cls = VotingClassifier(
    estimators=[
        ('l1', pipe1),
        ('l2', pipe2),
        ('l3', pipe3),
        ('l4', pipe4),
        ('l5', pipe5),
    ],
    n_jobs=4,
)
cls.fit(cars_train_X, cars_train_y)

# uncomment the 3 lines below if needed to see the accuracy and std-dev of the training set
# scores = cross_val_score(cls, cars_train_X, cars_train_y, cv=5, verbose=True)
# print(scores)
# print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# this reaches about 30% acc

# Predict the test split and persist (predictions, true labels) as a pickle
# so the heatmap can be plotted later.
y_pred = cls.predict(cars_test_X)

with open('5subset_linearsvm_voting.sav', 'wb') as f:
    pkl.dump((y_pred, cars_test_y), f)

y_true = cars_test_y
preds = {}