Example #1
 def _tree_prune(self, tree, Xval, yval):
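     # Reduced-error pruning: tentatively replace each dict subtree with its
     # '__default__' leaf and keep the cut only if validation accuracy improves.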
     if not isinstance(tree, dict):
         return
     feat = tree.keys()[0]
     for feat_val in tree[feat].keys():
         if isinstance(tree[feat][feat_val], dict):
             subtree = tree[feat][feat_val]
             pred_no_prone = self.predict(Xval)
             perf_no_prone = accuracy_score(yval, pred_no_prone)
             num_leaf_no_prone = self.get_num_leafs(self._parameter['tree'])
             tree[feat][feat_val] = subtree[subtree.keys()[0]]['__default__']
             pred_with_prone = self.predict(Xval)
             perf_with_prone = accuracy_score(yval, pred_with_prone)
             num_leaf_with_prone = self.get_num_leafs(self._parameter['tree'])
             if perf_no_prone < perf_with_prone:
                 improve = perf_with_prone - perf_no_prone
                 # improve /= num_leaf_no_prone - num_leaf_with_prone - 1
                 self._logger.info(
                     'tree prune, validation precision improve %f' %
                     (improve))
             else:
                 tree[feat][feat_val] = subtree
                 self._tree_prune(subtree, Xval, yval)
Example #2
 def _tree_prune(self, tree, X, y):
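     # Bottom-up variant: prune the children first, then test whether
     # collapsing this branch into its '__default__' leaf helps accuracy.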
     if not isinstance(tree, dict):
         return
     feat = tree.keys()[0]
     for feat_val in tree[feat].keys():
         if isinstance(tree[feat][feat_val], dict):
             branch = tree[feat][feat_val]
             self._tree_prune(branch, X, y)
             pred_no_prone = self.predict(X)
             perf_no_prone = accuracy_score(y, pred_no_prone)
             tree[feat][feat_val] = branch[branch.keys()[0]]['__default__']
             pred_with_prone = self.predict(X)
             perf_with_prone = accuracy_score(y, pred_with_prone)
             if perf_no_prone < perf_with_prone:
                 improve = perf_with_prone - perf_no_prone
                 logger.info('tree prune, validation precision improve %f' % (improve))
             else:
                 tree[feat][feat_val] = branch
Example #3
            word = sentence[pivot]
            feat = {}
            feat['w'] = word
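            # Context-window features: surrounding words w-1..w-k and w+1..w+k,
            # padded with start/end symbols at the sentence boundaries.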
            for i in xrange(1, self.windows + 1):
                if pivot - i < 0:
                    feat['w-%d' % i] = self.start_symbol
                else:
                    feat['w-%d' % i] = sentence[pivot - i]
            for i in xrange(1, self.windows + 1):
                if pivot + i >= len(sentence):
                    feat['w+%d' % i] = self.end_symbol
                else:
                    feat['w+%d' % i] = sentence[pivot + i]
            features.append(feat)
        return features


if __name__ == '__main__':
    data = treebank.tagged_sents()[:200]
    trainset, testset = data[:180], data[180:]
    model = MaxentTagger(max_iter=20, min_freq=3)
    model.fit(trainset)
    label = []
    for tagged_sent in testset:
        label.extend([v[1] for v in tagged_sent])
    X = []
    for tagged_sent in testset:
        X.append([v[0] for v in tagged_sent])
    pred = model.predict(X)
    print 'test accuracy:', accuracy_score(label, pred)
Example #4
                self.plotMidText((self.xOff, self.yOff,), cntrPt, str(key))
        self.yOff += 1.0 / self.totalD

    def createPlot(self):
        inTree = self._parameter['tree']
        fig = plt.figure(1, facecolor='white')
        fig.clf()
        axprops = dict(xticks=[], yticks=[])
        self.decisionNode = dict(boxstyle='sawtooth', fc='0.8')
        self.leafNone = dict(boxstyle='round4', fc='0.8')
        self.arrow_args = dict(arrowstyle='<-')
        self.ax1 = plt.subplot(111, frameon=False, **axprops)
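        # Leaf count and tree depth scale the x/y offsets so the whole tree
        # fits in the unit square.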
        self.totalW = float(self.get_num_leafs(inTree))
        self.totalD = float(self.get_tree_depth(inTree))
        self.xOff = -0.5 / self.totalW
        self.yOff = 1.0
        self.plotTree(inTree, (0.5, 1.0), '')
        plt.show()


if __name__ == '__main__':
    path = os.getcwd() + '/../dataset/dataset_21_car.arff'
    loader = DataLoader(path)
    dataset = loader.load(target_col_name='class')
    trainset, testset = dataset.cross_split()
    dt = DecisionTreeClassifier(min_split=1, is_prune=False)
    dt.fit(trainset[0], trainset[1])
    predict = dt.predict(testset[0])
    performance = accuracy_score(testset[1], predict)
    print 'test accuracy:', performance
Example #5
    scheduler = TimeScheduler()

    # KNN for classification task
    path = os.getcwd() + '/../dataset/electricity-normalized.arff'
    loader = DataLoader(path)
    dataset = loader.load(target_col_name='class')
    trainset, testset = dataset.cross_split()
    knn = KNNClassifier(search_mode='kd_tree')
    knn.fit(trainset[0], trainset[1])
    predict_kd_tree = scheduler.tic_tac('kd_tree', knn.predict, X=testset[0])
    knn = KNNClassifier(search_mode='brutal')
    knn.fit(trainset[0], trainset[1])
    predict_brutal = scheduler.tic_tac('brutal', knn.predict, X=testset[0])
    scheduler.print_task_schedule('brutal')
    scheduler.print_task_schedule('kd_tree')
    print accuracy_score(testset[1], predict_brutal), accuracy_score(testset[1], predict_kd_tree)

    # KNN for regression task
    # path = os.getcwd() + '/../dataset/winequality-white.csv'
    # loader = DataLoader(path)
    # dataset = loader.load(target_col_name='quality')
    # trainset, testset = dataset.cross_split()
    # knn = KNNRegressor(search_mode='brutal')
    # knn.fit(trainset[0], trainset[1])
    # predict_brutal = scheduler.tic_tac('brutal', knn.predict, X=testset[0])
    # knn = KNNRegressor(search_mode='kd_tree')
    # knn.fit(trainset[0], trainset[1])
    # predict_kd_tree = scheduler.tic_tac('kd_tree', knn.predict, X=testset[0])
    # scheduler.print_task_schedule('brutal')
    # scheduler.print_task_schedule('kd_tree')
    # print mean_error(testset[1], predict_brutal), mean_error(testset[1], predict_kd_tree)
Example #6
        assert self._is_trained, 'model must be trained before predict.'
        nSize = X.shape[0] if len(X.shape) == 2 else 1
        pred = [np.zeros(self._nClass) for i in xrange(nSize)]
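        # Soft vote: each tree adds its top-class probability, using only the
        # feature columns it was trained on.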
        for model, col_sample_ix in zip(self._parameter['forest'], self._parameter['col_sample_ix']):
            proba = model.predict_proba(X[:, col_sample_ix])
            for i in xrange(nSize):
                ix = np.argmax(proba[i])
                pred[i][ix] += proba[i][ix]
        return np.array(pred)

    def predict(self, X):
        pred_proba = self.predict_proba(X)
        pred = np.argmax(pred_proba, axis=1)
        pred = [self._class_label[i] for i in pred]
        return np.array(pred)


if __name__ == '__main__':
    path = os.getcwd() + '/../dataset/dataset_21_car.arff'
    loader = DataLoader(path)
    dataset = loader.load(target_col_name='class')
    trainset, testset = dataset.cross_split()
    dt = DecisionTreeClassifier()
    dt.fit(trainset[0], trainset[1])
    predict = dt.predict(testset[0])
    print accuracy_score(testset[1], predict)
    rf = RandomForest(100, 0.9)
    rf.fit(trainset[0], trainset[1])
    predict = rf.predict(testset[0])
    print accuracy_score(testset[1], predict)
Example #7
        for irow in range(X.shape[0]):
            _X = X[irow]
            max_prob = None
            label = None
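            # Score each class by summing what appear to be log prior and log
            # conditional probabilities (the values are added, not multiplied).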
            for c in proba_y.keys():
                p = proba_y[c]
                for icol, feat in cond_proba_y[c].iteritems():
                    p += feat[_X[icol]]
                if max_prob is None or p > max_prob:
                    max_prob = p
                    label = c
            assert label is not None, 'label should not be None. There must be some error, please check.'
            pred.append(label)
        return np.array(pred)


if __name__ == '__main__':
    path = os.getcwd() + '/../dataset/dataset_21_car.arff'
    loader = DataLoader(path)
    dataset = loader.load(target_col_name='class')
    trainset, testset = dataset.cross_split()
    nb = NaiveBayes()
    nb.fit(trainset[0], trainset[1])
    predict = nb.predict(testset[0])
    acc = accuracy_score(testset[1], predict)
    print acc
    # nb.dump('NB.model')
    # nb = NaiveBayes.load('NB.model')
    # predict = nb.predict(testset[0])
    # print accuracy_score(testset[1], predict)
Example #8
        for irow in range(X.shape[0]):
            _X = X[irow]
            max_prob = None
            label = None
            for c in proba_y.keys():
                p = proba_y[c]
                for icol, feat in cond_proba_y[c].iteritems():
                    p += feat[_X[icol]]
                if max_prob is None or p > max_prob:
                    max_prob = p
                    label = c
            assert label is not None, 'label should not be None. There must be some error, please check.'
            pred.append(label)
        return np.array(pred)


if __name__ == '__main__':
    path = os.getcwd() + '/../dataset/dataset_21_car.arff'
    loader = DataLoader(path)
    dataset = loader.load(target_col_name='class')
    trainset, testset = dataset.cross_split()
    nb = NaiveBayes()
    nb.fit(trainset[0], trainset[1])
    predict = nb.predict(testset[0])
    acc = accuracy_score(testset[1], predict)
    print acc
    nb.dump('NB.model')
    # nb = NaiveBayes.load('NB.model')
    # predict = nb.predict(testset[0])
    # print accuracy_score(testset[1], predict)
Example #9
            return True
        else:
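            # A refit is valid only if the feature and class counts match the
            # already-trained model.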
            is_valid = False
            nFeat = X.shape[1]
            nClass = len(np.unique(y))
            if nFeat == self._nFeat and nClass == self._nClass:
                is_valid = True
            return is_valid


if __name__ == '__main__':
    path = os.getcwd() + '/../dataset/dataset_21_car.arff'
    loader = DataLoader(path)
    dataset = loader.load(target_col_name='class')
    trainset, testset = dataset.cross_split()
    nb = NaiveBayes()
    nb.fit(trainset[0], trainset[1])
    p1 = nb.predict(testset[0])
    print 'NaiveBayes accuracy:', accuracy_score(testset[1], p1)
    base_learner = NaiveBayes()
    ada = AdaBoost(base_learner, 100)
    ada.fit(trainset[0], trainset[1])
    prediction = ada.predict(testset[0])
    performance = accuracy_score(testset[1], prediction)
    print 'AdaBoost accuracy:', performance
    # ada.dump('ada.model')
    # ada = AdaBoost.load('ada.model')
    # prediction = ada.predict(testset[0])
    # performance = accuracy_score(testset[1], prediction)
    # print performance
Example #10
            pred[i] = max_label
        return np.array(pred)

    def __check_valid(self, X, y):
        # The first fit is always valid; a refit must match the trained
        # feature and class counts.
        if not self._is_trained:
            return True
        return X.shape[1] == self._nFeat and len(np.unique(y)) == self._nClass


if __name__ == '__main__':
    path = os.getcwd() + '/../dataset/dataset_21_car.arff'
    loader = DataLoader(path)
    dataset = loader.load(target_col_name='class')
    trainset, testset = dataset.cross_split()
    dt = DecisionTreeClassifier(is_prune=False)
    dt.fit(trainset[0], trainset[1])
    p1 = dt.predict(testset[0])
    print accuracy_score(testset[1], p1)
    base_learner = DecisionTreeClassifier(is_prune=False)
    ada = AdaBoost(base_learner, 100)
    ada.fit(trainset[0], trainset[1])
    prediction = ada.predict(testset[0])
    performance = accuracy_score(testset[1], prediction)
    print performance
Example #11
                self.plotMidText((self.xOff, self.yOff,), cntrPt, str(key))
        self.yOff += 1.0 / self.totalD

    def createPlot(self):
        inTree = self._parameter['tree']
        fig = plt.figure(1, facecolor='white')
        fig.clf()
        axprops = dict(xticks=[], yticks=[])
        self.decisionNode = dict(boxstyle='sawtooth', fc='0.8')
        self.leafNone = dict(boxstyle='round4', fc='0.8')
        self.arrow_args = dict(arrowstyle='<-')
        self.ax1 = plt.subplot(111, frameon=False, **axprops)
        self.totalW = float(self.get_num_leafs(inTree))
        self.totalD = float(self.get_tree_depth(inTree))
        self.xOff = -0.5 / self.totalW
        self.yOff = 1.0
        self.plotTree(inTree, (0.5, 1.0), '')
        plt.show()


if __name__ == '__main__':
    path = os.getcwd() + '/../dataset/dataset_21_car.arff'
    loader = DataLoader(path)
    dataset = loader.load(target_col_name='class')
    trainset, testset = dataset.cross_split()
    dt = DecisionTreeClassifier(min_split=1, is_prune=False)
    dt.fit(trainset[0], trainset[1])
    predict = dt.predict(testset[0])
    performance = accuracy_score(testset[1], predict)
    print 'test accuracy:', performance
Example #12
            logger.warning('plotting requires exactly 2 features.')
            return
        logger.info('start plotting...')
        pred = self._predict(X)
        h = 0.02  # step size in the mesh
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
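        # Evaluate the decision function over the grid to trace the boundary.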
        Z = self._predict(np.c_[xx.ravel(), yy.ravel()])
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        plt.scatter(X[:, 0], X[:, 1], c=pred, cmap=plt.cm.Paired)
        plt.contour(xx, yy, Z, cmap=plt.cm.Paired)
        plt.show()

logger = get_logger(SVM.__name__)

if __name__ == '__main__':
    path = os.getcwd() + '/../dataset/iris.arff'
    loader = DataLoader(path)
    dataset = loader.load(target_col_name='binaryClass')
    trainset, testset = dataset.cross_split()
    X = trainset[0][:, [0, 1]]
    y = trainset[1]
    svm = SVM(kernel_type='rbf', sigma=0.3)
    svm.fit(X, y)
    predict = svm.predict(testset[0][:, [0, 1]])
    print 'test accuracy:', accuracy_score(testset[1], predict)
    svm.plot(X)
Example #13
            for i in xrange(nSize):
                ix = np.argmax(proba[i])
                pred[i][ix] += proba[i][ix]
        return np.array(pred)

    def predict(self, X):
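        # Take the argmax of the aggregated scores and map the class index
        # back to its original label.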
        pred_proba = self.predict_proba(X)
        pred = np.argmax(pred_proba, axis=1)
        pred = [self._class_label[i] for i in pred]
        return np.array(pred)


if __name__ == '__main__':
    path = os.getcwd() + '/../dataset/dataset_21_car.arff'
    loader = DataLoader(path)
    dataset = loader.load(target_col_name='class')
    trainset, testset = dataset.cross_split()
    dt = DecisionTreeClassifier()
    dt.fit(trainset[0], trainset[1])
    predict = dt.predict(testset[0])
    print 'DecisionTree accuracy:', accuracy_score(testset[1], predict)
    rf = RandomForest(100, 0.9)
    rf.fit(trainset[0], trainset[1])
    predict = rf.predict(testset[0])
    print 'RandomForest accuracy:', accuracy_score(testset[1], predict)
    # rf.dump('rf.model')
    # rf = RandomForest.load('rf.model')
    # predict = rf.predict(testset[0])
    # print 'RandomForest accuracy', accuracy_score(testset[1], predict)