    dtree = DecisionTreeClassifier(max_depth=1)
    """
    AdaBoostClassifier constructor parameters:
                base_estimator=None,  type of the base (weak) estimator
                 n_estimators=50,  number of base estimators
                 learning_rate=1.,  learning step size / shrinkage factor
                 algorithm='SAMME.R',
                 random_state=None):
    """
    algo = AdaBoostClassifier(base_estimator=dtree, n_estimators=10)
    # Train the model
    algo.fit(X_train, y_train)
    # Evaluate model performance
    print('Accuracy on the training set: {}'.format(algo.score(X_train, y_train)))
    print('Accuracy on the test set: {}'.format(algo.score(X_test, y_test)))

    x_test = [[6.9, 3.1, 5.1, 2.3], [6.1, 2.8, 4.0, 1.3], [5.2, 3.4, 1.4, 0.2]]
    print('Sample predictions:')
    print(algo.predict(x_test))
    print("样本的预测概率值:")
    print(algo.predict_proba(x_test))
    print("样本的预测概率值的Log转换值:")
    print(algo.predict_log_proba(x_test))

    print("训练好的所有子模型:\n{}".format(algo.estimators_))
    x_test = [[6.9, 3.1, 5.1, 2.3], [6.1, 2.8, 4.0, 1.3], [5.2, 3.4, 2.9, 0.8]]
    generator = algo.staged_predict(x_test)
    print('Staged predictions:')
    for i in generator:
        print(i)
    print('Feature importances: {}'.format(algo.feature_importances_))
Example #2
def test_sparse_classification():
    # Check classification with sparse input.

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set."""
        def fit(self, X, y, sample_weight=None):
            """Modification on fit caries data type for later verification."""
            super(CustomSVC, self).fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_multilabel_classification(n_classes=1,
                                                   n_samples=15,
                                                   n_features=5,
                                                   random_state=42)
    # Flatten y to a 1d array
    y = np.ravel(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [
            csc_matrix, csr_matrix, lil_matrix, coo_matrix, dok_matrix
    ]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME").fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_classifier = AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME").fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # decision_function
        sparse_results = sparse_classifier.decision_function(X_test_sparse)
        dense_results = dense_classifier.decision_function(X_test)
        assert_array_equal(sparse_results, dense_results)

        # predict_log_proba
        sparse_results = sparse_classifier.predict_log_proba(X_test_sparse)
        dense_results = dense_classifier.predict_log_proba(X_test)
        assert_array_equal(sparse_results, dense_results)

        # predict_proba
        sparse_results = sparse_classifier.predict_proba(X_test_sparse)
        dense_results = dense_classifier.predict_proba(X_test)
        assert_array_equal(sparse_results, dense_results)

        # score
        sparse_results = sparse_classifier.score(X_test_sparse, y_test)
        dense_results = dense_classifier.score(X_test, y_test)
        assert_array_equal(sparse_results, dense_results)

        # staged_decision_function
        sparse_results = sparse_classifier.staged_decision_function(
            X_test_sparse)
        dense_results = dense_classifier.staged_decision_function(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # staged_predict_proba
        sparse_results = sparse_classifier.staged_predict_proba(X_test_sparse)
        dense_results = dense_classifier.staged_predict_proba(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # staged_score
        sparse_results = sparse_classifier.staged_score(X_test_sparse, y_test)
        dense_results = dense_classifier.staged_score(X_test, y_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # Verify sparsity of data is maintained during training
        types = [i.data_type_ for i in sparse_classifier.estimators_]

        assert all([(t == csc_matrix or t == csr_matrix) for t in types])
Example #4
class _AdaBoostClassifierImpl:
    def __init__(
        self,
        base_estimator=None,
        *,
        n_estimators=50,
        learning_rate=1.0,
        algorithm="SAMME.R",
        random_state=None,
    ):
        if base_estimator is None:
            estimator_impl = None
        else:
            estimator_impl = _FitSpecProxy(base_estimator)

        self._hyperparams = {
            "base_estimator": estimator_impl,
            "n_estimators": n_estimators,
            "learning_rate": learning_rate,
            "algorithm": algorithm,
            "random_state": random_state,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)
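        # Keep the caller's original estimator in the params dict; the proxy stays inside the wrapped model.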
        self._hyperparams["base_estimator"] = base_estimator

    def get_params(self, deep=True):
        out = self._wrapped_model.get_params(deep=deep)
        # we want to return the lale operator, not the underlying impl
        out["base_estimator"] = self._hyperparams["base_estimator"]
        return out

    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
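            # Re-attach the DataFrame's column names before rows are handed to the base estimator.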
            feature_transformer = FunctionTransformer(
                func=lambda X_prime: pd.DataFrame(X_prime, columns=X.columns),
                inverse_func=None,
                check_inverse=False,
            )
            self._hyperparams["base_estimator"] = _FitSpecProxy(
                feature_transformer >> self._hyperparams["base_estimator"])
            self._wrapped_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def predict_log_proba(self, X):
        return self._wrapped_model.predict_log_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)

    def score(self, X, y, sample_weight=None):
        return self._wrapped_model.score(X, y, sample_weight)
Example #5
class PCR:
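    # Bag-of-visual-words classifier: SIFT descriptors are quantized with
    # MiniBatchKMeans, per-image cluster counts and hue histograms are tf-idf
    # weighted, and two AdaBoost(MultinomialNB) models are combined for the prediction.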
    def __init__(self):
        self.__clustersNumber = CLUSTERS_NUMBER
        self.__queue = Queue()
        self.__verbose = VERBOSE
        self.__useCache = USE_CACHE

        for i in range(FILE_LOAD_THREADS):
            t = Thread(target=self.__worker)
            t.daemon = True
            t.start()

        self.__kmeans = MiniBatchKMeans(n_clusters=self.__clustersNumber,
                                        random_state=CLUSTER_SEED,
                                        verbose=self.__verbose)
        self.__tfidf = TfidfTransformer()
        self.__tfidf1 = TfidfTransformer()

        self.__clf = AdaBoostClassifier(MultinomialNB(alpha=BAYES_ALPHA),
                                        n_estimators=ADA_BOOST_ESTIMATORS)
        self.__clf1 = AdaBoostClassifier(MultinomialNB(alpha=BAYES_ALPHA),
                                         n_estimators=ADA_BOOST_ESTIMATORS)

    def __worker(self):
        while True:
            task = self.__queue.get()
            func, args = task
            try:
                func(args)
            except Exception as e:
                print 'EXCEPTION:', e
            self.__queue.task_done()

    def train(self, positiveFiles, negativeFiles):
        cachedData = self.__loadCache()
        if cachedData is None:
            self.__log('loading positives')
            positiveSamples = self.__loadSamples(positiveFiles)
            self.__log('loading negatives')
            negativeSamples = self.__loadSamples(negativeFiles)

            totalDescriptors = []
            self.__addDescriptors(totalDescriptors, positiveSamples)
            self.__addDescriptors(totalDescriptors, negativeSamples)

            self.__kmeans.fit(totalDescriptors)
            clusters = self.__kmeans.predict(totalDescriptors)

            self.__printDistribution(clusters)
            self.__saveCache(
                (positiveSamples, negativeSamples, self.__kmeans, clusters))
        else:
            self.__log('using cache')
            positiveSamples, negativeSamples, self.__kmeans, clusters = cachedData

        totalSamplesNumber = len(negativeSamples) + len(positiveSamples)
        counts = lil_matrix((totalSamplesNumber, self.__clustersNumber))
        counts1 = lil_matrix((totalSamplesNumber, 256))
        self.__currentSample = 0
        self.__currentDescr = 0
        self.__calculteCounts(positiveSamples, counts, counts1, clusters)
        self.__calculteCounts(negativeSamples, counts, counts1, clusters)
        counts = csr_matrix(counts)
        counts1 = csr_matrix(counts1)

        self.__log('training bayes classifier')
        tfidf = self.__tfidf.fit_transform(counts)
        tfidf1 = self.__tfidf1.fit_transform(counts1)
        classes = [True] * len(positiveSamples) + [False] * len(negativeSamples)
        self.__clf.fit(tfidf, classes)
        self.__clf1.fit(tfidf1, classes)

        self.__log('training complete')

    def predict(self, files):
        self.__log('loading files')
        samples = self.__loadSamples(files)
        totalDescriptors = []
        self.__addDescriptors(totalDescriptors, samples)
        self.__log('predicting classes')
        clusters = self.__kmeans.predict(totalDescriptors)
        counts = lil_matrix((len(samples), self.__clustersNumber))
        counts1 = lil_matrix((len(samples), 256))
        self.__currentSample = 0
        self.__currentDescr = 0
        self.__calculteCounts(samples, counts, counts1, clusters)
        counts = csr_matrix(counts)
        counts1 = csr_matrix(counts1)

        tfidf = self.__tfidf.transform(counts)
        tfidf1 = self.__tfidf1.transform(counts1)

        self.__log('classifying')

        weights = self.__clf.predict_log_proba(tfidf.toarray())
        weights1 = self.__clf1.predict_log_proba(tfidf1.toarray())
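        # Column 0 of predict_log_proba is the False class, column 1 is True, so a
        # negative margin w means the positive class is more likely. If the two
        # classifiers disagree, the summed log-odds break the tie.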
        predictions = []
        for i in xrange(0, len(weights)):
            w = weights[i][0] - weights[i][1]
            w1 = weights1[i][0] - weights1[i][1]

            pred = w < 0
            pred1 = w1 < 0

            if pred != pred1:
                pred = w + w1 < 0

            predictions.append(pred)

        self.__log('prediction complete')
        return predictions

    def saveModel(self, fileName):
        data = pickle.dumps(
            (self.__clustersNumber, self.__kmeans, self.__tfidf, self.__tfidf1,
             self.__clf, self.__clf1), -1)
        data = zlib.compress(data)
        open(fileName, 'w').write(data)

    def loadModel(self, fileName):
        data = open(fileName, 'r').read()
        data = zlib.decompress(data)
        data = pickle.loads(data)
        self.__clustersNumber, self.__kmeans, self.__tfidf, self.__tfidf1, self.__clf, self.__clf1 = data

    def __log(self, message):
        if self.__verbose:
            print message

    def __saveCache(self, data):
        if not self.__useCache:
            return
        data = pickle.dumps(data, -1)
        data = zlib.compress(data)
        open('cache.bin', 'w').write(data)

    def __loadCache(self):
        if not self.__useCache:
            return None
        if not os.path.isfile('cache.bin'):
            return None
        data = open('cache.bin', 'r').read()
        data = zlib.decompress(data)
        data = pickle.loads(data)
        return data

    def __calculteCounts(self, samples, counts, counts1, clusters):
        cn = self.__clustersNumber
        for s in samples:
            currentCounts = {}
            for d in s[0]:
                currentCounts[clusters[
                    self.__currentDescr]] = currentCounts.get(
                        clusters[self.__currentDescr], 0) + 1
                self.__currentDescr += 1
            for clu, cnt in currentCounts.iteritems():
                counts[self.__currentSample, clu] = cnt
            for i, histCnt in enumerate(s[1]):
                counts1[self.__currentSample, i] = histCnt[0]
            self.__currentSample += 1

    def __printDistribution(self, clusters):
        if not self.__verbose:
            return
        distr = {}
        for c in clusters:
            distr[c] = distr.get(c, 0) + 1
        v = sorted(distr.values(), reverse=True)
        print 'distribution:', v[0:15], '...', v[-15:]

    def __addDescriptors(self, totalDescriptors, samples):
        for sample in samples:
            for descriptor in sample[0]:
                totalDescriptors.append(descriptor)

    def __loadSamples(self, files):
        samples = [[]] * len(files)
        n = 0
        for f in files:
            self.__queue.put((self.__loadSingleSample, (f, samples, n)))
            n += 1
        self.__queue.join()
        if _g_removed:
            print ' === REMOVED = TERMINATE'
            sys.exit(44)
        return samples

    def __loadSingleSample(self, args):
        global _g_removed
        fileName, samples, sampleNum = args
        des, hist = self.__getFeatures(fileName)
        if des is None:
            print 'ERROR: failed to load', fileName
            os.remove(fileName)
            _g_removed = True
            #sys.exit(44)
            des = []
            hist = [[0]] * 256
        samples[sampleNum] = (des, hist)

    def __getFeatures(self, fileName):
        fid = 'cache/' + str(zlib.crc32(fileName))
        self.__log('loading %s' % fileName)
        if os.path.isfile(fid):
            des, hist = pickle.loads(open(fid, 'rb').read())
        else:
            img = cv2.imread(fileName)

            if img.shape[1] > 1000:
                cf = 1000.0 / img.shape[1]
                newSize = (int(cf * img.shape[0]), int(cf * img.shape[1]),
                           img.shape[2])
                img.resize(newSize)

            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            s = cv2.SIFT(nfeatures=400)

            d = cv2.DescriptorExtractor_create("OpponentSIFT")
            kp = s.detect(gray, None)
            kp, des = d.compute(img, kp)

            hist = self.__getColorHist(img)

            #open(fid, 'wb').write(pickle.dumps((des, hist), -1))

        return des, hist

    def __getColorHist(self, img):
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        dist = cv2.calcHist([hsv], [0], None, [256], [0, 256])
        return dist
testSamples = makeSamples(testFiles)

testDescriptors = []
addDescriptors(testDescriptors, testSamples)

testClusters = kmeans.predict(testDescriptors)
testCounts = lil_matrix((len(testSamples), CLUSTERS_NUMBER))
testCounts1 = lil_matrix((len(testSamples), 256))
calculteCounts(testSamples, testCounts, testCounts1, testClusters)
testCounts = csr_matrix(testCounts)
testCounts1 = csr_matrix(testCounts1)

_tfidf = tfidf.transform(testCounts)
_tfidf1 = tfidf1.transform(testCounts1)

weights = clf.predict_log_proba(_tfidf)
weights1 = clf1.predict_log_proba(_tfidf1)
predictions = []
for i in xrange(0, len(weights)):
  w = weights[i][0] - weights[i][1]
  w1 = weights1[i][0] - weights1[i][1]
  pred = w < 0
  pred1 = w1 < 0
  if pred != pred1:
    pred = w + w1 < 0
  predictions.append(pred)

match = 0
dismatch = 0
if len(testFiles) == len(predictions):
    log = open('log.txt', 'w')
Example #7
File: pcr.py  Project: bakwc/PornDetector
class PCR:

    def __init__(self):
        self.__clustersNumber = CLUSTERS_NUMBER
        self.__queue = Queue()
        self.__verbose = VERBOSE
        self.__useCache = USE_CACHE

        for i in range(FILE_LOAD_THREADS):
            t = Thread(target=self.__worker)
            t.daemon = True
            t.start()

        self.__kmeans = MiniBatchKMeans(
            n_clusters=self.__clustersNumber,
            random_state=CLUSTER_SEED,
            verbose=self.__verbose)
        self.__tfidf = TfidfTransformer()
        self.__tfidf1 = TfidfTransformer()

        self.__clf = AdaBoostClassifier(MultinomialNB(alpha=BAYES_ALPHA), n_estimators=ADA_BOOST_ESTIMATORS)
        self.__clf1 = AdaBoostClassifier(MultinomialNB(alpha=BAYES_ALPHA), n_estimators=ADA_BOOST_ESTIMATORS)

    def __worker(self):
        while True:
            task = self.__queue.get()
            func, args = task
            try:
                func(args)
            except Exception as e:
                print('EXCEPTION:', e)
            self.__queue.task_done()

    def train(self, positiveFiles, negativeFiles):
        cachedData = self.__loadCache()
        if cachedData is None:
            self.__log('loading positives')
            positiveSamples = self.__loadSamples(positiveFiles)
            self.__log('loading negatives')
            negativeSamples = self.__loadSamples(negativeFiles)

            totalDescriptors = []
            self.__addDescriptors(totalDescriptors, positiveSamples)
            self.__addDescriptors(totalDescriptors, negativeSamples)

            self.__kmeans.fit(totalDescriptors)
            clusters = self.__kmeans.predict(totalDescriptors)

            self.__printDistribution(clusters)
            self.__saveCache((positiveSamples, negativeSamples, self.__kmeans, clusters))
        else:
            self.__log('using cache')
            positiveSamples, negativeSamples, self.__kmeans, clusters = cachedData

        totalSamplesNumber = len(negativeSamples) + len(positiveSamples)
        counts = lil_matrix((totalSamplesNumber, self.__clustersNumber))
        counts1 = lil_matrix((totalSamplesNumber, 256))
        self.__currentSample = 0
        self.__currentDescr = 0
        self.__calculteCounts(positiveSamples, counts, counts1, clusters)
        self.__calculteCounts(negativeSamples, counts, counts1, clusters)
        counts = csr_matrix(counts)
        counts1 = csr_matrix(counts1)

        self.__log('training bayes classifier')
        tfidf = self.__tfidf.fit_transform(counts)
        tfidf1 = self.__tfidf1.fit_transform(counts1)
        classes = [True] * len(positiveSamples) + [False] * len(negativeSamples)
        self.__clf.fit(tfidf, classes)
        self.__clf1.fit(tfidf1, classes)

        self.__log('training complete')

    def predict(self, files):
        self.__log('loading files')
        samples = self.__loadSamples(files)
        totalDescriptors = []
        self.__addDescriptors(totalDescriptors, samples)
        self.__log('predicting classes')
        clusters = self.__kmeans.predict(totalDescriptors)
        counts = lil_matrix((len(samples), self.__clustersNumber))
        counts1 = lil_matrix((len(samples), 256))
        self.__currentSample = 0
        self.__currentDescr = 0
        self.__calculteCounts(samples, counts, counts1, clusters)
        counts = csr_matrix(counts)
        counts1 = csr_matrix(counts1)

        tfidf = self.__tfidf.transform(counts)
        tfidf1 = self.__tfidf1.transform(counts1)

        self.__log('classifying')

        weights = self.__clf.predict_log_proba(tfidf.toarray())
        weights1 = self.__clf1.predict_log_proba(tfidf1.toarray())
        predictions = []
        for i in range(0, len(weights)):
            w = weights[i][0] - weights[i][1]
            w1 = weights1[i][0] - weights1[i][1]

            pred = w < 0
            pred1 = w1 < 0

            if pred != pred1:
                pred = w + w1 < 0

            predictions.append(pred)

        self.__log('prediction complete')
        return predictions

    def saveModel(self, fileName):
        data = pickle.dumps((self.__clustersNumber, self.__kmeans, self.__tfidf,
                             self.__tfidf1, self.__clf, self.__clf1), -1)
        data = zlib.compress(data)
        open(fileName, 'wb').write(data)

    def loadModel(self, fileName):
        data = open(fileName, 'rb').read()
        data = zlib.decompress(data)
        data = pickle.loads(data)
        self.__clustersNumber, self.__kmeans, self.__tfidf, self.__tfidf1, self.__clf, self.__clf1 = data

    def __log(self, message):
        if self.__verbose:
            print(message)

    def __saveCache(self, data):
        if not self.__useCache:
            return
        data = pickle.dumps(data, -1)
        data = zlib.compress(data)
        open('cache.bin', 'wb').write(data)  # compressed bytes need binary mode

    def __loadCache(self):
        if not self.__useCache:
            return None
        if not os.path.isfile('cache.bin'):
            return None
        data = open('cache.bin', 'rb').read()
        data = zlib.decompress(data)
        data = pickle.loads(data)
        return data

    def __calculteCounts(self, samples, counts, counts1, clusters):
        cn = self.__clustersNumber
        for s in samples:
            currentCounts = {}
            for d in s[0]:
                currentCounts[clusters[self.__currentDescr]] = currentCounts.get(clusters[self.__currentDescr], 0) + 1
                self.__currentDescr += 1
            for clu, cnt in currentCounts.items():
                counts[self.__currentSample, clu] = cnt
            for i, histCnt in enumerate(s[1]):
                counts1[self.__currentSample, i] = histCnt[0]
            self.__currentSample += 1

    def __printDistribution(self, clusters):
        if not self.__verbose:
            return
        distr = {}
        for c in clusters:
            distr[c] = distr.get(c, 0) + 1
        v = sorted(distr.values(), reverse=True)
        print('distribution:', v[0:15], '...', v[-15:])

    def __addDescriptors(self, totalDescriptors, samples):
        for sample in samples:
            for descriptor in sample[0]:
                totalDescriptors.append(descriptor)

    def __loadSamples(self, files):
        samples = [[]] * len(files)
        n = 0
        for f in files:
            self.__queue.put((self.__loadSingleSample, (f, samples, n)))
            n += 1
        self.__queue.join()
        if _g_removed:
            print(' === REMOVED = TERMINATE')
            sys.exit(44)
        return samples

    def __loadSingleSample(self, args):
        global _g_removed
        fileName, samples, sampleNum = args
        des, hist = self.__getFeatures(fileName)
        if des is None:
            print('ERROR: failed to load', fileName)
            os.remove(fileName)
            _g_removed = True
            # sys.exit(44)
            des = []
            hist = [[0]] * 256
        samples[sampleNum] = (des, hist)

    def __getFeatures(self, fileName):
        fid = 'cache/' + str(zlib.crc32(fileName.encode()))  # crc32 expects bytes in Python 3
        self.__log('loading %s' % fileName)
        if os.path.isfile(fid):
            des, hist = pickle.loads(open(fid, 'rb').read())
        else:
            img = cv2.imread(fileName)

            if img.shape[1] > 1000:
                cf = 1000.0 / img.shape[1]
                newSize = (int(cf * img.shape[0]), int(cf * img.shape[1]), img.shape[2])
                img.resize(newSize)

            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            s = cv2.SIFT(nfeatures=400)

            d = cv2.DescriptorExtractor_create("OpponentSIFT")
            kp = s.detect(gray, None)
            kp, des = d.compute(img, kp)

            hist = self.__getColorHist(img)

            #open(fid, 'wb').write(pickle.dumps((des, hist), -1))

        return des, hist

    def __getColorHist(self, img):
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        dist = cv2.calcHist([hsv], [0], None, [256], [0, 256])
        return dist
Example #8
confusionmatrix=confusion_matrix(ypred,ytest)
print(confusionmatrix)
rmse=math.sqrt(mean_squared_error(ypred,ytest))
print(rmse)
plt.plot(ypred)
plt.show()
from sklearn.ensemble import AdaBoostClassifier
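# With no base estimator given, AdaBoostClassifier defaults to depth-1 decision stumps.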
adc=AdaBoostClassifier(random_state=0,learning_rate=1.0)
print(adc.fit(xtrain,ytrain))
ypred=adc.predict(xtest)
ypred1=adc.predict(xtrain)
print(ypred)
print(list(le.inverse_transform(ypred)))
print(classification_report(ypred,ytest))
print(adc.predict_proba(xtest))
print(adc.predict_log_proba(xtest))
print(accuracy_score(ytest,ypred))
print(accuracy_score(ytrain,ypred1))
import xgboost as xgb
import lightgbm as lgb 
from xgboost import plot_importance 
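# max_depth=0 lifts the depth limit; with grow_policy="lossguide" tree growth is capped by max_leaves instead.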
xgb1=xgb.XGBClassifier(booster='gbtree',n_jobs=-1,n_estimators=500,max_depth=0,learning_rate=0.3,random_state=14,max_leaves=5,grow_policy="lossguide")
print(xgb1.fit(xtrain,ytrain))
ypred=xgb1.predict(xtest)
ypred1=xgb1.predict(xtrain)
print(ypred)
print(xgb1.predict_proba(xtest))
print(list(le.inverse_transform(ypred)))
print(accuracy_score(ytest,ypred))
print(accuracy_score(ytrain,ypred1))
lgb1=lgb.LGBMClassifier(boosting_type="gbdt",num_leaves=5,n_estimators=500,n_jobs=-1,learning_rate=0.3,max_depth=0,random_state=14)