Example #1
    def train(self, train_data, train_label):
        if self.norm_type == "Standardization":
            train_data = preProcess.Standardization(train_data)
        else:
            train_data = preProcess.Normalization(train_data)

        skf = StratifiedKFold(self.n_folds)
        prediction_feature = np.zeros(
            (train_data.shape[0], len(self.classifier_set)))
        trained_model = []

        # the first layer in Stacking
        for j, clf in enumerate(self.classifier_set):
            # train each submodel
            subtrained_model = []
            # cross validation
            for (train_index, test_index) in skf.split(train_data,
                                                       train_label):
                X_train, X_test = train_data[train_index], train_data[
                    test_index]
                y_train, y_test = train_label[train_index], train_label[
                    test_index]
                # train on the folds excluding s_i and save a snapshot of the model
                # (requires `import copy` at module level; without the copy every
                # fold would reference the same, last-trained classifier object)
                clf.train(X_train, y_train)
                subtrained_model.append(copy.deepcopy(clf))
                # get the prediction feature for each sub model
                prediction_feature[test_index, j] = clf.predict(X_test)[:, 0]
            # save the models
            trained_model.append(subtrained_model)

        self.trained_classifier_set = trained_model
        return self
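
A minimal driver for the stacking trainer above; the StackingClassifier wrapper name, its constructor arguments, and the KNNClassifier/DecisionTreeClassifier base learners are assumptions for illustration (only PerceptronClassifier appears elsewhere in these examples):

    # hypothetical usage: build the first stacking layer from three base learners
    model = StackingClassifier(classifier_set=[KNNClassifier(),
                                               PerceptronClassifier(),
                                               DecisionTreeClassifier()],
                               n_folds=5, norm_type="Normalization")
    model.train(train_data, train_label)   # fills model.trained_classifier_set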
Example #2
    def train(self, train_data, train_label, method="GA", alpha=0.1, iterations=100):
        if self.norm_type == "Standardization":
            train_data = preProcess.Standardization(train_data)
        else:
            train_data = preProcess.Normalization(train_data)

        train_label = np.expand_dims(train_label, axis=1)
        feature_dim = np.shape(train_data)[1]

        if method == "GA":
            weights = np.random.normal(0, 1, [feature_dim, 1])
            for i in range(iterations):
                pred = self.sigmoid(np.dot(train_data, weights))
                errors = train_label - pred
                # update the weights
                weights = weights + alpha * np.dot(train_data.T, errors)
            self.weights = weights
            return self

        if method == "SGA":
            weights = np.random.normal(0, 1, feature_dim)
            sample_num = len(train_data)
            random_index = np.random.randint(sample_num, size=sample_num)
            for i in range(iterations):
                for j in range(sample_num):
                    alpha = self.updataAlpha(alpha, i, 1)
                    pred = self.sigmoid(np.dot(train_data[random_index[j], :], weights))
                    sample_error = train_label[random_index[j]] - pred
                    weights = weights + alpha * sample_error * train_data[random_index[j], :]

            self.weights = weights
            return self
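
The GA branch performs batch gradient ascent on the log-likelihood, w <- w + alpha * X^T (y - sigmoid(Xw)). A self-contained sketch of the same update on toy data (all names and the 1/N scaling are illustrative):

    import numpy as np

    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    rng = np.random.default_rng(0)
    X = rng.normal(size=(100, 3))
    y = (X @ np.array([[1.5], [-2.0], [0.5]]) > 0).astype(float)  # toy labels in {0, 1}
    w = rng.normal(size=(3, 1))
    for _ in range(100):
        w += 0.1 * X.T @ (y - sigmoid(X @ w)) / len(X)  # one batch gradient ascent step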
Example #3
    def predict(self, test_data, prob=False):
        # Normalization
        if self.norm_type == "Standardization":
            test_data = preProcess.Standardization(test_data)
        else:
            test_data = preProcess.Normalization(test_data)

        test_num = test_data.shape[0]
        prediction = np.zeros([test_num, 1])
        probability = np.zeros([test_num, 1])
        for i in range(test_num):

            inter_1 = test_data[i] * self.V
            inter_2 = np.multiply(test_data[i], test_data[i]) * np.multiply(self.V, self.V)
            interaction = np.sum(np.multiply(inter_1, inter_1) - inter_2) / 2.
            pre = self.w_0 + test_data[i] * self.W + interaction
            probability[i] = self.sigmoid(float(pre))

            if probability[i] > 0.5:
                prediction[i] = 1
            else:
                prediction[i] = -1   # labels follow the {-1, +1} convention used in training

        self.prediction = prediction
        self.probability = probability
        if prob:
            return probability
        else:
            return prediction
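
The interaction term above relies on the factorization-machine identity sum_{i<j} <v_i, v_j> x_i x_j = 1/2 * sum_f [(sum_i v_if x_i)^2 - sum_i v_if^2 x_i^2], which turns an O(n^2 k) pairwise sum into O(n k). A standalone NumPy check of that identity (toy shapes, illustrative only):

    import numpy as np

    rng = np.random.default_rng(1)
    x, V = rng.normal(size=4), rng.normal(size=(4, 3))       # 4 features, 3 latent factors
    naive = sum(V[i] @ V[j] * x[i] * x[j]
                for i in range(4) for j in range(i + 1, 4))  # O(n^2 k) pairwise sum
    fast = 0.5 * np.sum((x @ V) ** 2 - (x ** 2) @ (V ** 2))  # O(n k) reformulation
    assert np.isclose(naive, fast)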
Example #4
    def predict(self, test_data, prob=False):
        # Normalization
        if self.norm_type == "Standardization":
            test_data = preProcess.Standardization(test_data)
        else:
            test_data = preProcess.Normalization(test_data)

        test_num = test_data.shape[0]
        prediction = np.zeros([test_num, 1])
        probability = np.zeros([test_num, 1])

        # find the support vectors and their corresponding labels
        support_vectors_index = np.nonzero(self.alphas > 0)[0]
        support_vectors = self.train_data[support_vectors_index]
        support_vectors_label = self.train_label[support_vectors_index]
        support_vectors_alphas = self.alphas[support_vectors_index]

        # predict each test sample per Eq. (7.89) on page 122
        for i in range(test_num):
            kernel_data = self.kernelTransformation(support_vectors, test_data[i, :], self.kernel)
            probability[i] = np.dot(kernel_data.T, np.multiply(support_vectors_label, support_vectors_alphas)) + self.b
            if probability[i] > 0:
                prediction[i] = 1
            else:
                prediction[i] = -1

        self.prediction = prediction
        self.probability = probability
        if prob:
            return probability
        else:
            return prediction
Example #5
    def train(self, train_data, train_label):
        if self.norm_type == "Standardization":
            train_data = preProcess.Standardization(train_data)
        else:
            train_data = preProcess.Normalization(train_data)

        # initialization
        sample_num, feature_dim = np.shape(train_data)
        self.train_data = train_data
        self.train_label = self.labelTransformation(train_label)
        self.sample_num = sample_num
        self.K = np.zeros([self.sample_num, self.sample_num])
        self.alphas = np.zeros([self.sample_num, 1])
        self.errors = np.zeros([self.sample_num, 2])
        self.b = 0

        # kernel trick
        for i in range(self.sample_num):
            self.K[:, i] = self.kernelTransformation(self.train_data,
                                                     self.train_data[i, :],
                                                     self.kernel)

        # train model
        self.SMO()
        return self
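
The kernelTransformation helper is not shown in these examples; a minimal sketch of what it plausibly computes, one column of kernel values K(x_i, z), assuming "linear"/"rbf" kernel names and a self.sigma bandwidth attribute:

    def kernelTransformation(self, data, sample, kernel):
        # returns the vector of kernel evaluations K(data[i], sample)
        if kernel == "linear":
            return np.dot(data, sample)
        elif kernel == "rbf":
            diff = data - sample
            return np.exp(-np.sum(np.multiply(diff, diff), axis=1) / (2 * self.sigma ** 2))
        raise ValueError("unsupported kernel: %s" % kernel)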
Example #6
    def train(self, train_data, train_label, alpha=0.01, iterations=100):
        if self.norm_type == "Standardization":
            train_data = preProcess.Standardization(train_data)
        else:
            train_data = preProcess.Normalization(train_data)

        for epoch in range(iterations):
            for id in range(self.sample_num):

                # second order computation
                inter_1 = train_data[id] * self.V
                inter_2 = np.multiply(train_data[id], train_data[id]) * np.multiply(self.V, self.V)
                interaction = np.sum(np.multiply(inter_1, inter_1) - inter_2) / 2.

                # prediction result
                pred = self.w_0 + train_data[id] * self.W + interaction

                # gradient of the log-likelihood loss -ln(sigmoid(y * pred)),
                # assuming labels follow the {-1, +1} convention
                base = (self.sigmoid(train_label[id] * float(pred)) - 1) * train_label[id]

                # update the bias parameter
                self.w_0 -= alpha * base

                for i in range(self.n):
                    # update the first-order parameter
                    if train_data[id, i] != 0:
                        self.W[i] -= alpha * base * train_data[id, i]
                        for f in range(self.V.shape[1]):
                            # update the second-order (latent factor) parameter,
                            # reusing inter_1[0, f] = sum_j V[j, f] * x[j]
                            self.V[i, f] -= alpha * base * (
                                    train_data[id, i] * inter_1[0, f]
                                    - self.V[i, f] * train_data[id, i] * train_data[id, i])

        return self
Example #7
    def train(self, train_data, train_label):
        if self.norm_type == "Standardization":
            train_data = preProcess.Standardization(train_data)
        else:
            train_data = preProcess.Normalization(train_data)

        train_data1, train_data2, train_label1, train_label2 = train_test_split(
            train_data, train_label, test_size=0.5, random_state=2019)
        # train set in the second layer
        train_predict_feature = np.zeros((train_data2.shape[0], self.k))
        trained_model = []

        # the first layer in Blending
        for j, clf in enumerate(self.classifier_set):
            # train each submodel
            print(j, clf)
            clf.train(train_data1, train_label1)
            train_predict_feature[:, j] = clf.predict(train_data2)[:, 0]
            # save the trained model in the first layer
            trained_model.append(clf)

        # the second layer in Blending
        layer2_clf = PerceptronClassifier()
        layer2_clf.train(train_predict_feature, train_label2)

        self.layer1_classifier_set = trained_model
        self.layer2_classifier = layer2_clf

        return self
Example #8
    def predict(self, test_data, prob=False):
        # Normalization
        if self.norm_type == "Standardization":
            test_data = preProcess.Standardization(test_data)
        else:
            test_data = preProcess.Normalization(test_data)

        test_num = test_data.shape[0]
        prediction = np.zeros([test_num, 1])
        probability = np.zeros([test_num, 1])
        for i in range(test_num):
            probability[i] = self.sigmoid(
                np.dot(self.w.T, test_data[i, :]) + self.b
            )  # the vectorized self.sigmoid(np.dot(test_data, self.w) + self.b) can speed this up
            if probability[i] > 0.5:   # sigmoid output is always positive, so threshold at 0.5
                prediction[i] = 1
            else:
                prediction[i] = -1

        self.prediction = prediction
        self.probability = probability
        if prob:
            return probability
        else:
            return prediction
Example #9
    def train(self, train_data, train_label):
        if self.norm_type == "Standardization":
            train_data = preProcess.Standardization(train_data)
        else:
            train_data = preProcess.Normalization(train_data)
        self.x_train = train_data
        self.y_train = train_label
        return self
Example #10
    def cluster(self, train_data, display=True):
        if self.norm_type == "Standardization":
            train_data = preProcess.Standardization(train_data)
        else:
            train_data = preProcess.Normalization(train_data)

        sample_num = len(train_data)
        label = -np.ones([sample_num])
        center_flag = np.zeros([sample_num])  # indicates which samples are cluster centers
        center_index = []  # cluster center indices
        C = 0

        # start clustering
        for i in range(sample_num):
            temp_neighbor = []
            if label[i] != -1:  # skip samples that have already been visited
                continue
            index1, neighbor1 = self.neighborQuery(train_data,
                                                   train_data[i, :])
            if len(neighbor1) < self.m:
                label[i] = 0  # sample i is noise point
                continue  # stop this iteration

            C = C + 1
            label[i] = C
            center_flag[i] = 1
            temp_neighbor.append(neighbor1)

            # all the samples in i-th neighbor belong to label C
            center_index = index1
            j = 0
            while j < len(center_index):
                loc = center_index[j]
                if center_flag[loc] != 1:  # in the neighborhood but not a cluster center yet
                    if label[loc] == 0:  # former noise point becomes a border point
                        label[loc] = C
                    if label[loc] != -1:  # already assigned to a cluster
                        j = j + 1
                        continue
                    label[loc] = C
                    index2, neighbor2 = self.neighborQuery(
                        train_data, train_data[loc, :])
                    if len(neighbor2) >= self.m:
                        # merge the directly density-reachable neighborhood into the cluster
                        center_index = list(set(center_index).union(set(index2)))
                        # a sample with at least m neighbors becomes a cluster center
                        center_flag[loc] = 1
                j = j + 1
        self.label = label

        if display:
            self.plotResult(train_data)
        return label
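
The neighborQuery helper is assumed to return the indices and rows of all samples within a radius of the query point; a minimal sketch under that assumption (the self.eps name for the radius is inferred from the usual DBSCAN parameters):

    def neighborQuery(self, train_data, sample):
        # epsilon-neighborhood: all samples within Euclidean distance self.eps
        distances = np.sqrt(np.sum((train_data - sample) ** 2, axis=1))
        index = np.where(distances <= self.eps)[0]
        return list(index), train_data[index]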
Example #11
    def predict(self, x, prob=False):
        # Normalization
        if self.norm_type == "Standardization":
            x = preProcess.Standardization(x)
        else:
            x = preProcess.Normalization(x)

        y = np.dot(x, self.w)
        self.prediction = y
        return y
Example #12
    def standardLinearRegression(self, x, y):
        if self.norm_type == "Standardization":
            x = preProcess.Standardization(x)
        else:
            x = preProcess.Normalization(x)

        xTx = np.dot(x.T, x)
        if np.linalg.det(xTx) == 0:   # xTx is not invertible if its determinant is 0
            print("Error: Singular Matrix!")
            return
        w = np.dot(np.linalg.inv(xTx), np.dot(x.T, y))
        return w
Example #13
    def lassoRegression(self, x, y):
        if self.norm_type == "Standardization":
            x = preProcess.Standardization(x)
        else:
            x = preProcess.Normalization(x)

        sample_num, feature_dim = np.shape(x)
        y = np.reshape(y, [sample_num, 1])
        w = np.zeros([feature_dim, 1])
        for it in range(self.iterations):                   # coordinate descent
            for j in range(feature_dim):
                # residual with the contribution of feature j removed
                residual = y - np.dot(x, w) + np.dot(x[:, j:j + 1], w[j:j + 1, :])
                rho = float(np.dot(x[:, j].T, residual))
                z = float(np.dot(x[:, j].T, x[:, j]))
                # soft thresholding with regularization strength self.lamda
                w[j] = np.sign(rho) * max(abs(rho) - self.lamda, 0) / z
        return w
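
For reference, the coordinate update above is the standard soft-thresholding solution of the Lasso subproblem (a sketch of the math, assuming the plain objective ||y - Xw||^2 + lambda * ||w||_1):

    w_j \leftarrow \frac{S(\rho_j, \lambda)}{z_j}, \quad
    S(\rho, \lambda) = \operatorname{sign}(\rho)\,\max(|\rho| - \lambda, 0), \quad
    \rho_j = x_j^{T}\bigl(y - Xw + x_j w_j\bigr), \quad z_j = x_j^{T} x_j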
Example #14
    def train(self, trainData, trainLabel):
        if self.norm_type == "Standardization":
            trainData = preProcess.Standardization(trainData)
        else:
            trainData = preProcess.Normalization(trainData)

        trainLabel = np.expand_dims(trainLabel, axis=1)
        data = np.hstack([trainData, trainLabel])

        self.tree_node = self.createDecisionTree(data)
        #self.printTree(self.tree_node)
        return self
Example #15
    def train(self, data, label):
        # Normalization
        if self.norm_type == "Standardization":
            data = preProcess.Standardization(data)
        else:
            data = preProcess.Normalization(data)
        unique_label = np.unique(label)
        mu = np.mean(data, axis=0)
        # St = np.dot((data - mu).T, data - mu)

        Sw = 0
        Sb = 0
        for c in unique_label:
            index = np.where(label == c)[0]
            Ni = len(index)
            xi = data[index]
            mui = np.mean(xi, axis=0)

            # calculate Sw
            Si = np.dot((xi - mui).T, xi - mui)
            Sw = Sw + Si

            # calculate Sb
            delta = np.expand_dims(mu - mui, axis=1)
            Sb = Sb + Ni * np.dot(delta, delta.T)

        # calculate the eigenvalues and eigenvectors of Sw^-1 * Sb
        eigenvalue, eigenvector = np.linalg.eig(np.dot(np.linalg.inv(Sw), Sb))

        index = np.argsort(-eigenvalue)
        eigenvalue = eigenvalue[index]
        eigenvector = eigenvector[:, index]
        # calculate contribute rate
        contribute_rate = np.zeros(len(index))
        acc_contribute_rate = np.zeros(len(index))
        value_sum = eigenvalue.sum()
        acc = 0
        k = 0
        for i in range(len(eigenvalue)):
            acc = acc + eigenvalue[i]
            contribute_rate[i] = eigenvalue[i] / value_sum
            acc_contribute_rate[i] = acc / value_sum
            # k marks the first component where the accumulated rate reaches the threshold
            if acc_contribute_rate[i] >= self.rate and (
                    i == 0 or acc_contribute_rate[i - 1] < self.rate):
                k = i

        self.contribute_rate = contribute_rate
        self.acc_contribute_rate = acc_contribute_rate

        matrix = np.mat(eigenvector)[:, :k + 1]  # keep the leading components up to the threshold
        self.matrix = matrix
        return self
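
Since self.matrix holds the selected eigenvectors, dimensionality reduction afterwards is a single projection. A short usage sketch (the LDA class name and constructor arguments are assumptions):

    # hypothetical usage: project the data onto the retained LDA directions
    lda = LDA(norm_type="Normalization", rate=0.9)
    lda.train(data, label)
    reduced = np.dot(data, lda.matrix)   # shape: (sample_num, k + 1)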
Example #16
    def train(self, train_data, train_label):
        if self.norm_type == "Standardization":
            train_data = preProcess.Standardization(train_data)
        else:
            train_data = preProcess.Normalization(train_data)

        label_count = {}
        feature_dim = np.shape(train_data)[1]

        # get the number of each labels
        for c in train_label:
            label_count[c] = label_count.get(c, 0) + 1
        label_value = sorted(label_count.items(), key=op.itemgetter(0), reverse=False)
        self.label_value = label_value

        K = len(label_value)         # the number of unique labels
        N = len(train_label)         # the number of samples

        # get the prior probability
        prior_probability = {}
        for key in range(len(label_value)):
            prior_probability[label_value[key][0]] = (label_value[key][1] + self.laplace) / (N + K * self.laplace)  # laplace smooth
        self.prior_probability = prior_probability

        # get the value set of each feature
        feature_value = []  # feature with different value
        S = []  # the number of unique values of each feature
        for feat in range(feature_dim):
            unique_feature = np.unique(train_data[:, feat])
            S.append(len(unique_feature))
            feature_value.append(unique_feature)
        self.S = S
        self.feature_value = feature_value

        # calculate the conditional probability
        prob = []
        # calculate the count (x = a & y = c)
        for j in range(feature_dim):
            count = np.zeros([S[j], len(label_count)])  # count of (x = a, y = c) pairs for feature j
            feature_temp = train_data[:, j]
            feature_value_temp = feature_value[j]
            for i in range(len(feature_temp)):
                for k in range(len(feature_value_temp)):
                    for t in range(len(label_count)):
                        if feature_temp[i] == feature_value_temp[k] and train_label[i] == label_value[t][0]:
                            count[k][t] += 1             # x = value and y = label
            # calculate the conditional probability
            for m in range(len(label_value)):
                count[:, m] = (count[:, m] + self.laplace) / (label_value[m][1] + self.laplace*S[j])  # laplace smoothing
            # print(count)
            prob.append(count)
        self.conditional_probability = prob
        return self
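
A matching predict step would score each class as prior * product of conditional probabilities and take the argmax. A minimal sketch reusing the structures built above (illustrative, and it assumes every feature value of the sample was seen during training):

    def predictSample(self, sample):
        best_label, best_score = None, -1.0
        for t in range(len(self.label_value)):
            label = self.label_value[t][0]
            score = self.prior_probability[label]                # P(y = c)
            for j in range(len(self.feature_value)):
                # position of this sample's value in feature j's value set
                k = np.where(self.feature_value[j] == sample[j])[0][0]
                score *= self.conditional_probability[j][k][t]   # P(x_j = a | y = c)
            if score > best_score:
                best_label, best_score = label, score
        return best_label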
Example #17
    def ridgeRegression(self, x, y):
        if self.norm_type == "Standardization":
            x = preProcess.Standardization(x)
        else:
            x = preProcess.Normalization(x)

        feature_dim = len(x[0])
        xTx = np.dot(x.T, x)
        matrix = xTx + np.eye(feature_dim) * self.lamda   # ridge: xTx + lamda * identity
        if np.linalg.det(matrix) == 0:
            print("Error: Singular Matrix!")
            return
        w = np.dot(np.linalg.inv(matrix), np.dot(x.T, y))
        return w
Example #18
    def LWLinearRegression(self, x, y, sample):
        if self.norm_type == "Standardization":
            x = preProcess.Standardization(x)
        else:
            x = preProcess.Normalization(x)

        sample_num = len(x)
        weights = np.eye(sample_num)
        for i in range(sample_num):
            diff = sample - x[i, :]
            weights[i, i] = np.exp(np.dot(diff, diff.T)/(-2 * self.k ** 2))
        xTx = np.dot(x.T, np.dot(weights, x))
        if np.linalg.det(xTx) == 0:
            print("Error: Singular Matrix!")
            return
        result = np.dot(np.linalg.inv(xTx), np.dot(x.T, np.dot(weights, y)))
        return result
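
Because locally weighted regression fits a new weight vector for every query point, prediction has to loop over the test samples. A minimal driver sketch (the model and data names are illustrative):

    # hypothetical usage: fit one local model per query point
    predictions = np.zeros(len(test_data))
    for i in range(len(test_data)):
        w = model.LWLinearRegression(train_data, train_label, test_data[i])
        predictions[i] = float(np.dot(test_data[i], w))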
Example #19
    def cluster(self, train_data, display=True):
        if self.norm_type == "Standardization":
            train_data = preProcess.Standardization(train_data)
        else:
            train_data = preProcess.Normalization(train_data)

        if self.cluster_type == "KMeans":
            self.centers, self.distances = self.kmeans(train_data, self.k)
        elif self.cluster_type == "biKMeans":
            self.centers, self.distances = self.biKmeans(train_data)
        elif self.cluster_type == "KMeans++":
            self.centers, self.distances = self.kmeansplusplus(train_data)
        else:
            print("Wrong cluster type!")
            sys.exit()
        if display:
            self.plotResult(train_data)
        return self.distances[:, 0]
Example #20
    def predict(self, test_data):
        # Normalization
        if self.norm_type == "Standardization":
            test_data = preProcess.Standardization(test_data)
        else:
            test_data = preProcess.Normalization(test_data)

        test_num = test_data.shape[0]
        prediction = np.zeros([test_num, 1])
        probability = np.zeros([test_num, 1])
        # predict each samples in test data
        for i in range(test_num):
            prediction[i], probability[i] = self.calculateDistance(
                test_data[i], self.x_train, self.y_train, self.k)

        self.prediction = prediction
        self.probability = probability

        return prediction
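
The calculateDistance helper is not shown; a minimal sketch consistent with how it is called above, a Euclidean k-nearest-neighbour majority vote returning (label, vote share) (the helper body is an assumption):

    def calculateDistance(self, sample, x_train, y_train, k):
        # Euclidean distances from the query sample to every training point
        distances = np.sqrt(np.sum((x_train - sample) ** 2, axis=1))
        neighbor_labels = y_train[np.argsort(distances)[:k]]
        values, counts = np.unique(neighbor_labels, return_counts=True)
        best = np.argmax(counts)
        return values[best], counts[best] / float(k)   # majority label and vote share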
Example #21
    def train(self, train_data, train_label):
        if self.norm_type == "Standardization":
            train_data = preProcess.Standardization(train_data)
        else:
            train_data = preProcess.Normalization(train_data)

        if self.regression_type == "Standard":
            self.w = self.standardLinearRegression(train_data, train_label)
        elif self.regression_type == "Localweight":
            self.w = self.LWLinearRegression(train_data, train_label)
        elif self.regression_type == "Ridge":
            self.w = self.ridgeRegression(train_data, train_label)
        elif self.regression_type == "Lasso":
            self.w = self.lassoRegression(train_data, train_label)
        elif self.regression_type == "Forwardstep":
            self.w = self.forwardstepRegression(train_data, train_label)
        else:
            print("Error Regression Type!")
        return self
Example #22
    def train(self, train_data, train_label):
        if self.norm_type == "Standardization":
            train_data = preProcess.Standardization(train_data)
        else:
            train_data = preProcess.Normalization(train_data)

        train_label = np.expand_dims(train_label, axis=1)
        sample_num = len(train_data)

        weak_classifier = []

        # initialize weights
        w = np.ones([sample_num, 1])
        w = w / sample_num

        # predictions
        agg_predicts = np.zeros([sample_num, 1])  # aggregated value of the predictions

        # start training
        for i in range(self.iterations):
            base_clf, error, base_prediction = self.baseClassifier(
                train_data, train_label, w)
            alpha = self.updateAlpha(error)
            weak_classifier.append((alpha, base_clf))

            # update the sample weights per Eq. (8.4) on page 139
            expon = np.multiply(-1 * alpha * train_label, base_prediction)
            w = np.multiply(w, np.exp(expon))
            w = w / w.sum()

            # calculate the total error rate
            agg_predicts += alpha * base_prediction
            error_rate = np.multiply(
                np.sign(agg_predicts) != train_label, np.ones([sample_num, 1]))
            error_rate = error_rate.sum() / sample_num

            if error_rate == 0:
                break

        self.classifier_set = weak_classifier
        return weak_classifier
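
The updateAlpha helper is not shown; a minimal sketch consistent with the standard AdaBoost classifier weight (the small constant guarding against a zero error is an assumption):

    def updateAlpha(self, error):
        # alpha = 0.5 * ln((1 - e) / e), clipped to avoid division by zero
        return float(0.5 * np.log((1.0 - error) / max(float(error), 1e-16)))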
Example #23
    def predict(self, test_data, prob=False):
        # Normalization
        if self.norm_type == "Standardization":
            test_data = preProcess.Standardization(test_data)
        else:
            test_data = preProcess.Normalization(test_data)

        test_num = test_data.shape[0]
        prediction = np.zeros([test_num, 1])
        probability = np.zeros([test_num, 1])
        for i in range(test_num):
            result = self.classify(test_data[i, :], self.tree_node)
            result = sorted(result.items(), key=op.itemgetter(1), reverse=True)
            prediction[i] = result[0][0]
            #probability[i] = result[0][1]/(result[0][1] + result[1][1])
        self.prediction = prediction
        self.probability = probability
        if prob:
            return probability
        else:
            return prediction
Example #24
    def predict(self, test_data, prob=False):
        # Normalization
        if self.norm_type == "Standardization":
            test_data = preProcess.Standardization(test_data)
        else:
            test_data = preProcess.Normalization(test_data)

        test_predict_feature = np.zeros((test_data.shape[0], self.k))
        # the first layer in Blending
        for j, clf in enumerate(self.layer1_classifier_set):
            test_predict_feature[:, j] = clf.predict(test_data)[:, 0]

        # the second layer in Blending
        probability = self.layer2_classifier.predict(test_predict_feature)
        prediction = (probability > 0.5) * 1

        self.probability = probability
        self.prediction = prediction
        if prob:
            return probability
        else:
            return prediction
Example #25
    def forwardstepRegression(self, x, y):
        if self.norm_type == "Standardization":
            x = preProcess.Standardization(x)
        else:
            x = preProcess.Normalization(x)

        sample_num, feature_dim = np.shape(x)
        w = np.zeros([self.iterations, feature_dim])
        best_w = np.zeros([feature_dim, 1])
        for i in range(self.iterations):
            min_error = np.inf
            for j in range(feature_dim):
                for sign in [-1, 1]:
                    temp_w = best_w.copy()   # copy so the trial update does not mutate best_w
                    temp_w[j] += sign * self.learning_rate
                    y_hat = np.dot(x, temp_w)
                    error = ((y - y_hat) ** 2).sum()                # MSE
                    if error < min_error:                           # save the best parameters
                        min_error = error
                        best_w = temp_w
            w[i, :] = best_w.T
        return w
Example #26
    def predict(self, test_data, prob=False):
        # Normalization
        if self.norm_type == "Standardization":
            test_data = preProcess.Standardization(test_data)
        else:
            test_data = preProcess.Normalization(test_data)

        test_num = test_data.shape[0]
        prediction = np.zeros([test_num, 1])
        probability = np.zeros([test_num, 1])

        for classifier in self.classifier_set:
            alpha = classifier[0]
            clf = classifier[1]
            base_prediction = alpha * clf.predict(test_data)
            probability += base_prediction

        prediction = np.sign(probability)
        self.prediction = prediction
        self.probability = probability
        if prob:
            return probability
        else:
            return prediction
Example #27
    def predict(self, test_data, prob=False):
        # Normalization
        if self.norm_type == "Standardization":
            test_data = preProcess.Standardization(test_data)
        else:
            test_data = preProcess.Normalization(test_data)

        pre_prediction = np.zeros((test_data.shape[0], len(self.trained_classifier_set)))  # one column per first-layer classifier
        # the first layer in Stacking
        for j, sub_model in enumerate(self.trained_classifier_set):
            sub_prediction_feature = np.zeros(
                (test_data.shape[0], self.n_folds))
            i = 0
            for clf in sub_model:
                sub_prediction_feature[:, i] = clf.predict(test_data)[:, 0]
                i = i + 1
            pre_prediction[:, j] = sub_prediction_feature.mean(1)

        test_num = test_data.shape[0]
        prediction = np.zeros([test_num, 1])
        probability = np.zeros([test_num, 1])
        # the second layer in Stacking
        if self.fusion_type == "Averaging":
            probability = pre_prediction.mean(1)
        elif self.fusion_type == "Voting":
            probability = np.sum(pre_prediction, axis=1) / self.k
        elif self.fusion_type == "Weighing":
            w = [i / i.sum() for i in pre_prediction]
            probability = np.sum(np.multiply(pre_prediction, w), axis=1)

        prediction = (probability > 0.5) * 1
        self.probability = probability
        self.prediction = prediction
        if prob:
            return probability
        else:
            return prediction
Example #28
    def train(self, train_data, train_label):
        if self.norm_type == "Standardization":
            train_data = preProcess.Standardization(train_data)
        else:
            train_data = preProcess.Normalization(train_data)

        feature_dim = np.shape(train_data)[1]
        train_label = np.expand_dims(train_label, axis=1)
        self.initializeParameter(feature_dim)

        self.loss = []
        # training process
        for i in range(self.iterations):
            gradients, cost = self.backPropagate(train_data, train_label)
            # get the derivative
            dw = gradients["dw"]
            db = gradients["db"]

            # update parameter
            self.w = self.w - self.learning_rate * dw
            self.b = self.b - self.learning_rate * db
            self.loss.append(cost)

        return self
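
The initializeParameter and backPropagate helpers are not shown; a minimal sketch consistent with the single logistic unit trained above (both bodies are assumptions):

    def initializeParameter(self, feature_dim):
        # small random weights and a zero bias for one logistic unit
        self.w = np.random.normal(0, 0.01, [feature_dim, 1])
        self.b = 0.0

    def backPropagate(self, train_data, train_label):
        m = train_data.shape[0]
        pred = self.sigmoid(np.dot(train_data, self.w) + self.b)  # forward pass
        cost = -np.mean(train_label * np.log(pred)
                        + (1 - train_label) * np.log(1 - pred))   # cross-entropy loss
        dw = np.dot(train_data.T, pred - train_label) / m         # gradient w.r.t. w
        db = np.sum(pred - train_label) / m                       # gradient w.r.t. b
        return {"dw": dw, "db": db}, cost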