示例#1
0
    def predict(self, data):
        """
        Predict a label for every sample in ``data``.

        Each sample is passed through ``self._use_predictors``; the resulting
        output vector is decoded against ``self.matrix`` by
        ``MT.closet_vector`` (using ``y_euclidean_distance`` and
        ``self.predictor_weights``) and mapped back to a label by
        ``MT.get_key``.

        Side effect: the per-sample output vectors are stacked row-wise onto
        ``self.predicted_vector`` and the attribute is finally stored
        column-wise, as a list holding one list per column of ``self.matrix``.

        :param data: samples to predict; a 1-D array is treated as one sample
        :return: numpy array holding one predicted label per sample
        """
        res = []
        if len(self.predictors) == 0:
            logging.debug('The Model has not been fitted!')
        if len(data.shape) == 1:
            data = np.reshape(data, [1, -1])

        for sample in data:
            predict_res = self._use_predictors(sample)

            # Test emptiness with len(): once the attribute holds an ndarray,
            # ``== []`` is an element-wise comparison, not an emptiness check.
            if len(self.predicted_vector) == 0:
                # Keep the first row 2-D so the column slicing below also
                # works when only a single sample is predicted.
                self.predicted_vector = np.atleast_2d(
                    copy.deepcopy(predict_res))
            else:
                # np.vstack is the supported name for the old np.row_stack.
                self.predicted_vector = np.vstack(
                    (self.predicted_vector, predict_res))

            value = MT.closet_vector(predict_res, self.matrix,
                                     y_euclidean_distance,
                                     np.array(self.predictor_weights))
            res.append(MT.get_key(self.index, value))

        # Re-arrange the stacked outputs so that entry i holds the outputs
        # that belong to column i of the coding matrix.
        stacked = np.asarray(self.predicted_vector)
        n_columns = self.matrix.shape[1]
        if stacked.size == 0:
            # Nothing was predicted (empty input): keep empty columns.
            self.predicted_vector = [[] for _ in range(n_columns)]
        else:
            self.predicted_vector = [
                list(stacked[:, i]) for i in range(n_columns)
            ]

        return np.array(res)
示例#2
0
    def predict(self, data):
        """
        a method used to predict label for give data

        A 3-nearest-neighbour classifier fitted on the stored training data
        predicts a class for each sample. Every position where that class's
        row of the coding matrix is 0 is also set to 0 in the sample's output
        vector before the vector is decoded with ``MT.closet_vector``.

        Side effect: each (masked) output vector is appended to
        ``self.predicted_vector``.

        :param data: data to predict; a 1-D array is treated as one sample
        :return: predicted label
        """
        res = []
        if len(self.predictors) == 0:
            logging.debug('The Model has not been fitted!')
        if len(data.shape) == 1:
            data = np.reshape(data, [1, -1])
        if len(data) == 0:
            # Nothing to predict; avoid fitting the KNN model for no reason.
            return np.array(res)

        # The KNN model and the label -> row mapping depend only on the
        # training data, so build them once instead of once per sample.
        knn_model = neighbors.KNeighborsClassifier(
            algorithm='ball_tree', n_neighbors=3).fit(self.train_data,
                                                      self.train_label)
        index = {l: i for i, l in enumerate(np.unique(self.train_label))}
        n_columns = len(self.matrix[0])

        for sample in data:
            # find k neighbors from train data
            knn_pre_label = knn_model.predict([sample])
            predicted_vector = self._use_predictors(sample)  # one row
            knn_row = self.matrix[index[knn_pre_label[0]]]
            # make it 0 when the knn predicted class is 0
            for j in range(n_columns):
                if knn_row[j] == 0:
                    predicted_vector[j] = 0

            self.predicted_vector.append(list(predicted_vector))
            value = MT.closet_vector(predicted_vector, self.matrix,
                                     self.distance_measure)
            res.append(MT.get_key(self.index, value))
        return np.array(res)
示例#3
0
 def create_matrix(self, data, label):
     """
     Draw random {+1, -1} coding matrices until one passes the MT checks.

     The matrix has one row per distinct label. With more than three
     labels it has floor(10 * log10(n_labels)) columns, otherwise one
     column per label. A draw is accepted when ``MT.exist_same_col`` and
     ``MT.exist_same_row`` are both false and ``MT.exist_two_class`` is
     true.

     NOTE(review): the loop only ends when a draw is accepted; presumably
     callers always pass at least two distinct labels -- confirm.

     :param data: unused here
     :param label: labels whose distinct values define the matrix rows
     :return: (matrix, index) where index maps each label to its row
     """
     # Neither the label index nor the matrix shape changes between draws.
     index = {l: i for i, l in enumerate(np.unique(label))}
     matrix_row = len(index)
     if matrix_row > 3:
         # Built-in int(): the np.int alias was removed from NumPy.
         matrix_col = int(np.floor(10 * np.log10(matrix_row)))
     else:
         matrix_col = matrix_row

     while True:
         draws = np.random.random((matrix_row, matrix_col))
         # Threshold in one step so every cell becomes +1 or -1 (a draw of
         # exactly 0.5 was previously left as 0.5).
         matrix = np.where(draws > 0.5, 1.0, -1.0)
         if (not MT.exist_same_col(matrix)) and (not MT.exist_same_row(
                 matrix)) and MT.exist_two_class(matrix):
             return matrix, index
示例#4
0
    def create_matrix(self, data, label, **param):
        """
        Build one DC_ECOC matrix per requested option and return the best.

        A candidate matrix is created for every entry of
        ``param['dc_option']``. Each candidate is scored with
        ``MT.res_matrix`` on a train/validation split from
        ``MT.split_traindata``, and the highest-scoring one is kept. While
        that score is below 0.8, up to 10 refinement rounds combine the
        current best with a randomly chosen candidate through
        ``MT.change_subtree`` and keep the result when it scores higher.

        NOTE(review): assumes ``MT.res_matrix`` returns a single comparable
        score -- confirm against MT.

        :param data: training samples
        :param label: labels corresponding to ``data``
        :param param: must contain ``dc_option``, an iterable of DC options
        :return: (matrix, index) of the best matrix found; ``None`` when
            ``dc_option`` is not supplied (a debug message is logged)
        :raises ValueError: if ``dc_option`` is supplied but empty
        """
        index = {l: i for i, l in enumerate(np.unique(label))}

        if 'dc_option' not in param:
            logging.debug('ERROR: undefine the type of DCECOC')
            return

        DCECOC = DC_ECOC()
        M = []
        for each in param['dc_option']:
            m, index = DCECOC.create_matrix(data, label, dc_option=each)
            M.append(m)
        if not M:
            raise ValueError('dc_option is empty: no candidate matrix')

        train_data, train_label, val_data, val_label = MT.split_traindata(
            data, label)  # split data into train and validation

        # select the most effective matrix
        res = np.zeros(len(M))
        for i, m in enumerate(M):
            res[i] = MT.res_matrix(m, index, train_data, train_label, val_data,
                                   val_label, self.estimator,
                                   self.distance_measure)
        best_pos = int(np.argmax(res))
        best_M = M[best_pos]
        best_index = index
        best_res = res[best_pos]

        # Bounded refinement: stop after 10 rounds or once the score is
        # good enough.
        most_time = 10
        while most_time > 0 and best_res < 0.8:
            sel_m = np.random.randint(len(M))
            new_M, new_index = MT.change_subtree(best_M, M[sel_m])
            new_res = MT.res_matrix(new_M, new_index, train_data, train_label,
                                    val_data, val_label, self.estimator,
                                    self.distance_measure)
            if new_res > best_res:
                best_M = new_M
                best_index = new_index
                best_res = new_res

            most_time -= 1

        return best_M, best_index
示例#5
0
 def create_matrix(self, data, label):
     """
     Build a coding matrix by repeatedly merging two groups of classes.

     Every class starts in its own group. In each round all pairs of
     groups are scored with ``Criterion.agg_score`` (using
     ``Criterion.max_distance_score``). The pair with the lowest score
     becomes a new column (+1 for the first group, -1 for the second,
     0 elsewhere) and is merged into one group placed at the front of
     the list. The process stops when a single group is left.

     :param data: training samples
     :param label: labels corresponding to ``data``
     :return: (matrix, index); matrix is ``None`` when there are fewer
         than two classes, index maps each label to its row
     """
     classes = np.unique(label)
     index = {cls: pos for pos, cls in enumerate(classes)}
     groups = [[cls] for cls in classes]
     n_rows = len(groups)

     columns = []
     first_group = []
     second_group = []
     while len(groups) > 1:
         lowest = np.inf
         total = len(groups)
         for a in range(total - 1):
             for b in range(a + 1, total):
                 data_a, label_a = MT.get_data_subset(
                     data, label, groups[a])
                 data_b, label_b = MT.get_data_subset(
                     data, label, groups[b])
                 score = Criterion.agg_score(
                     data_a,
                     label_a,
                     data_b,
                     label_b,
                     score=Criterion.max_distance_score)
                 if score < lowest:
                     lowest = score
                     first_group = groups[a]
                     second_group = groups[b]

         # Encode the chosen pair as one ternary column.
         column = np.zeros((n_rows, 1))
         for cls in first_group:
             column[index[cls]] = 1
         for cls in second_group:
             column[index[cls]] = -1
         columns.append(column)

         # Replace the two groups by their union at the head of the list.
         merged = first_group + second_group
         groups.remove(first_group)
         groups.remove(second_group)
         groups.insert(0, merged)

     matrix = np.hstack(columns) if columns else None
     return matrix, index
示例#6
0
def get_DC_value(data, labels, group1, group2, dc_option):
    """
    Compute a data-complexity (DC) value between two groups of classes.

    The function ``get_complexity_<dc_option>`` is looked up on ``GC`` and
    called with the samples and labels of both groups (as lists).

    :param data: the whole data
    :param labels: the whole labels
    :param group1: group1 classes
    :param group2: group2 classes
    :param dc_option: select which dc is used
    :return: DC value
    :raises NameError: if the lookup or the computation fails; the original
        exception is attached as ``__cause__``
    """
    group1_data, group1_label = MT.get_data_subset(data, labels, group1)
    group2_data, group2_label = MT.get_data_subset(data, labels, group2)

    try:
        funname = 'get_complexity_' + dc_option
        fun = getattr(GC, funname)
        DC = fun(list(group1_data), list(group1_label), list(group2_data),
                 list(group2_label))
    except Exception as err:
        # ``Exception`` rather than a bare except, so KeyboardInterrupt and
        # SystemExit pass through; chaining keeps the real cause visible.
        logging.error('DC option is wrong')
        raise NameError('DC option is wrong') from err
    return DC
示例#7
0
    def fit(self, data, label, **estimator_param):
        """
        Train one base estimator for every column of the coding matrix.

        The training data and labels are stored on the instance, the coding
        matrix and label index are created with ``self.create_matrix``, and
        for each column the samples selected by ``MT.get_data_from_col`` are
        used to fit a new ``self.estimator``.

        :param data: data used to train base estimator
        :param label: label corresponding to the data
        :param estimator_param: keyword arguments passed to each estimator
        :return: None
        """
        self.train_data, self.train_label = data, label
        self.predictors = []
        self.matrix, self.index = self.create_matrix(data, label)

        n_columns = self.matrix.shape[1]
        for col in range(n_columns):
            column = self.matrix[:, col]
            subset, targets = MT.get_data_from_col(data, label, column,
                                                   self.index)
            model = self.estimator(**estimator_param)
            self.predictors.append(model.fit(subset, targets))
示例#8
0
    def create_matrix(self, data, label, dc_option):
        """
        Build a coding matrix by recursively splitting the set of classes.

        Starting from all classes, each pending class set is split into two
        groups by ``Greedy_Search.greedy_search``. Every split adds one
        column (+1 for the first group, -1 for the second, 0 for classes not
        involved). Groups with more than one class are queued for further
        splitting, in first-in first-out order.

        :param data: training samples
        :param label: labels corresponding to ``data``
        :param dc_option: DC option forwarded to the greedy search
        :return: (matrix, index) where index maps each label to its row
        """
        classes = np.unique(label)
        index = {cls: pos for pos, cls in enumerate(classes)}
        n_rows = len(index)

        pending = [classes]
        columns = []
        while pending:
            current = pending.pop(0)

            # restrict data and labels to the classes being split
            subset, subset_labels = MT.get_data_subset(data, label, current)

            # DC search
            positive, negative = Greedy_Search.greedy_search(
                subset, subset_labels, dc_option=dc_option)

            column = np.zeros((n_rows, 1))
            for cls in positive:
                column[index[cls]] = 1
            for cls in negative:
                column[index[cls]] = -1
            columns.append(column)

            for group in (positive, negative):
                if len(group) > 1:
                    pending.append(group)

        return np.hstack(columns), index
示例#9
0
 def create_matrix(self, data, label):
     """
     Build a coding matrix by recursively splitting classes with ``sffs``.

     Each pending class set is split into two groups by ``sffs``; the
     split becomes one column (+1 / -1 / 0) and every group with more
     than one class is queued for further splitting.

     :param data: training samples
     :param label: labels corresponding to ``data``
     :return: (matrix, index) where index maps each label to its row
     """
     classes = np.unique(label)
     index = {cls: pos for pos, cls in enumerate(classes)}
     n_rows = len(index)

     queue = [classes]
     columns = []
     while queue:
         current = queue.pop(0)
         subset, subset_labels = MT.get_data_subset(data, label, current)
         positive, negative = sffs(subset, subset_labels)

         column = np.zeros((n_rows, 1))
         for cls in positive:
             column[index[cls]] = 1
         for cls in negative:
             column[index[cls]] = -1
         columns.append(column)

         for group in (positive, negative):
             if len(group) > 1:
                 queue.append(group)

     return np.hstack(columns), index
示例#10
0
    def create_matrix(self, data, label, **param):
        """
        Merge several base matrices and rebuild them into one coding matrix.

        Base matrices come from ``param['base_M']`` when given. Otherwise one
        DC_ECOC matrix is created per entry of ``param['dc_option']``, or a
        single one with option 'F1' when ``dc_option`` is missing. They are
        stacked column-wise and reduced with ``MT.select_column``.

        Columns are then taken two at a time (``MT.get_2column``). Each pair
        is stored in the result and replaced in the working matrix by a
        parent column from ``MT.left_right_create_parent``, until a single
        column is left. Finally reversed, duplicate and unfit columns are
        removed with the corresponding ``MT`` helpers.

        :param data: training samples
        :param label: labels corresponding to ``data``
        :param param: optional ``base_M`` (iterable of matrices),
            ``dc_option`` (iterable of DC options) and ``ternary_option``
            (defaults to '+')
        :return: (matrix, index) where index maps each label to its row
        :raises ValueError: if no base matrix could be obtained
        """
        index = {l: i for i, l in enumerate(np.unique(label))}

        M = None
        if 'base_M' not in param:
            DCECOC = DC_ECOC()
            if 'dc_option' in param:
                for each in param['dc_option']:
                    m, index = DCECOC.create_matrix(data,
                                                    label,
                                                    dc_option=each)
                    if M is None:
                        M = copy.deepcopy(m)
                    else:
                        M = np.hstack((M, m))
            else:
                logging.warning('use default DC: F1')
                # create_matrix returns (matrix, index), as in the loop
                # above; unpack it and pass the announced default option.
                M, index = DCECOC.create_matrix(data, label, dc_option='F1')
        else:
            for each in param['base_M']:
                if M is None:
                    M = copy.deepcopy(each)
                else:
                    M = np.hstack((M, each))

        if M is None:
            logging.debug('ERROR:Matrix is None')
            raise ValueError('ERROR:Matrix is None')

        # M = MT.remove_reverse(M)
        # M = MT.remove_duplicate_column(M) #simplify the process
        M = MT.select_column(M, data, label, len(index))

        ternary_option = param.get('ternary_option', '+')

        logging.info("merged matrix:\r\n" + str(M))
        GPM = None
        while len(M[0]) != 0:
            n_columns = len(M[0])
            if n_columns == 1:
                # Last remaining column: store it and stop. Keep it 2-D
                # (np.hstack on the bare matrix would flatten it to 1-D and
                # break the column counting below).
                if GPM is None:
                    GPM = copy.copy(M)
                else:
                    GPM = np.hstack((GPM, M))
                break

            elif n_columns == 2 or n_columns == 3:
                left_node, right_node, M = MT.get_2column(M)
                parent_node = MT.left_right_create_parent(
                    left_node, right_node, ternary_option, data, label)
                M = np.hstack((M, parent_node))

                GPM = MT.insert_2column(GPM, left_node, right_node)

            else:
                # Four or more columns: build two parents in one round.
                left_left_node, left_right_node, M = MT.get_2column(M)
                left_parent_node = MT.left_right_create_parent(
                    left_left_node, left_right_node, ternary_option, data,
                    label)
                GPM = MT.insert_2column(GPM, left_left_node, left_right_node)

                right_left_node, right_right_node, M = MT.get_2column(M)
                right_parent_node = MT.left_right_create_parent(
                    right_left_node, right_right_node, ternary_option, data,
                    label)
                GPM = MT.insert_2column(GPM, right_left_node, right_right_node)

                M = np.hstack((M, left_parent_node, right_parent_node))

            # M = MT.change_unfit_DC(M,data,label,dc_option='D2')
        logging.info('1.create matrix ' + str(len(GPM[0])) + '\n' + str(GPM))

        GPM = MT.remove_reverse(GPM)  # delete reverse column and row
        logging.info('2.remove reverse matrix ' + str(len(GPM[0])) + '\n' +
                     str(GPM))

        GPM = MT.remove_duplicate_column(GPM)  # delete identical column
        logging.info('3.remove duplicate matrix ' + str(len(GPM[0])) + '\n' +
                     str(GPM))

        # GPM,new_index = MT.remove_duplicate_row(GPM,index) # delete identical row  ------may need!!
        GPM = MT.remove_unfit(
            GPM)  # delete column that does not contain +1 and -1
        logging.info('4.remove unfit matrix ' + str(len(GPM[0])) + '\n' +
                     str(GPM))

        return GPM, index
示例#11
0
 def fit(self, data, label):
     """
     Fit the initial coding matrix, then extend it column by column.

     A quarter of the data is held out as a validation set. After the
     initial matrix, predictors and weights are created by
     ``self.create_matrix``, each of the ``self.iter_num`` rounds predicts
     the validation set and takes the largest confusion-matrix cell. It
     then tries every pair of disjoint label subsets that separates the
     two labels of that cell. The best-scoring new column (if any) is
     appended with its estimator, its weight and the merged label subset.
     When no column is found for a cell, that cell is zeroed and the next
     largest one is tried.

     :param data: training samples
     :param label: labels corresponding to ``data``
     :return: None
     """
     (self.train_data, self.validate_data, self.train_label,
      self.validation_y) = train_test_split(data, label, test_size=0.25)
     self.matrix, self.index, self.predictors, self.predictor_weights = \
         self.create_matrix(self.train_data, self.train_label, self.validate_data, self.validation_y, self.estimator,
                            **self.param)
     feature_subset = MT.get_subset_feature_from_matrix(
         self.matrix, self.index)
     for _ in range(self.iter_num):
         y_pred = self.predict(self.validate_data)
         y_true = self.validation_y
         confusion_matrix = MT.create_confusion_matrix(
             y_true, y_pred, self.index)
         while True:
             # Flat argmax -> integer (row, column); np.int no longer
             # exists and np.floor produced a float row index.
             max_index = np.argmax(confusion_matrix)
             max_index_y, max_index_x = divmod(int(max_index),
                                               confusion_matrix.shape[1])
             label_y = MT.get_key(self.index, max_index_y)
             label_x = MT.get_key(self.index, max_index_x)
             score_result = 0
             col_result = None
             est_result = None
             est_weight_result = None
             feature_subset_m = None
             feature_subset_n = None
             for m in range(len(feature_subset) - 1):
                 for n in range(m + 1, len(feature_subset)):
                     if ((label_y in feature_subset[m] and label_x in feature_subset[n])
                         or (label_y in feature_subset[n] and label_x in feature_subset[m])) \
                             and (set(feature_subset[m]).intersection(set(feature_subset[n])) == set()):
                         col = MT.create_col_from_partition(
                             feature_subset[m], feature_subset[n],
                             self.index)
                         if not MT.have_same_col(col, self.matrix):
                             train_data, train_cla = MT.get_data_from_col(
                                 self.train_data, self.train_label, col,
                                 self.index)
                             est = self.estimator(**self.param).fit(
                                 train_data, train_cla)
                             validation_data, validation_cla = MT.get_data_from_col(
                                 self.validate_data, self.validation_y, col,
                                 self.index)
                             if validation_data is None:
                                 # No validation sample for this column:
                                 # fall back to a fixed score.
                                 score = 0.8
                             else:
                                 score = est.score(validation_data,
                                                   validation_cla)
                             if score >= score_result:
                                 score_result = score
                                 col_result = col
                                 est_result = est
                                 est_weight_result = MT.estimate_weight(
                                     1 - score_result)
                                 feature_subset_m = m
                                 feature_subset_n = n
             if col_result is not None:
                 break
             # Nothing usable for this cell: drop it and try the next one.
             confusion_matrix[max_index_y, max_index_x] = 0
             if np.sum(confusion_matrix) == 0:
                 break
         if col_result is None:
             # No new column this round (previously handled implicitly by
             # letting np.hstack fail on None).
             continue
         try:
             self.matrix = np.hstack((self.matrix, col_result))
             self.predictors.append(est_result)
             self.predictor_weights.append(est_weight_result)
             feature_subset.append(feature_subset[feature_subset_m] +
                                   feature_subset[feature_subset_n])
         except (TypeError, ValueError):
             # NOTE(review): best-effort update kept from the original; a
             # failure here leaves the round partially applied -- confirm
             # whether it should be reported instead.
             pass
示例#12
0
 def create_matrix(self, train_data, train_label, validate_data,
                   validate_label, estimator, **param):
     """
     Build a coding matrix by exhaustively searching balanced splits.

     For every pending class set, all ways of choosing ceil(n / 2) classes
     as the positive group are tried. An estimator is trained on each
     split and scored on the validation samples of the same classes (a
     fixed score of 0.8 is used when there are none). The best split
     becomes one column (+1 / -1 / 0); its estimator and a weight from
     ``MT.estimate_weight(1 - score)`` are recorded. Groups with more
     than one class are queued for further splitting.

     :param train_data: training samples
     :param train_label: labels corresponding to ``train_data``
     :param validate_data: validation samples
     :param validate_label: labels corresponding to ``validate_data``
     :param estimator: estimator class, instantiated as ``estimator(**param)``
     :param param: keyword arguments for the estimator
     :return: (matrix, index, predictors, predictor_weights)
     """
     index = {l: i for i, l in enumerate(np.unique(train_label))}
     matrix = None
     predictors = []
     predictor_weights = []
     labels_to_divide = [np.unique(train_label)]
     while len(labels_to_divide) > 0:
         label_set = labels_to_divide.pop(0)
         label_count = len(label_set)
         # ceil(label_count / 2) in integer arithmetic; the np.int alias
         # used here before no longer exists in NumPy.
         groups = combinations(range(label_count), (label_count + 1) // 2)
         score_result = 0
         est_result = None
         for group in groups:
             class_1_variety = np.array([label_set[i] for i in group])
             class_2_variety = np.array(
                 [l for l in label_set if l not in class_1_variety])

             # Train a binary estimator: group 1 -> +1, group 2 -> -1.
             class_1_data, class_1_label = MT.get_data_subset(
                 train_data, train_label, class_1_variety)
             class_2_data, class_2_label = MT.get_data_subset(
                 train_data, train_label, class_2_variety)
             class_1_cla = np.ones(len(class_1_data))
             class_2_cla = -np.ones(len(class_2_data))
             train_d = np.vstack((class_1_data, class_2_data))
             train_c = np.hstack((class_1_cla, class_2_cla))
             est = estimator(**param).fit(train_d, train_c)

             # Assemble the matching validation set.
             class_1_data, class_1_label = MT.get_data_subset(
                 validate_data, validate_label, class_1_variety)
             class_2_data, class_2_label = MT.get_data_subset(
                 validate_data, validate_label, class_2_variety)
             class_1_cla = np.ones(len(class_1_data))
             class_2_cla = -np.ones(len(class_2_data))
             validation_d = np.array([])
             validation_c = np.array([])
             try:
                 validation_d = np.vstack((class_1_data, class_2_data))
                 validation_c = np.hstack((class_1_cla, class_2_cla))
             except Exception:
                 # Stacking failed; fall back to whichever side has
                 # samples.
                 if len(class_1_data) > 0:
                     validation_d = class_1_data
                     validation_c = class_1_cla
                 elif len(class_2_data) > 0:
                     validation_d = class_2_data
                     validation_c = class_2_cla
             if validation_d.shape[0] > 0 and validation_d.shape[1] > 0:
                 score = est.score(validation_d, validation_c)
             else:
                 # No validation samples for this split: fixed score.
                 score = 0.8
             if score >= score_result:
                 score_result = score
                 est_result = est
                 class_1_variety_result = class_1_variety
                 class_2_variety_result = class_2_variety

         # Encode the best split as a new ternary column.
         new_col = np.zeros((len(index), 1))
         for i in class_1_variety_result:
             new_col[index[i]] = 1
         for i in class_2_variety_result:
             new_col[index[i]] = -1
         if matrix is None:
             matrix = copy.copy(new_col)
         else:
             matrix = np.hstack((matrix, new_col))
         predictors.append(est_result)
         predictor_weights.append(MT.estimate_weight(1 - score_result))
         if len(class_1_variety_result) > 1:
             labels_to_divide.append(class_1_variety_result)
         if len(class_2_variety_result) > 1:
             labels_to_divide.append(class_2_variety_result)
     return matrix, index, predictors, predictor_weights
示例#13
0
 def create_matrix(self, data, label):
     """
     Build a real-valued coding matrix from recursive ``sffs`` splits.

     Each pending class set is split into two groups by ``sffs`` (scored
     with ``Criterion.max_center_distance_score``). For every class, the
     fraction of its samples lying at least as close to its own group's
     centre as to the other group's centre is computed. That fraction is
     the class's entry in the new column: positive for the first group,
     negated for the second, 0 for classes not involved. Groups with more
     than one class are queued for further splitting.

     :param data: training samples
     :param label: labels corresponding to ``data``
     :return: (matrix, index) where index maps each label to its row
     """

     def signed_ratios(classes, samples, sample_labels, own_center,
                       other_center, sign):
         # Per class: sign * (samples nearer to own_center) / (all samples).
         hits = {k: 0 for k in classes}
         totals = {k: 0 for k in classes}
         for x, y in zip(samples, sample_labels):
             totals[y] += 1
             if (MT.euclidean_distance(x, own_center) <=
                     MT.euclidean_distance(x, other_center)):
                 hits[y] += 1
         return {k: sign * hits[k] / totals[k] for k in classes}

     index = {l: i for i, l in enumerate(np.unique(label))}
     matrix = None
     labels_to_divide = [np.unique(label)]
     while len(labels_to_divide) > 0:
         label_set = labels_to_divide.pop(0)
         datas, labels = MT.get_data_subset(data, label, label_set)
         class_1_variety_result, class_2_variety_result = sffs(
             datas, labels, score=Criterion.max_center_distance_score)
         class_1_data_result, class_1_label_result = MT.get_data_subset(
             data, label, class_1_variety_result)
         class_2_data_result, class_2_label_result = MT.get_data_subset(
             data, label, class_2_variety_result)
         class_1_center_result = np.average(class_1_data_result, axis=0)
         class_2_center_result = np.average(class_2_data_result, axis=0)

         # Both groups use the same distance helper (MT.euclidean_distance).
         class_1_ratio = signed_ratios(
             class_1_variety_result, class_1_data_result,
             class_1_label_result, class_1_center_result,
             class_2_center_result, 1)
         class_2_ratio = signed_ratios(
             class_2_variety_result, class_2_data_result,
             class_2_label_result, class_2_center_result,
             class_1_center_result, -1)

         new_col = np.zeros((len(index), 1))
         for i in class_1_ratio:
             new_col[index[i]] = class_1_ratio[i]
         for i in class_2_ratio:
             new_col[index[i]] = class_2_ratio[i]
         if matrix is None:
             matrix = copy.copy(new_col)
         else:
             matrix = np.hstack((matrix, new_col))
         if len(class_1_variety_result) > 1:
             labels_to_divide.append(class_1_variety_result)
         if len(class_2_variety_result) > 1:
             labels_to_divide.append(class_2_variety_result)
     return matrix, index