def train(self):
     """Fit every category's classifier on that category's own training vectors.

     For each ``(name, label)`` pair in ``DC_CATEGORY_NO_MAPPING`` the samples
     in ``self._training_data`` whose ``_label`` equals ``label`` are passed to
     ``Loader.load_vectors``.  When that yields nothing the category is
     skipped; otherwise the result is turned into a 2-D array and handed to
     ``self.classifiers[name].fit``.
     """
     for (name, label) in DC_CATEGORY_NO_MAPPING.iteritems():
         members = [sample for sample in self._training_data if sample._label == label]
         vectors = Loader.load_vectors(members)
         if vectors:
             # The vectors are joined with ';' and parsed by np.mat as one
             # matrix; np.array then drops the matrix subclass.
             matrix_text = ';'.join(vectors)
             features = np.array(np.mat(matrix_text))
             self.classifiers[name].fit(features)
    def evaluate(self, train_test_split_rate=.8, random_seed=30):
        for (cate_name, cate_no) in DC_CATEGORY_NO_MAPPING.iteritems():
            print 'evaluate', cate_name, 'with label', cate_no
            print '-'*40
            inner_data = Loader.load_vectors([_ for _ in self._training_data if _._label == cate_no])
            outer_data = Loader.load_vectors([_ for _ in self._training_data if _._label != cate_no])

            inner_data_index = [i for i in range(len(self._training_data)) if self._training_data[i]._label == cate_no]
            outer_data_index = [i for i in range(len(self._training_data)) if self._training_data[i]._label != cate_no]

            # print 'inner_data_index:', inner_data_index
            # print 'outer_data_index:', outer_data_index

            if not inner_data:
                continue

            np.random.seed(random_seed)
            np.random.shuffle(inner_data)
            split_point = int(len(inner_data)*train_test_split_rate)
            train_data = inner_data[:split_point]
            test_data = inner_data[split_point:]
            outlier_data = outer_data

            print 'total ground truth:', len(inner_data), \
                ', train_data:', len(train_data), \
                ', test_data:', len(test_data), \
                ', outlier_data:', len(outlier_data)
            
            if not train_data or not test_data or not outlier_data:
                print len(train_data), len(test_data), len(outlier_data)
                print 'not enough data to evaluate for', cate_name
                continue

            # transfer to numpy
            train_data = np.array(np.mat(';'.join([_ for _ in train_data])))
            test_data = np.array(np.mat(';'.join([_ for _ in test_data])))
            outlier_data = np.array(np.mat(';'.join([_ for _ in outlier_data])))

            self.classifiers[cate_name].fit(train_data)

            y_pred_train = self.classifiers[cate_name].predict(train_data)
            y_pred_test = self.classifiers[cate_name].predict(test_data)
            y_pred_outliers = self.classifiers[cate_name].predict(outlier_data)
            n_error_train = y_pred_train[y_pred_train == -1].size
            n_error_test = y_pred_test[y_pred_test == -1].size
            n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size

            n_true_train = y_pred_train[y_pred_train == 1].size
            n_true_test = y_pred_test[y_pred_test == 1].size
            n_true_outliers = y_pred_outliers[y_pred_outliers == -1].size

            # print 'error_test index', [inner_data_index[split_point:][_] for _ in [i for i in range(len(y_pred_test)) if y_pred_test[i] == -1]]
            # print 'error_outlier index', [outer_data_index[_] for _ in [i for i in range(len(y_pred_outliers)) if y_pred_outliers[i] == -1]]

            print 'error_train:', str(n_error_train)+'/'+str(len(train_data)), \
                ', error_test:', str(n_error_test)+'/'+str(len(test_data)), \
                ', error_outlier:', str(n_error_outliers)+'/'+str(len(outlier_data))

            print 'precision:', str(n_true_test)+'/'+str(n_true_test+n_error_outliers)
            print 'recall:', str(n_true_test)+'/'+str(len(test_data))



            # print cate_name+': training data:', json.dumps([[_._content for _ in self._training_data if _._label == cate_no][i] for i in range(len(list(y_pred_train))) if list(y_pred_train)[i] == 1], indent=4)
            # # """
            # print cate_name+': error_train:', json.dumps([[_._content for _ in self._training_data if _._label == cate_no][i] for i in range(len(list(y_pred_train))) if list(y_pred_train)[i] == -1], indent=4)
            # print cate_name+': error_test:', json.dumps([[_._content for _ in self._training_data if _._label == cate_no][i+len(train_data)] for i in range(len(list(y_pred_test))) if list(y_pred_test)[i] == -1], indent=4)
            # print cate_name+': error_outlier:', json.dumps([[_._content for _ in self._training_data if _._label != cate_no][i] for i in range(len(list(y_pred_outliers))) if list(y_pred_outliers)[i] == 1], indent=4)
            # # """
            
            # print 'error_train:', json.dumps([train_data[i] for i in range(len(list(y_pred_train))) if list(y_pred_train)[i] == -1], indent=4)
            # print 'error_test:', json.dumps([test_data[i] for i in range(len(list(y_pred_test))) if list(y_pred_test)[i] == -1], indent=4)
            # print 'error_outlier:', json.dumps([outlier_data[i] for i in range(len(list(y_pred_outliers))) if list(y_pred_outliers)[i] == 1], indent=4)

            print '\n\n\n'