def train(self):
    """Fit each category's classifier on that category's training samples.

    For every ``(name, label)`` pair in ``DC_CATEGORY_NO_MAPPING`` the
    samples in ``self._training_data`` whose ``_label`` equals the label are
    passed to ``Loader.load_vectors``.  The returned vectors are converted
    to a NumPy array and used to fit ``self.classifiers[name]``.

    Categories for which ``Loader.load_vectors`` returns nothing are
    skipped, so their classifier is left untouched.
    """
    # ``items()`` rather than the Python-2-only ``iteritems()`` so the method
    # runs unchanged on both Python 2 and Python 3.
    for cate_name, cate_no in DC_CATEGORY_NO_MAPPING.items():
        samples = [
            sample for sample in self._training_data
            if sample._label == cate_no
        ]
        vectors = Loader.load_vectors(samples)
        if not vectors:
            continue

        # The vectors are strings (they are joined with ';'), so the joined
        # text is parsed with NumPy's matrix string syntax: one row per
        # vector.  ``np.asmatrix`` is the function the ``np.mat`` alias
        # pointed to; the alias itself was removed in NumPy 2.0.
        X_train = np.array(np.asmatrix(';'.join(vectors)))
        self.classifiers[cate_name].fit(X_train)
def evaluate(self, train_test_split_rate=.8, random_seed=30):
    """Print a hold-out evaluation of every category's classifier.

    For each ``(name, label)`` pair in ``DC_CATEGORY_NO_MAPPING``:

    * the vectors of the samples carrying that label are shuffled and split
      into a training part and a test part;
    * the vectors of all remaining samples are used as outliers;
    * ``self.classifiers[name]`` is fitted on the training part and asked to
      predict the training part, the test part and the outliers.

    A prediction of ``1`` is counted as "belongs to the category" and ``-1``
    as "does not belong".  The error counts plus precision and recall (as
    unreduced ``numerator/denominator`` strings) are printed; nothing is
    returned.

    Side effects:
        * Every evaluated classifier ends up fitted on the training part of
          the split only, not on the full category data.
        * NumPy's global random generator is reseeded with ``random_seed``
          for every category that has data.

    Args:
        train_test_split_rate: Fraction of a category's vectors used for
            fitting; the remainder is used for testing.
        random_seed: Seed handed to ``np.random.seed`` before shuffling, so
            that the split is reproducible.
    """

    def to_array(rows):
        # The vectors are strings (they are joined with ';'), so the joined
        # text is parsed with NumPy's matrix string syntax: one row per
        # vector.  ``np.asmatrix`` is what the ``np.mat`` alias pointed to;
        # the alias itself was removed in NumPy 2.0.
        return np.array(np.asmatrix(';'.join(rows)))

    def count(predictions, value):
        # Number of entries of the prediction array equal to ``value``.
        return predictions[predictions == value].size

    # ``items()`` and single-argument ``print(...)`` calls keep the method
    # runnable on both Python 2 and Python 3 while emitting the same text as
    # the former ``print`` statements.
    for cate_name, cate_no in DC_CATEGORY_NO_MAPPING.items():
        print('evaluate %s with label %s' % (cate_name, cate_no))
        print('-' * 40)

        inner_data = Loader.load_vectors(
            [s for s in self._training_data if s._label == cate_no])
        outlier_data = Loader.load_vectors(
            [s for s in self._training_data if s._label != cate_no])
        if not inner_data:
            continue

        # Reseed before every shuffle so each category's split depends only
        # on ``random_seed`` and not on how many categories came before it.
        np.random.seed(random_seed)
        np.random.shuffle(inner_data)
        split_point = int(len(inner_data) * train_test_split_rate)
        train_data = inner_data[:split_point]
        test_data = inner_data[split_point:]

        print(
            'total ground truth: %s , train_data: %s , test_data: %s '
            ', outlier_data: %s'
            % (len(inner_data), len(train_data), len(test_data),
               len(outlier_data))
        )

        if not train_data or not test_data or not outlier_data:
            print('%s %s %s'
                  % (len(train_data), len(test_data), len(outlier_data)))
            print('not enough data to evaluate for %s' % (cate_name,))
            continue

        train_data = to_array(train_data)
        test_data = to_array(test_data)
        outlier_data = to_array(outlier_data)

        classifier = self.classifiers[cate_name]
        classifier.fit(train_data)
        y_pred_train = classifier.predict(train_data)
        y_pred_test = classifier.predict(test_data)
        y_pred_outliers = classifier.predict(outlier_data)

        # In-category vectors rejected (-1) and outliers accepted (1) are the
        # errors; accepted test vectors are the true positives.
        n_error_train = count(y_pred_train, -1)
        n_error_test = count(y_pred_test, -1)
        n_error_outliers = count(y_pred_outliers, 1)
        n_true_test = count(y_pred_test, 1)

        print(
            'error_train: %s/%s , error_test: %s/%s , error_outlier: %s/%s'
            % (n_error_train, len(train_data),
               n_error_test, len(test_data),
               n_error_outliers, len(outlier_data))
        )
        # precision = accepted test vectors / everything that was accepted
        # (test vectors plus wrongly accepted outliers).
        print('precision: %s/%s'
              % (n_true_test, n_true_test + n_error_outliers))
        # recall = accepted test vectors / all test vectors.
        print('recall: %s/%s' % (n_true_test, len(test_data)))
        print('\n\n\n')