def __init__(self, features, labels, standardize=False):
    """
    Store the training data and build the labeled feature vector.

    Parameters
    ----------
    features : frame of feature columns
        Must provide column_names() and be accepted by the XFrame
        constructor; every column is treated as one feature.
    labels : column of labels
        Attached to a copy of `features` as an extra column.
    standardize : bool, optional
        When true, self.features holds the result of
        self._standardize(features) instead of `features` itself.

    Attributes set
    --------------
    standardize, means, stdevs, features, labels, feature_cols,
    labeled_feature_vector (the result of applying a row -> LabeledPoint
    conversion to the combined features-plus-label frame).
    """
    self.standardize = standardize
    self.means = None
    self.stdevs = None
    if standardize:
        self.features = self._standardize(features)
    else:
        self.features = features
    self.labels = labels
    self.feature_cols = features.column_names()

    # Local reference used by the closure below, so that it does not have
    # to go through `self` (presumably to keep `self` out of the function
    # handed to apply -- confirm).
    feature_cols = self.feature_cols

    # Pick a label column name that cannot clash with a feature column;
    # otherwise assigning the labels would overwrite that feature and the
    # label value would also leak into every feature vector.
    label_col = 'label'
    while label_col in feature_cols:
        label_col = '_' + label_col

    # NOTE(review): the labeled vector is built from the raw `features`
    # even when standardize is true (self.features is not used here).
    # Behavior kept as-is; confirm whether this is intended.
    labeled_feature_vector = XFrame(features)
    labeled_feature_vector[label_col] = labels

    def build_labeled_features(row):
        label = row[label_col]
        values = [row[col] for col in feature_cols]
        return LabeledPoint(label, values)

    self.labeled_feature_vector = labeled_feature_vector.apply(build_labeled_features)
def _base_evaluate(self, data, labels):
    """
    Score the classifier's predictions against known labels.

    Predictions for `data` are obtained from self._base_predict and
    compared row by row with `labels`.  A value of 1 is counted as the
    positive class and a value of 0 as the negative class.

    Parameters
    ----------
    data : collection of items to classify
        Passed unchanged to self._base_predict (described by the original
        documentation as an XArray of SenseVector).
    labels : expected label for each item in `data`.

    Returns
    -------
    out : dict
        Counts, as floats:
          - 'correct'   : predictions equal to the label
          - 'true_pos'  : predicted 1, actual 1
          - 'true_neg'  : predicted 0, actual 0
          - 'false_pos' : predicted 1, actual 0
          - 'false_neg' : predicted 0, actual 1
          - 'all'       : len(labels)
        Rates, each NaN when its denominator is zero:
          - 'accuracy'  : correct / all
          - 'precision' : true_pos / (true_pos + false_pos)
          - 'recall'    : true_pos / (true_pos + false_neg)
          - 'tpr'       : true_pos / (number of actual 1 labels)
          - 'fpr'       : false_pos / (number of actual 0 labels)
    """
    comparison = XFrame()
    comparison['predicted'] = self._base_predict(data)
    comparison['actual'] = labels

    def evaluate(row):
        # One 0/1 indicator per counter, for a single row.
        predicted = row['predicted']
        expected = row['actual']
        return {
            'correct': int(bool(predicted == expected)),
            'true_pos': int(bool(predicted == 1 and expected == 1)),
            'true_neg': int(bool(predicted == 0 and expected == 0)),
            'false_pos': int(bool(predicted == 1 and expected == 0)),
            'false_neg': int(bool(predicted == 0 and expected == 1)),
            'positive': int(bool(expected == 1)),
            'negative': int(bool(expected == 0)),
        }

    score = comparison.apply(evaluate)

    def column_total(key):
        # Sum one indicator over all rows.
        return float(score.apply(lambda x: x[key]).sum())

    def ratio(numerator, denominator):
        # Guarded division: undefined rates are reported as NaN.
        if denominator > 0:
            return numerator / denominator
        return float('nan')

    all_scores = float(len(labels))
    counter_names = ('correct', 'true_pos', 'true_neg', 'false_pos',
                     'false_neg', 'positive', 'negative')
    totals = {}
    for counter in counter_names:
        totals[counter] = column_total(counter)

    tp = totals['true_pos']
    fp = totals['false_pos']
    fn = totals['false_neg']

    return {
        'correct': totals['correct'],
        'true_pos': tp,
        'true_neg': totals['true_neg'],
        'false_pos': fp,
        'false_neg': fn,
        'all': all_scores,
        'accuracy': ratio(totals['correct'], all_scores),
        'precision': ratio(tp, tp + fp),
        'recall': ratio(tp, tp + fn),
        'tpr': ratio(tp, totals['positive']),
        'fpr': ratio(fp, totals['negative']),
    }