# Imports assumed by the test snippets below (assert_equal comes from the
# old scikit-learn testing helpers; newer versions use bare asserts).
import numpy as np
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.utils.testing import assert_equal  # deprecated in newer scikit-learn


def test_pipeline_methods_preprocessing_svm():
    # Test the various methods of the pipeline (preprocessing + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    scaler = StandardScaler()
    pca = PCA(n_components=2)
    clf = SVC(probability=True, random_state=0, decision_function_shape='ovr')

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)

        # check shapes of various prediction functions
        predict = pipe.predict(X)
        assert_equal(predict.shape, (n_samples,))

        proba = pipe.predict_proba(X)
        assert_equal(proba.shape, (n_samples, n_classes))

        log_proba = pipe.predict_log_proba(X)
        assert_equal(log_proba.shape, (n_samples, n_classes))

        decision_function = pipe.decision_function(X)
        assert_equal(decision_function.shape, (n_samples, n_classes))

        pipe.score(X, y)
# Later variant of the same test, updated for newer scikit-learn APIs
# (randomized PCA solver, gamma="scale", bare assert statements).
def test_pipeline_methods_preprocessing_svm():
    # Test the various methods of the pipeline (preprocessing + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    scaler = StandardScaler()
    pca = PCA(n_components=2, svd_solver="randomized", whiten=True)
    clf = SVC(
        gamma="scale",
        probability=True,
        random_state=0,
        decision_function_shape="ovr",
    )

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([("preprocess", preprocessing), ("svc", clf)])
        pipe.fit(X, y)

        # check shapes of various prediction functions
        predict = pipe.predict(X)
        assert predict.shape == (n_samples,)

        proba = pipe.predict_proba(X)
        assert proba.shape == (n_samples, n_classes)

        log_proba = pipe.predict_log_proba(X)
        assert log_proba.shape == (n_samples, n_classes)

        decision_function = pipe.decision_function(X)
        assert decision_function.shape == (n_samples, n_classes)

        pipe.score(X, y)
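# A minimal sketch (not from the original test suite) of how the same shape
# checks could be written with pytest.mark.parametrize instead of the
# explicit for-loop; assumes pytest is available. The function name is
# illustrative.
import pytest


@pytest.mark.parametrize("preprocessing",
                         [StandardScaler(), PCA(n_components=2)])
def test_pipeline_shapes_parametrized(preprocessing):
    iris = load_iris()
    X, y = iris.data, iris.target
    n_samples, n_classes = X.shape[0], len(np.unique(y))
    clf = SVC(probability=True, random_state=0, decision_function_shape="ovr")
    pipe = Pipeline([("preprocess", preprocessing), ("svc", clf)])
    pipe.fit(X, y)
    # each prediction method should return one row per sample
    assert pipe.predict(X).shape == (n_samples,)
    assert pipe.predict_proba(X).shape == (n_samples, n_classes)
    assert pipe.decision_function(X).shape == (n_samples, n_classes)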
# Imports assumed by this snippet. The resamplers come from imbalanced-learn;
# note that a pipeline containing a sampler must be imblearn's Pipeline,
# not sklearn's.
import sklearn as sk
import sklearn.metrics  # makes sk.metrics available
from sklearn import preprocessing
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler, TomekLinks


def resampling(X, Y, r):  # r is unused
    # print(sorted(Counter(Y).items()))
    tomek = TomekLinks()  # renamed from `smote_enn`: the resampler used is TomekLinks
    X_resampled, y_resampled = tomek.fit_resample(X, Y)
    # print(sorted(Counter(y_resampled).items()))
    return X_resampled, y_resampled


# pipeline
pipeline = Pipeline([
    ('und', RandomUnderSampler()),
    # ('power', preprocessing.PowerTransformer()),
    ('standardize', preprocessing.StandardScaler()),
    ('normalizer', preprocessing.Normalizer()),
    ('lda', LinearDiscriminantAnalysis()),
    # ('logistic', sk.linear_model.SGDClassifier(loss="hinge", eta0=1, learning_rate="constant", penalty='l2'))
    ('svm', LinearSVC(verbose=0, max_iter=3000, class_weight='balanced')),
])

# X_train, Y_train, X_dev, Y_dev are assumed to be defined elsewhere.
com_values = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10]
for c in com_values:
    pipeline.set_params(svm__C=c, und__random_state=42).fit(X_train, Y_train)
    # clf = CalibratedClassifierCV(base_estimator=pipeline, cv=10).fit(X, Y)
    y_p = pipeline.decision_function(X_dev)  # unused
    y_pred = pipeline.predict(X_dev)
    print("With:", c)
    print("Confusion matrix:\n", sk.metrics.confusion_matrix(Y_dev, y_pred))
    # UAR = unweighted average recall over the two classes
    one = sk.metrics.recall_score(Y_dev, y_pred, pos_label=0)
    two = sk.metrics.recall_score(Y_dev, y_pred, pos_label=1)
    print("UAR:", (one + two) / 2, "\n")
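# The snippet above assumes X_train/Y_train and X_dev/Y_dev already exist.
# A minimal, hypothetical way to produce such a split from a feature matrix
# X and labels Y (the split ratio and names are illustrative, not from the
# original code):
from sklearn.model_selection import train_test_split

X_train, X_dev, Y_train, Y_dev = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=42
)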
# Imports assumed by the ClassifierCv class below. TwoLabelBinarizer is a
# project-specific helper not defined in this file (a sketch is given after
# the class); the NLTK stopwords corpus must be downloaded.
import itertools
import os
import pickle
import string
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import (auc, average_precision_score,
                             classification_report, confusion_matrix,
                             precision_recall_curve,
                             precision_recall_fscore_support, roc_curve)
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import label_binarize


class ClassifierCv(object):
    """Class for a general cross-validated text classifier."""

    def __init__(self, data_labels, data_text):
        """Initializes the classifier object.
        -INPUT:
            -data_labels: series, labels for classes
            -data_text: series, texts for classification
        -OUTPUT:
            -initialized classifier object"""
        self.text = data_text.reset_index(drop=True)
        if data_labels is not None:  # should be None only when unpickling
            self.labels = data_labels.reset_index(drop=True)
            # turn into binary labels; LabelBinarizer gives a single-column
            # output for two classes, so a custom binarizer is used there
            self.labels_unique = [label for label in self.labels.unique()]
            if len(self.labels_unique) == 2:
                my_label_binarizer = TwoLabelBinarizer()
                self.labels_bin = my_label_binarizer.fit_transform(self.labels)
            else:
                self.labels_bin = label_binarize(self.labels,
                                                 classes=self.labels_unique)
        else:
            self.labels = None
            self.labels_unique = None
            self.labels_bin = None
        # metrics (recall, prec, f1)
        self.metrics_per_class = None
        self.metrics_average = None
        # cv labels
        self.cv_labels_real = []
        self.cv_labels_predicted = []
        # roc auc
        self.fpr = None
        self.tpr = None
        self.roc_auc = None
        # precision-recall curve
        self.recall = None
        self.precision = None
        self.average_precision = None
        # needed for precision-recall, keeps cv results
        self.y_real = None
        self.y_proba = None
        # grid search
        self.grid_search = None
        # time
        self.times_cv = []
        self.time_train = []

    def text_process(self, mess):
        """
        Default text cleaning. Takes in a string of text, then performs the
        following:
        1. Remove all punctuation
        2. Remove all stopwords
        3. Returns a list of the cleaned text
        """
        # Check characters to see if they are in punctuation
        nopunc = [char for char in mess if char not in string.punctuation]
        # Join the characters again to form the string.
        nopunc = ''.join(nopunc)
        # Now just remove any stopwords
        return [
            word for word in nopunc.split()
            if word.lower() not in stopwords.words('english')
        ]

    def prepare_pipeline(self, custom_pipeline=None):
        """Prepares the pipeline for the model.
        -INPUT:
            -custom_pipeline: Pipeline; if None, use the default pipeline,
             else input list for sklearn Pipeline
        -OUTPUT:
            -initialises sklearn pipeline"""
        if custom_pipeline is None:
            self.text_clf = Pipeline([
                ('vect', CountVectorizer(analyzer=self.text_process)),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge',
                                      penalty='l2',
                                      alpha=1e-3,
                                      random_state=42,
                                      max_iter=5,
                                      tol=None)),
            ])
        else:
            self.text_clf = Pipeline(custom_pipeline)

    def perform_random_search(self,
                              param_grid,
                              scoring='f1_weighted',
                              num_cv=3,
                              n_jobs=1,
                              **kwargs):
        """Perform grid search to find the best parameters.
        -INPUT:
            -param_grid: dict or list of dicts. Dictionary with parameter
             names (string) as keys and lists of parameter settings to try
             as values, or a list of such dictionaries, in which case the
             grids spanned by each dictionary in the list are explored.
             This enables searching over any sequence of parameter settings.
            -scoring: string from
             http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
            -num_cv: int, number of cross-validation folds
            -n_jobs: number of jobs to run in parallel.
-OUTPUT: - fitted gridsearch""" self.grid_search = GridSearchCV(self.text_clf, cv=num_cv, scoring=scoring, n_jobs=n_jobs, param_grid=param_grid, **kwargs) self.grid_search.fit(self.text, self.labels) def print_top_random_search(self, num_top=3): """print grid search results -INPUT: -num_top: int, number of top search results to print -OUTPUT: - printed top results""" results = self.grid_search.cv_results_ for i in range(1, num_top + 1): candidates = pd.np.flatnonzero(results['rank_test_score'] == i) for candidate in candidates: print("Model with rank: {0}".format(i)) print("Mean validation score: {0:.3f} (std: {1:.3f})".format( results['mean_test_score'][candidate], results['std_test_score'][candidate])) print("Parameters: {0}".format(results['params'][candidate])) print("") def get_top_random_search_parameters(self, num): """get parameters of top grid search -INPUT: - num: int, number of nth top rank parameters -OUTPUT: - dict of nth top parameters""" results = self.grid_search.cv_results_ candidates = pd.np.flatnonzero(results['rank_test_score'] == num) for candidate in candidates: return results['params'][candidate] def prepare_cv(self, n_iter, shuffle=True, random_state=1): """initialises stratified cross-validaton INPUT: - n_iter: int, number of cross validation iterations OUTPUT: - prepares k-fold cross validation object""" self.kf = StratifiedKFold(n_splits=n_iter, shuffle=shuffle, random_state=random_state) self.unique_labels = list(self.labels.unique()) def init_metrics_(self): """ initialise metrics, remove previous training metrics """ self.metrics_per_class = [] self.metrics_average = [] self.fpr = dict() self.tpr = dict() self.roc_auc = dict() self.precision = dict() self.recall = dict() self.average_precision = dict() self.y_proba = dict() self.y_real = dict() self.cv_labels_predicted = [] self.cv_labels_real = [] self.times_cv = [] self.time_train = [] for label_bin in range(len(self.labels_unique)): self.fpr[label_bin] = [] self.tpr[label_bin] = [] self.roc_auc[label_bin] = [] self.precision[label_bin] = [] self.recall[label_bin] = [] self.average_precision[label_bin] = [] self.y_real[label_bin] = [] self.y_proba[label_bin] = [] self.fpr["micro"] = [] self.tpr["micro"] = [] self.roc_auc["micro"] = [] self.precision["micro"] = [] self.recall["micro"] = [] self.average_precision["micro"] = [] self.y_real["micro"] = [] self.y_proba["micro"] = [] def calc_store_rocauc_precrec_(self, classifier_rocauc, proba_method, train_ids, test_ids): """calculate and store ROC AUC and precision recall curve metrics -INPUT: -classifier_roc_auc: sklearn OneVsRest classifier -proba_method: string, classifier method name for predicting label probability -train_ids: list of ids of samples used for training -test_ids: list of ids of samples used for testing -OUTPUT: -stored metrics for ROC AUC and precision recall curve """ y_score = None # roc auc stuff # some classifiers have method decision function, others predict proba to get scores if proba_method == "decision_function": y_score = classifier_rocauc.fit( self.text[train_ids], self.labels_bin[train_ids]).decision_function( self.text[test_ids]) elif proba_method == "predict_proba": y_score = classifier_rocauc.fit( self.text[train_ids], self.labels_bin[train_ids]).predict_proba( list(self.text[test_ids])) if y_score is None: return for i in range(len(self.unique_labels)): fpr_temp, tpr_temp, _ = roc_curve(self.labels_bin[test_ids][:, i], y_score[:, i]) self.fpr[i].append(fpr_temp) self.tpr[i].append(tpr_temp) self.roc_auc[i].append(auc(fpr_temp, 
            # precision-recall metrics
            precision_temp, recall_temp, _ = precision_recall_curve(
                self.labels_bin[test_ids][:, i], y_score[:, i])
            self.precision[i].append(precision_temp)
            self.recall[i].append(recall_temp)
            self.average_precision[i].append(
                average_precision_score(self.labels_bin[test_ids][:, i],
                                        y_score[:, i]))
            self.y_real[i].append(self.labels_bin[test_ids][:, i])
            self.y_proba[i].append(y_score[:, i])
        # Compute micro-average ROC curve and ROC area
        fpr_micro_temp, tpr_micro_temp, _ = roc_curve(
            self.labels_bin[test_ids].ravel(), y_score.ravel())
        self.fpr["micro"].append(fpr_micro_temp)
        self.tpr["micro"].append(tpr_micro_temp)
        self.roc_auc["micro"].append(auc(fpr_micro_temp, tpr_micro_temp))
        # precision-recall. A "micro-average": quantifying score on all
        # classes jointly
        prec_micro_temp, recall_micro_temp, _ = precision_recall_curve(
            self.labels_bin[test_ids].ravel(), y_score.ravel())
        self.precision["micro"].append(prec_micro_temp)
        self.recall["micro"].append(recall_micro_temp)
        # appended per fold (the original overwrote the list each fold)
        self.average_precision["micro"].append(
            average_precision_score(self.labels_bin[test_ids],
                                    y_score,
                                    average="micro"))
        self.y_real["micro"].append(self.labels_bin[test_ids].ravel())
        self.y_proba["micro"].append(y_score.ravel())

    def get_classifier_proba_method_(self, classifier):
        """Get the label probability method of a classifier. Some
        classifiers don't support predict_proba.
        -INPUT:
            -classifier: sklearn classifier whose probability calculation
             method is to be detected
        -OUTPUT:
            -string with method name"""
        proba_method = None
        if callable(getattr(classifier, "decision_function", None)):
            proba_method = "decision_function"
        elif callable(getattr(classifier, "predict_proba", None)):
            proba_method = "predict_proba"
        return proba_method

    def train(self, roc_auc=True):
        """Train the model and save metrics.
        -INPUT:
            -roc_auc: boolean, should ROC AUC (including precision-recall
             plot) metrics be saved
        -OUTPUT:
            -trained model with metrics"""
        self.init_metrics_()
        classifier_rocauc = OneVsRestClassifier(self.text_clf)
        # check if classifier has predict_proba or decision_function method
        proba_method = self.get_classifier_proba_method_(classifier_rocauc)
        for train, test in self.kf.split(self.text, self.labels):
            t0 = time()
            self.text_clf.fit(self.text[train], self.labels[train])
            time_cv = time() - t0
            self.times_cv.append(time_cv)
            labels_predict = self.text_clf.predict(list(self.text[test]))
            self.cv_labels_predicted.append(labels_predict)
            self.cv_labels_real.append(self.labels[test])
            labels_predict_label = labels_predict
            # per class metric, not average
            self.metrics_per_class.append(
                precision_recall_fscore_support(self.labels[test],
                                                labels_predict_label,
                                                average=None,
                                                labels=self.unique_labels))
            self.metrics_average.append(
                precision_recall_fscore_support(self.labels[test],
                                                labels_predict_label,
                                                average='weighted',
                                                labels=self.unique_labels))
            if roc_auc:
                self.calc_store_rocauc_precrec_(classifier_rocauc,
                                                proba_method, train, test)
        self.metrics_df = pd.DataFrame(self.metrics_per_class)
        self.metrics_average_df = pd.DataFrame(self.metrics_average)
        # finally make model with all training data
        t0 = time()
        self.text_clf.fit(self.text, self.labels)
        time_train = time() - t0
        self.time_train.append(time_train)

    def predict(self, text_list, proba=False):
        """Predict labels based on the trained classifier.
        -INPUT:
            -text_list: list of texts whose labels will be predicted
            -proba: boolean, if True probability will be predicted
        -OUTPUT:
            -dataframe of labels (with probabilities if proba is True)"""
        if proba:
            probas = []
            if callable(getattr(self.text_clf, "predict_proba", None)):
                probas = self.text_clf.predict_proba(text_list)
            # prefer predict_proba when both methods exist (the original
            # code let decision_function overwrite it)
            elif callable(getattr(self.text_clf, "decision_function", None)):
                probas = self.text_clf.decision_function(text_list)
            return pd.DataFrame(probas, columns=self.unique_labels)
        return self.text_clf.predict(text_list)

    def get_one_metric_cv(self, metric_name, average=False):
        """Extract one metric from precision_recall_fscore_support to
        compare it between classes.
        -INPUT:
            -metric_name: str, name of the metric to be extracted, one of:
             'precision', 'recall', 'f1', 'support'
            -average: boolean, True for the weighted average over all
             classes, False for per-class values
        -OUTPUT:
            -dataframe with metric from cross-validation"""
        ind = 0
        if metric_name == 'precision':
            ind = 0
        if metric_name == 'recall':
            ind = 1
        if metric_name == 'f1':
            ind = 2
        if metric_name == 'support':
            ind = 3
        if average:
            return pd.DataFrame(
                self.metrics_average_df[ind].values.tolist()).transpose()
        return pd.DataFrame(self.metrics_df[ind].values.tolist(),
                            columns=self.unique_labels).transpose()

    def make_metric_boxplot(self,
                            metric,
                            savefile=None,
                            average=False,
                            title=None,
                            x_tick_rotation=45):
        """Make a metric boxplot to compare cross-validation results
        between classes.
        -INPUT:
            -metric: metric to be used for the plot
            -savefile: path to file if plot is to be saved, else None
            -average: boolean, True to plot the weighted average over all
             classes, False for per-class boxplots
            -title: title of the plot, None if no title is to be used
        -OUTPUT:
            -plot of metric from cross-validation results"""
        metric_df = self.get_one_metric_cv(metric, average)
        # print results
        print("MEDIAN")
        print(metric_df.median(axis=1))
        print('MEAN')
        print(metric_df.mean(axis=1))
        plt.boxplot(metric_df)
        if title is not None:
            plt.title(title)
        # add class labels if not average
        if not average:
            plt.xticks([i + 1 for i in range(len(self.unique_labels))],
                       self.unique_labels,
                       rotation=x_tick_rotation)
        # set y-axis from 0 to 1
        axes = plt.gca()
        axes.set_ylim([0, 1])
        if savefile is not None:
            plt.savefig(savefile, bbox_inches='tight')

    def save_times(self, time_metrics_path, algorithm_name):
        """Save times from training.
        -INPUT:
            -time_metrics_path: str, path where to save time files
            -algorithm_name: str, name of the algorithm to add to file name
        -OUTPUT:
            -saved training times (from cv and final fit)"""
        df_time_cv = pd.DataFrame({'cv_times': self.times_cv})
        df_time_train = pd.DataFrame({'train_time': self.time_train})
        # the files are written with to_csv, so use a .csv extension
        # (the original code used .xlsx)
        df_time_cv.to_csv(os.path.join(time_metrics_path,
                                       algorithm_name + '_time_cv.csv'),
                          index=False)
        df_time_train.to_csv(os.path.join(time_metrics_path,
                                          algorithm_name + '_time_train.csv'),
                             index=False)

    def train_save_metrics(self,
                           pipeline,
                           metric_name,
                           algorithm_name,
                           plot_path=None,
                           metrics_path=None,
                           roc_auc_average=True,
                           roc_auc=True,
                           roc_auc_plot_cat_index=None,
                           num_cv=10,
                           random_state=1):
        """Wrapper function to train a model quickly with less code.
        -INPUT:
            -pipeline: list, sklearn pipeline
            -metric_name: metric whose plot is to be saved, one of 'f1',
             'precision', 'recall', 'support'
            -algorithm_name: name of the algorithm used; used in the plot
             title and file name
            -plot_path: path of the plot to be saved
            -metrics_path: path of the metrics files to be saved
            -num_cv: number of cross-validation folds
            -roc_auc_average: boolean, plot average ROC AUC over all classes
            -roc_auc_plot_cat_index: int, index of the category whose
             ROC AUC to plot
        -OUTPUT:
            -plot of the metric specified
            -ROC AUC plot
            -metrics files (per class and average)"""
        self.prepare_pipeline(pipeline)
        self.prepare_cv(num_cv, random_state=random_state)
        self.train(roc_auc=roc_auc)
        self.algorithm_name = algorithm_name
        if plot_path is not None:
            metric_plot_path = os.path.join(
                plot_path, algorithm_name + '_' + metric_name + '.png')
            roc_auc_plot_path = os.path.join(plot_path,
                                             algorithm_name + '_ROC_AUC.png')
            precision_recall_plot_path = os.path.join(
                plot_path, algorithm_name + '_prec_recall.png')
        else:
            metric_plot_path = None
            roc_auc_plot_path = None
            precision_recall_plot_path = None
        if metrics_path is not None:
            average_metrics_path = os.path.join(
                metrics_path, algorithm_name + '_average.xlsx')
            metrics_df_path = os.path.join(metrics_path,
                                           algorithm_name + '.xlsx')
            # save metrics
            self.metrics_average_df.to_excel(average_metrics_path,
                                             index=False)
            self.metrics_df.to_excel(metrics_df_path, index=False)
            self.save_times(metrics_path, self.algorithm_name)
        self.make_metric_boxplot(metric_name, metric_plot_path, True,
                                 '_'.join([algorithm_name, metric_name]))
        try:
            # at least NearestCentroid doesn't give enough metrics to make
            # the ROC AUC plot, so fail silently
            self.make_roc_auc_plot(savefile=roc_auc_plot_path,
                                   category_index=roc_auc_plot_cat_index,
                                   average=roc_auc_average,
                                   title=algorithm_name)
            self.make_precision_recall_plot(
                savefile=precision_recall_plot_path,
                category_index=roc_auc_plot_cat_index,
                average=roc_auc_average,
                title=algorithm_name)
        except Exception:
            print("Failed to generate roc_auc/precision_recall plot")

    def make_roc_auc_plot(self,
                          savefile=None,
                          category_index=0,
                          average=False,
                          title=""):
        """Make and save the ROC AUC plot.
        -INPUT:
            -savefile: string, filename and path where to save
            -category_index: int, index of the category for which
             information is to be retrieved
            -average: boolean, instead of one category, plot the macro
             average over the whole model
            -title: string
        -OUTPUT:
            -plot which is saved to path savefile"""
        tprs = []
        mean_fpr = np.linspace(0, 1, 100)
        if average:
            fpr = dict()
            tpr = dict()
            roc_auc = dict()
            macro_aucs = []  # per-fold macro AUCs, for the std in the legend
            plt.figure()
            for j in range(self.kf.n_splits):
                # First aggregate all false positive rates
                all_fpr = np.unique(
                    np.concatenate([
                        self.fpr[i][j]
                        for i in range(len(self.labels_unique))
                    ]))
                # Then interpolate all ROC curves at these points
                # (np.interp replaces the deprecated scipy.interp)
                mean_tpr = np.zeros_like(all_fpr)
                for i in range(len(self.labels_unique)):
                    mean_tpr += np.interp(all_fpr, self.fpr[i][j],
                                          self.tpr[i][j])
                # Finally average it and compute AUC
                mean_tpr /= len(self.labels_unique)
                fpr["macro"] = all_fpr
                tpr["macro"] = mean_tpr
                roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
                macro_aucs.append(roc_auc["macro"])
                # needed for average calculation on plot
                tprs.append(np.interp(mean_fpr, all_fpr, mean_tpr))
                tprs[-1][0] = 0.0
                # Plot all ROC curves
                plt.plot(fpr["macro"],
                         tpr["macro"],
                         label=r'ROC fold %d (AUC = %0.2f)' %
                         (j, roc_auc["macro"]),
                         lw=1,
                         alpha=0.3)
            # std over folds (the original took np.std of a single scalar,
            # which is always 0)
            std_auc = np.std(macro_aucs)
            plt_title_category = 'macro'
        else:
            # for some specific category
            for i in range(self.kf.n_splits):
                tprs.append(
                    np.interp(mean_fpr, self.fpr[category_index][i],
                              self.tpr[category_index][i]))
                tprs[-1][0] = 0.0
                plt.plot(self.fpr[category_index][i],
                         self.tpr[category_index][i],
                         lw=1,
                         alpha=0.3,
                         label='ROC fold %d (AUC = %0.2f)' %
                         (i, self.roc_auc[category_index][i]))
            plt.title("ROC AUC category:" +
                      self.labels_unique[category_index] + "_" + title)
            std_auc = np.std(self.roc_auc[category_index])
            plt_title_category = self.labels_unique[category_index]
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)
        plt.plot(mean_fpr,
                 mean_tpr,
                 color='b',
                 label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' %
                 (mean_auc, std_auc),
                 lw=2,
                 alpha=.8)
        std_tpr = np.std(tprs, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
        plt.fill_between(mean_fpr,
                         tprs_lower,
                         tprs_upper,
                         color='grey',
                         alpha=.2,
                         label=r'$\pm$ 1 std. dev.')
        plt.title("ROC AUC:" + plt_title_category + "_" + title)
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.plot([0, 1], [0, 1],
                 linestyle='--',
                 lw=2,
                 color='r',
                 label='Luck',
                 alpha=.8)
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc="lower right")
        if savefile is not None:
            plt.savefig(savefile, bbox_inches='tight')
        plt.show()

    def make_precision_recall_plot(self,
                                   average=True,
                                   category_index=None,
                                   title='',
                                   savefile=None):
        """Make a precision-recall plot based on cv results.
        -INPUT:
            -average: boolean, show the whole-model average precision-recall
             plot; if False, category_index must be set
            -category_index: int, index of the label whose precision-recall
             plot is to be shown
            -title: string, title for the plot
            -savefile: string, path+name of the file to be saved; if None
             it is not saved
        -OUTPUT:
            -plot, and if savefile is set, saved to that path"""
        plt.figure()
        if average:
            title_category = category_index = "micro"
        else:
            # some specific category
            title_category = self.labels_unique[category_index]
        for i in range(len(self.recall[0])):
            plt.step(self.recall[category_index][i],
                     self.precision[category_index][i],
                     lw=1,
                     alpha=0.3,
                     where='post',
                     label='Fold %d AUC = %0.2f' %
                     (i,
                      auc(self.recall[category_index][i],
                          self.precision[category_index][i])))
        y_real = np.concatenate(self.y_real[category_index])
        y_proba = np.concatenate(self.y_proba[category_index])
        avg_precision, avg_recall, _ = precision_recall_curve(y_real, y_proba)
        lab = 'Overall AUC=%.4f' % (auc(avg_recall, avg_precision))
        plt.step(avg_recall, avg_precision, label=lab, lw=2, color='black')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.legend(loc="lower right")
        plt.title('Precision-recall plot: ' + title_category + "_" + title)
        if savefile is not None:
            plt.savefig(savefile)
        plt.show()

    def make_average_auc_boxplot(self, title=None, savefile=None):
        """Make a boxplot of the macro-average AUC of each cv fold.
        -INPUT:
            -title: string, title of the plot
            -savefile: string, path to plot file
        -OUTPUT:
            -plot (saved if savefile is not None)"""
        fpr = dict()
        tpr = dict()
        roc_aucs = []
        plt.figure()
        for j in range(self.kf.n_splits):
            # First aggregate all false positive rates
            all_fpr = np.unique(
                np.concatenate(
                    [self.fpr[i][j] for i in range(len(self.labels_unique))]))
            # Then interpolate all ROC curves at these points
            mean_tpr = np.zeros_like(all_fpr)
            for i in range(len(self.labels_unique)):
                mean_tpr += np.interp(all_fpr, self.fpr[i][j], self.tpr[i][j])
            # Finally average it and compute AUC
            mean_tpr /= len(self.labels_unique)
            fpr["macro"] = all_fpr
            tpr["macro"] = mean_tpr
            roc_aucs.append(auc(fpr["macro"], tpr["macro"]))
        plt.boxplot(roc_aucs)
        axes = plt.gca()
        axes.set_ylim([0, 1])
        if title is not None:
            plt.title(title)
        if savefile is not None:
            plt.savefig(savefile, bbox_inches='tight')
        plt.show()

    def make_confusion_matrix(self, use_evaluation_data=False):
        """Makes a confusion matrix based on cv or evaluation data.
        -INPUT:
            -use_evaluation_data: boolean, if True use evaluation data
             instead of training data
        -OUTPUT:
            -confusion matrix"""
        if use_evaluation_data:
            y_real = self.labels_eval_real
            y_pred = self.labels_eval_predicted
        else:
            labels_real = self.cv_labels_real
            labels_predicted = self.cv_labels_predicted
            y_real = [item for sublist in labels_real for item in sublist]
            y_pred = [
                item for sublist in labels_predicted for item in sublist
            ]
        cm = confusion_matrix(y_real, y_pred, labels=self.labels_unique)
        return cm

    def plot_confusion_matrix(self,
                              cm=None,
                              classes=None,
                              normalize=False,
                              title='Confusion matrix',
                              cmap=plt.cm.Blues,
                              use_evaluation_data=False,
                              savefile=None):
        """Prints and plots the confusion matrix. Normalization can be
        applied by setting `normalize=True`.
        -INPUT:
            -cm: matrix, confusion matrix
            -classes: list, list of classes
            -normalize: boolean, normalize by dividing by cm row sums
            -title: str, title of plot
            -cmap: cmap, colormap for plot
            -use_evaluation_data: boolean, if True use evaluation data
        -OUTPUT:
            -plot of confusion matrix"""
        if classes is None:
            classes = self.labels_unique
        if cm is None:
            cm = self.make_confusion_matrix(
                use_evaluation_data=use_evaluation_data)
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')
        print(cm)
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)
        fmt = '.2f' if normalize else 'd'
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]),
                                      range(cm.shape[1])):
            plt.text(j,
                     i,
                     format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        plt.tight_layout()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        if savefile is not None:
            plt.savefig(savefile)

    def predict_evaluation_set(self, texts_eval, labels_eval_real):
        """Predict labels of the evaluation set.
        -INPUT:
            -texts_eval: list, list of texts to be used for evaluation
            -labels_eval_real: list, list of labels for texts_eval
        -OUTPUT:
            -initialized data for evaluation"""
        self.labels_eval_real = labels_eval_real
        self.labels_eval_predicted = self.text_clf.predict(list(texts_eval))

    def calc_evaluation_report(self,
                               texts_eval,
                               labels_eval_real,
                               savefile=None):
        """Return evaluation metrics.
        -INPUT:
            -texts_eval: list, list of texts to be used for evaluation
            -labels_eval_real: list of labels to be used for evaluation
            -savefile: str, path for saving metrics files
        -OUTPUT:
            -classification report (metrics files are also saved when
             savefile is given)"""
        self.predict_evaluation_set(texts_eval, labels_eval_real)
        if savefile is not None:
            eval_prec_rec_f1 = precision_recall_fscore_support(
                self.labels_eval_real,
                self.labels_eval_predicted,
                labels=self.labels_unique)
            df_eval_metrics = pd.DataFrame(
                np.vstack(eval_prec_rec_f1),
                index=['precision', 'recall', 'f1', 'support'],
                columns=self.labels_unique)
            eval_prec_rec_f1_average = precision_recall_fscore_support(
                self.labels_eval_real,
                self.labels_eval_predicted,
                average="weighted",
                labels=self.labels_unique)
            df_eval_metrics_average = pd.DataFrame(
                np.vstack(eval_prec_rec_f1_average),
                index=['precision', 'recall', 'f1', 'support'],
                columns=['weighted'])
            df_eval_metrics.to_csv(savefile + "_" + self.algorithm_name +
                                   ".csv",
                                   index=False)
            df_eval_metrics_average.to_csv(
                savefile + "_" + self.algorithm_name + "_average.csv",
                index=False)
        return classification_report(self.labels_eval_real,
                                     self.labels_eval_predicted)

    def plot_acc_vs_nsamples(self,
                             metric_name='f1',
                             trendline=True,
                             savefile=None):
        """Plot number of samples vs accuracy.
        -INPUT:
            -metric_name: string, name of the metric to be used for
             accuracy (f1, precision, recall)
            -trendline: boolean, add trendline to plot
            -savefile: string, if set, saves the plot under this name
        -OUTPUT:
            -plot with number of samples vs accuracy"""
        metric = pd.DataFrame(
            self.get_one_metric_cv(metric_name=metric_name).mean(1))
        # .iloc replaces the removed pandas .ix accessor
        nsamples = pd.DataFrame(
            pd.DataFrame(self.labels).iloc[:, 0].value_counts())
        metric = metric.merge(nsamples, left_index=True, right_index=True)
        metric.columns = ['acc', 'nsamples']
        metric.plot.scatter(x='nsamples', y='acc')
        z = np.polyfit(metric['nsamples'], metric['acc'], 1)
        p = np.poly1d(z)
        if trendline:
            plt.plot(metric['nsamples'], p(metric['nsamples']), "r--")
        for i, txt in enumerate(metric.index):
            plt.annotate(txt,
                         (metric['nsamples'][i], p(metric['nsamples'])[i]))
        # axis labels were swapped in the original
        plt.xlabel('number of samples')
        plt.ylabel(metric_name)
        if savefile:
            plt.savefig(savefile)

    def pickle(self, filename):
        """Save the class instance to a file.
        -INPUT:
            -filename: str, filename to save the ClassifierCv object
        -OUTPUT:
            -pickled ClassifierCv object"""
        with open(filename, 'wb') as f:
            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)

    @staticmethod
    def unpickle(filename):
        """Read a class instance from a file."""
        with open(filename, 'rb') as f:
            return pickle.load(f)
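# ClassifierCv references TwoLabelBinarizer, which is not defined in this
# file. A minimal sketch of what such a helper could look like (an
# assumption, not the original implementation): sklearn's LabelBinarizer
# returns a single column for binary problems, so this variant pads the
# output to the two-column shape the ROC/PR code above expects.
from sklearn.preprocessing import LabelBinarizer


class TwoLabelBinarizer(LabelBinarizer):
    def transform(self, y):
        Y = super().transform(y)
        if self.y_type_ == 'binary':
            # add the complementary column so binary output is (n, 2)
            return np.hstack((1 - Y, Y))
        return Y


# Hypothetical usage sketch (the CSV file and column names are illustrative
# only; assumes the NLTK stopwords corpus is downloaded):
if __name__ == '__main__':
    df = pd.read_csv('texts.csv')  # expected columns: 'label', 'text'
    clf_cv = ClassifierCv(df['label'], df['text'])
    clf_cv.prepare_pipeline()    # default CountVectorizer+TfIdf+SGD pipeline
    clf_cv.prepare_cv(n_iter=5)  # 5-fold stratified cross-validation
    clf_cv.train(roc_auc=True)
    print(clf_cv.get_one_metric_cv('f1'))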