def test_pandas_confusion_normalized():
    """Check that the normalized confusion matrix's row sums add up to the row count."""
    actual = [2, 0, 2, 2, 0, 1, 1, 2, 2, 0, 1, 2]
    predicted = [0, 0, 2, 1, 0, 2, 1, 0, 2, 0, 2, 2]

    matrix = ConfusionMatrix(actual, predicted)
    # Three distinct labels are used above; the factory is expected to hand
    # back the labeled (non-binary) matrix type.
    assert isinstance(matrix, pdml.confusion_matrix.LabeledConfusionMatrix)

    raw_frame = matrix.to_dataframe()
    normalized_frame = matrix.to_dataframe(normalized=True)

    # Sum every row of the normalized frame, then total those row sums.
    row_totals = normalized_frame.sum(axis=1)
    assert (row_totals.sum() == len(raw_frame))
def test_pandas_confusion_normalized(self):
    """Check that the normalized confusion matrix's row sums add up to the row count."""
    actual = [2, 0, 2, 2, 0, 1, 1, 2, 2, 0, 1, 2]
    predicted = [0, 0, 2, 1, 0, 2, 1, 0, 2, 0, 2, 2]

    matrix = ConfusionMatrix(actual, predicted)
    # Three distinct labels are used above; the factory is expected to hand
    # back the labeled (non-binary) matrix type.
    assert isinstance(matrix, pdml.confusion_matrix.LabeledConfusionMatrix)

    raw_frame = matrix.to_dataframe()
    normalized_frame = matrix.to_dataframe(normalized=True)

    # Sum every row of the normalized frame, then total those row sums.
    row_totals = normalized_frame.sum(axis=1)
    assert (row_totals.sum() == len(raw_frame))
def calculate_accuracy(csv_filename):
    """Score predictions stored in a CSV file and export the confusion matrix.

    The CSV must contain the columns ``Actual_Statement`` (ground truth) and
    ``Prediction`` (model output). The accuracy score and the confusion
    matrix are printed, and both are written to
    ``azure_text_confusion_matrix_output.xlsx`` in the working directory.

    Args:
        csv_filename: Path of the CSV file to read.
    """
    data = pd.read_csv(csv_filename)
    y_test = data['Actual_Statement']
    y_pred = data['Prediction']

    score = accuracy_score(y_test, y_pred)
    # A single formatted string prints the same text under Python 2 and 3,
    # unlike the former ``print 'label', value`` statement, which is a syntax
    # error on Python 3.
    print('Accuracy Score :  %s' % score)

    matrix = ConfusionMatrix(y_test, y_pred)
    output = matrix.to_dataframe()

    accuracy_text = 'Accuracy Score : ' + str(score)
    # The context manager saves and closes the workbook on exit, so no
    # explicit ``writer.save()`` call is needed.
    with pd.ExcelWriter("azure_text_confusion_matrix_output.xlsx") as writer:
        # Leave the first rows free for the accuracy line written below.
        output.to_excel(writer, sheet_name='Sheet1', startrow=4, startcol=0)
        worksheet = writer.sheets['Sheet1']
        # NOTE(review): ``write(row, col, value)`` presumably relies on the
        # xlsxwriter engine being the active Excel backend; confirm.
        worksheet.write(1, 0, accuracy_text)

    print("Confusion matrix:\n%s" % matrix)
def test_pandas_confusion_normalized_issue1(self):
    """Regression test for issue 1: normalizing when one class has no true samples.

    See http://stackoverflow.com/questions/19233771/sklearn-plot-confusion-matrix-with-labels/31720054#31720054
    """
    # Every sample is truly 'business'; 'health' only ever shows up as a
    # prediction.
    actual = ['business'] * 20
    predicted = [
        'health', 'business', 'business', 'business', 'business',
        'business', 'health', 'health', 'business', 'business',
        'business', 'business', 'business', 'business', 'business',
        'business', 'health', 'health', 'business', 'health',
    ]

    matrix = ConfusionMatrix(actual, predicted)
    assert isinstance(matrix, pdml.confusion_matrix.BinaryConfusionMatrix)

    raw_frame = matrix.to_dataframe()
    normalized_frame = matrix.to_dataframe(normalized=True)

    # Row sums that come out as NaN are counted as 1 before totalling.
    row_totals = normalized_frame.sum(axis=1, skipna=False)
    row_totals = row_totals.fillna(1)
    assert(row_totals.sum() == len(raw_frame))
def on_epoch_end(self, epoch, logs=None):
    """Evaluate on the validation generator and record confusion matrices.

    Runs the model over ``self.validation_steps`` batches drawn from
    ``self.validation_data``, computes the log loss and accuracies, appends
    the confusion matrices to ``confusion_matrix.txt`` and
    ``wanted_confusion_matrix.txt``, and stores the metrics in ``logs``.

    Args:
        epoch: Index of the epoch that just finished; used in the report line.
        logs: Metric dictionary updated in place. When omitted, a throwaway
            dictionary is used so the method no longer fails with a
            ``TypeError`` on the final assignments.
    """
    if logs is None:
        logs = {}

    y_true, y_pred = [], []
    for _ in range(self.validation_steps):
        X_batch, y_true_batch = next(self.validation_data)
        y_pred_batch = self.model.predict(X_batch)
        y_true.extend(y_true_batch)
        y_pred.extend(y_pred_batch)

    y_true = np.float32(y_true)
    y_pred = np.float32(y_pred)
    val_loss = log_loss(y_true, y_pred)

    # Map the arg-max class indices to their string labels.
    y_true = [self.int2label[y] for y in y_true.argmax(axis=-1)]
    y_pred = [self.int2label[y] for y in y_pred.argmax(axis=-1)]

    confusion = ConfusionMatrix(y_true, y_pred)
    accs = self.accuracies(confusion._df_confusion.values)
    acc = self.accuracy(confusion._df_confusion.values)

    # Same again, with every label outside the wanted words collapsed into
    # a single '_unknown_' class.
    y_true = [y if y in self.wanted_words else '_unknown_' for y in y_true]
    y_pred = [y if y in self.wanted_words else '_unknown_' for y in y_pred]
    wanted_words_confusion = ConfusionMatrix(y_true, y_pred)
    wanted_accs = self.accuracies(wanted_words_confusion._df_confusion.values)

    acc_line = ('\n[%03d]: val_categorical_accuracy: %.2f, '
                'val_mean_categorical_accuracy_wanted: %.2f') % (
                    epoch, acc, wanted_accs.mean())  # noqa

    # Both report files are opened in append mode, so they grow by one
    # entry per epoch.
    with open('confusion_matrix.txt', 'a') as f:
        f.write('%s\n' % acc_line)
        f.write(confusion.to_dataframe().to_string())
    with open('wanted_confusion_matrix.txt', 'a') as f:
        f.write('%s\n' % acc_line)
        f.write(wanted_words_confusion.to_dataframe().to_string())

    logs['val_loss'] = val_loss
    logs['val_categorical_accuracy'] = acc
    logs['val_mean_categorical_accuracy_all'] = accs.mean()
    logs['val_mean_categorical_accuracy_wanted'] = wanted_accs.mean()
def gen_conf_mat(y_pred, y_true):
    """Return the confusion matrix as a DataFrame with readable label names.

    Numeric labels found among the matrix columns are replaced, on both the
    rows and the columns, by the names in the lookup table below; labels
    without an entry are left as they are.
    """
    label_names = {
        0: 'background',
        63: 'liver',
        126: 'l_kidney',
        189: 'r_kidney',
        252: 'spleen',
    }

    frame = ConfusionMatrix(y_true, y_pred).to_dataframe()

    # Keep only the lookup entries whose numeric label is one of the columns.
    present = frame.columns.tolist()
    renames = {}
    for code, label in label_names.items():
        if code in present:
            renames[code] = label

    frame.rename(index=renames, columns=renames, inplace=True)
    return frame
def gen_test_report(clf, y_test, X_test, args, sub_str='_test_report_per_cls'):
    """Write the test-set confusion matrix and classification report as CSV.

    Both files go to ``args.output_model_dir`` and are named from
    ``args.f_nm_str``: the confusion matrix gets the suffix
    ``_confusion_matrix``, the classification report gets ``sub_str``.
    The raw report dictionary is also printed.
    """
    predictions = clf.predict(X_test)

    # Confusion matrix of the test predictions.
    confusion_frame = ConfusionMatrix(y_test, predictions).to_dataframe()
    confusion_path = os.path.join(args.output_model_dir,
                                  args.f_nm_str + '_confusion_matrix')
    confusion_frame.to_csv(confusion_path)

    report_name = args.f_nm_str + sub_str

    # Classification report, transposed before it is written out.
    report = classification_report(y_test, predictions, output_dict=True)
    report_frame = pd.DataFrame(report).transpose()
    print(report)
    report_path = os.path.join(args.output_model_dir, report_name)
    report_frame.to_csv(report_path)
# Draw one confusion-matrix heatmap per difficulty level (0, 1, 2) from the
# rows of ./data/analysis.csv.
# NOTE(review): the statement nesting below was reconstructed from a
# whitespace-collapsed source; confirm that everything after the `for` line
# belongs to the loop body.
df = pd.read_csv('./data/analysis.csv')
for difficulty in [0,1,2]:
    # Rows of the current difficulty level only.
    sub_df = df.loc[df['difficulty']==difficulty]
    # NOTE(review): 'predicted' is passed first and 'actual' second; confirm
    # this matches the argument order ConfusionMatrix expects.
    cmat = ConfusionMatrix(sub_df['predicted'],sub_df['actual'])
    #cmat.print_stats()
    #print dir(cmat)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # fmt="d" annotates each cell with its integer count.
    sns.heatmap(cmat.to_dataframe(),square=True, annot=True, fmt="d", cmap=plt.cm.bone_r)
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    # Current tick label texts of the y axis (a) and the x axis (b).
    a=[item.get_text() for item in ax.get_yticklabels()]
    b=[item.get_text() for item in ax.get_xticklabels()]
    # Display names for the raw class labels; a label missing from this
    # mapping raises KeyError in the lookup below.
    conversion = {'sympathomimetic': "Sympathomimetic", 'sedative_hypnotic': "Sedative-Hypnotic", 'cholinergic':"Cholinergic", 'anticholinergic':"Anticholinergic", 'opioid':"Opioid", "serotonin_syndrome":"Serotonin Syndrome"}
    # NOTE(review): the y tick labels are built from the x-axis texts (b)
    # while `a` is never used; presumably both axes carry the same labels in
    # the same order - verify.
    new_ticklabels = [conversion[label] for label in b]
    ax.set_yticklabels(new_ticklabels)
import seaborn as sns import matplotlib.pyplot as plt from pandas_ml import ConfusionMatrix df = pd.read_csv('./data/analysis.csv') cmat = ConfusionMatrix(df['predicted'], df['actual']) #cmat.print_stats() #print dir(cmat) fig = plt.figure() ax = fig.add_subplot(111) sns.heatmap(cmat.to_dataframe(), square=True, annot=True, fmt="d", cmap=plt.cm.bone_r) plt.xticks(rotation=90) plt.yticks(rotation=0) a = [item.get_text() for item in ax.get_yticklabels()] b = [item.get_text() for item in ax.get_xticklabels()] conversion = { 'sympathomimetic': "Sympathomimetic", 'sedative_hypnotic': "Sedative-Hypnotic", 'cholinergic': "Cholinergic", 'anticholinergic': "Anticholinergic", 'opioid': "Opioid",
# Bin the 'rome' column and the 'predicted' column into ten quantile classes
# labelled 1..10; r_bins keeps the bin edges computed for 'rome'.
df['actual_class'], r_bins = pd.qcut(df['rome'], 10, labels=list(range(1, 11)), retbins=True)
df['predic_class'] = pd.qcut(df['predicted'], 10, labels=list(range(1, 11)))
# Alternative kept for reference: bin the predictions with the 'rome' bin
# edges instead of their own quantiles:
#   pd.cut(df['predicted'], bins=r_bins, labels=list(range(1, 11)))

cm = ConfusionMatrix(df['actual_class'].to_list(), df['predic_class'].to_list())
cm.print_stats()
statdict = cm.stats()
cm_stats = statdict['class']

matrix = cm.to_dataframe()
matrix.index.rename('ROME decile', inplace=True)
# Raw string: '\m' is not a valid escape sequence, so the non-raw literal
# triggers an invalid-escape warning on recent Python versions. The
# resulting text is unchanged.
matrix.columns.rename(r'ROME$_\mathrm{NN}$ decile', inplace=True)

plt.close()
# Colours come from the counts divided by a tenth of the sample count, while
# the annotations show the raw integer counts.
# NOTE(review): `predicted` is not defined in the visible code; presumably it
# is created earlier in the script - verify.
ax = sns.heatmap(matrix / (len(predicted) // 10),
                 annot=matrix,
                 fmt='d',
                 cmap='gray_r',
                 vmin=0.0,
                 vmax=0.56)
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=20)
cbar.set_label("Percentage in decile [1]", fontsize=23)
ax.set_xticklabels(ax.get_xticklabels(), fontsize=20)
class EncoderWithClassifier:
    """Encoder model topped with a single dense classification layer.

    The classifier model is built and compiled on construction. ``train``
    fits it, ``eval`` stores its predictions for a labelled data set and
    ``eval_stats`` writes confusion-matrix reports for those predictions.
    """

    # NOTE(review): the default optimizer object is created once, when the
    # class body is executed, so every instance that does not pass its own
    # optimizer shares it.
    def __init__(self,
                 encoder_model,
                 name='',
                 activation='sigmoid',
                 loss_function='categorical_crossentropy',
                 optimizer=SGD(lr=0.01),
                 use_last_dim_as_classifier_dim=True,
                 classifier_dim=None):
        """Build and compile the classifier.

        Args:
            encoder_model: Model whose last layer feeds the classifier layer.
            name: Prefix used for the report file names.
            activation: Activation of the classifier layer.
            loss_function: Loss passed to ``compile``.
            optimizer: Optimizer passed to ``compile``.
            use_last_dim_as_classifier_dim: When true, the classifier layer
                gets as many units as the encoder's last layer.
            classifier_dim: Number of classifier units; only used when
                ``use_last_dim_as_classifier_dim`` is false.

        Raises:
            ValueError: If an explicit ``classifier_dim`` is required but is
                missing or not greater than zero.
        """
        self.__encoder_model = encoder_model
        self.__name = name
        self.__activation = activation
        self.__loss_function = loss_function
        self.__optimizer = optimizer
        self.__trained = True
        self.__validated = True
        self.__use_last_dim_as_classifier_dim = use_last_dim_as_classifier_dim
        self.__classifier_dim = classifier_dim
        self.__validateDimensions()
        self.__generateClassifier()
        self.__compile()

    def __validateDimensions(self):
        """Reject a missing or non-positive explicit classifier size."""
        if self.__use_last_dim_as_classifier_dim:
            return
        # A missing size is rejected like a non-positive one, instead of
        # failing on the ``None <= 0`` comparison.
        if self.__classifier_dim is None or self.__classifier_dim <= 0:
            raise ValueError(
                "The number of neurons in a layer (classifier_dim) must be greater than zero"
            )

    def __generateClassifier(self):
        """Append the dense classifier layer to the encoder's last layer."""
        last_layer = self.__encoder_model.layers[-1]
        if self.__use_last_dim_as_classifier_dim:
            dim = last_layer.units
        else:
            dim = self.__classifier_dim
        self.__classifier_layers = Dense(dim,
                                         activation=self.__activation,
                                         name='classifier')(last_layer.output)
        self.__classifier = Model(inputs=[self.__encoder_model.input],
                                  outputs=[self.__classifier_layers])

    def __compile(self):
        """Compile the classifier and keep the result of ``summary()``."""
        self.__classifier.compile(loss=self.__loss_function,
                                  optimizer=self.__optimizer,
                                  metrics=['acc'])
        # NOTE(review): this stores whatever ``summary()`` returns; confirm
        # that it is the value readers of the ``summary`` property expect.
        self.__summary = self.__classifier.summary()

    def __stats(self, path=None):
        """Build the confusion matrix of the stored predictions and dump it.

        ``path`` is prepended verbatim to the report file names; ``None``
        means no prefix.
        """
        prefix = '' if path is None else path
        file_pattern = prefix + self.__name + '.{0}.{1}'
        # Turn the per-class score rows into class indices.
        predicted_classes = np.argmax(self.__classifier_predictions, axis=1)
        true_classes = np.argmax(self.__eval_label, axis=1)
        self.__confusion_matrix = ConfusionMatrix(true_classes,
                                                  predicted_classes)
        self.__status_dump(file_pattern,
                           self.__confusion_matrix,
                           html=True,
                           string=True,
                           pickle=True,
                           stats_as_txt=True,
                           latex=True)

    def __status_dump(self,
                      file_pattern,
                      confusion_matrix,
                      html=False,
                      string=False,
                      pickle=False,
                      stats_as_txt=False,
                      latex=False):
        """Write ``confusion_matrix`` in each of the requested formats.

        ``file_pattern`` receives the report kind and the file extension.
        The pattern already contains the separating dots, so extensions are
        given without a leading dot.
        """
        dataframe = confusion_matrix.to_dataframe()
        if html:
            with open(file_pattern.format('confusion_matrix', 'html'),
                      'w') as handle:
                handle.write(dataframe.to_html())
        if string:
            with open(file_pattern.format('confusion_matrix', 'txt'),
                      'w') as handle:
                handle.write(dataframe.to_string())
        if pickle:
            dataframe.to_pickle(
                file_pattern.format('confusion_matrix', 'pickle'))
        if stats_as_txt:
            with open(file_pattern.format('stats', 'txt'), 'w') as handle:
                handle.write(str(confusion_matrix.stats()))
        if latex:
            with open(file_pattern.format('confusion_matrix', 'latex_table'),
                      'w') as handle:
                handle.write(dataframe.to_latex())

    def eval(self, feature=None, label=None):
        """Predict on ``feature`` and keep the result for ``eval_stats``."""
        self.__eval_feature = feature
        self.__eval_label = label
        self.__classifier_predictions = self.__classifier.predict(feature)

    def train(self,
              feature=None,
              label=None,
              validation=None,
              epochs=None,
              batch_size=None,
              shuffle=True,
              store_history=True,
              early_stopping=None,
              save_every=1,
              callbacks=None):
        """Fit the classifier.

        ``early_stopping`` and ``save_every`` are accepted but not used by
        this method. The object returned by ``fit`` is kept only when
        ``store_history`` is true.
        """
        history = self.__classifier.fit(x=feature,
                                        y=label,
                                        validation_data=validation,
                                        batch_size=batch_size,
                                        epochs=epochs,
                                        shuffle=shuffle,
                                        callbacks=callbacks)
        if store_history:
            self.__history = history

    def eval_stats(self, reportpath):
        """Write the report files, using ``reportpath`` as file name prefix.

        Reads the predictions and labels stored by ``eval``.
        """
        self.__stats(path=reportpath)

    @property
    def classifier(self):
        """The compiled classifier model."""
        return self.__classifier

    @property
    def summary(self):
        """The value stored from ``summary()`` when the model was compiled."""
        return self.__summary
class Autoencoder:
    """Dense autoencoder built from a list of layer sizes.

    ``encoder_layers[0]`` is the input size and the remaining entries are the
    encoder layer sizes. The decoder mirrors the list without its last entry,
    so it ends on the input size.
    """

    # NOTE(review): the default optimizer object is created once, when the
    # class body is executed, so every instance that does not pass its own
    # optimizer shares it.
    def __init__(self,
                 encoder_layers,
                 name='',
                 hidden_layer_activation='relu',
                 output_layer_activation='relu',
                 loss_function='mse',
                 optimizer=SGD(lr=0.01),
                 discard_decoder_model=False):
        """Build and compile the autoencoder.

        Args:
            encoder_layers: Layer sizes, input size first; at least two
                entries are required.
            name: Prefix used for report file names and for the name of the
                classifier returned by ``get_classifier``.
            hidden_layer_activation: Activation of every layer but the last.
            output_layer_activation: Activation of the last decoder layer.
            loss_function: Loss passed to ``compile``.
            optimizer: Optimizer passed to ``compile``.
            discard_decoder_model: Stored, but not read by the visible code.

        Raises:
            ValueError: If ``encoder_layers`` is empty or has fewer than two
                entries.
        """
        self.__name = name
        self.__encoder_layers_config = encoder_layers
        self.__discard_decoder_model = discard_decoder_model
        self.__hidden_layers_activation = hidden_layer_activation
        self.__output_layer_activation = output_layer_activation
        self.__loss_function = loss_function
        self.__optimizer = optimizer
        self.__trained = False
        self.__validated = False
        self.__generateModels()
        self.__compile()

    def __validateEncoderLayers(self):
        """Validate the configured layer sizes.

        Raises ValueError when no sizes were given or when the list holds
        fewer than two entries.
        """
        print('self.__encoder_layers_config ', self.__encoder_layers_config)
        if not self.__encoder_layers_config:
            raise ValueError(
                'A list with the numbers of neurons in each layer is required.'
            )
        if len(self.__encoder_layers_config) <= 1:
            raise ValueError(
                'To generate an autoencoder you have to provide at least 2 layers (two items of a list).'
            )

    def __generateEncoder(self, input_tensor):
        """Chain one dense layer per configured size after the input size."""
        tensor = input_tensor
        for index, neurons in enumerate(self.__encoder_layers_config[1:]):
            tensor = Dense(neurons,
                           activation=self.__hidden_layers_activation,
                           name='enc{}_{}'.format(index, neurons))(tensor)
        self.__encoder_layers = tensor
        self.__encoder_model = Model(inputs=[input_tensor],
                                     outputs=[self.__encoder_layers])

    def __generateDecoder(self):
        """Chain the mirrored dense layers after the encoder output.

        The last decoder layer uses the output activation. This now also
        holds when the decoder consists of a single layer, which previously
        received the hidden-layer activation.
        """
        decoder_sizes = list(reversed(self.__encoder_layers_config[:-1]))
        last_index = len(decoder_sizes) - 1
        tensor = self.__encoder_layers
        for index, neurons in enumerate(decoder_sizes):
            if index == last_index:
                activation = self.__output_layer_activation
            else:
                activation = self.__hidden_layers_activation
            tensor = Dense(neurons,
                           activation=activation,
                           name='dec{}_{}'.format(index, neurons))(tensor)
        self.__decoder_layers = tensor

    def __generateModels(self):
        """Validate the configuration and build encoder and autoencoder."""
        self.__validateEncoderLayers()
        input_tensor = Input(shape=(self.__encoder_layers_config[0], ))
        self.__generateEncoder(input_tensor)
        self.__generateDecoder()
        self.__autoencoder = Model(inputs=[input_tensor],
                                   outputs=[self.__decoder_layers])
        # NOTE(review): no stand-alone decoder model is built here, so the
        # ``decoder_model`` property has nothing to return.

    def __compile(self):
        """Compile the autoencoder and keep the result of ``summary()``."""
        self.__autoencoder.compile(loss=self.__loss_function,
                                   optimizer=self.__optimizer,
                                   metrics=['accuracy'])
        # NOTE(review): this stores whatever ``summary()`` returns; confirm
        # that it is the value readers of the ``summary`` property expect.
        self.__summary = self.__autoencoder.summary()

    def train_and_eval(self,
                       feature=None,
                       feature_validation=None,
                       epochs=1000,
                       batch_size=32,
                       shuffle=True,
                       store_history=True,
                       callbacks=None):
        """Fit the autoencoder to reproduce ``feature``.

        ``feature_validation`` is used as both validation input and target
        when given; without it no validation data is passed. The object
        returned by ``fit`` is kept only when ``store_history`` is true.
        The instance is marked trained and validated afterwards.
        """
        validation_data = None
        # Identity check: ``== None`` on an array compares element-wise.
        if feature_validation is not None:
            validation_data = (feature_validation, feature_validation)
        history = self.__autoencoder.fit(x=feature,
                                         y=feature,
                                         validation_data=validation_data,
                                         shuffle=shuffle,
                                         epochs=epochs,
                                         batch_size=batch_size,
                                         callbacks=callbacks)
        if store_history:
            self.__history = history
        self.__trained = True
        self.__validated = True

    def __stats(self):
        """Build the confusion matrix of the stored predictions and dump it.

        NOTE(review): nothing visible in this class assigns the label
        attribute read here; verify that this path is reachable.
        """
        file_pattern = 'reports/' + self.__name + '.{0}.{1}'
        predicted_classes = np.argmax(self.__classifier_predictions, axis=1)
        true_classes = np.argmax(self.__eval_label, axis=1)
        self.__confusion_matrix = ConfusionMatrix(true_classes,
                                                  predicted_classes)
        self.__status_dump(file_pattern,
                           self.__confusion_matrix,
                           html=True,
                           string=True,
                           pickle=True,
                           stats_as_txt=True,
                           latex=True)

    def __status_dump(self,
                      file_pattern,
                      confusion_matrix,
                      html=False,
                      string=False,
                      pickle=False,
                      stats_as_txt=False,
                      latex=False):
        """Write ``confusion_matrix`` in each of the requested formats.

        ``file_pattern`` receives the report kind and the file extension.
        The pattern already contains the separating dots, so extensions are
        given without a leading dot.
        """
        dataframe = confusion_matrix.to_dataframe()
        print('Report!')
        print('file_pattern ', file_pattern)
        if html:
            with open(file_pattern.format('confusion_matrix', 'html'),
                      'w') as handle:
                handle.write(dataframe.to_html())
        if string:
            with open(file_pattern.format('confusion_matrix', 'txt'),
                      'w') as handle:
                handle.write(dataframe.to_string())
        if pickle:
            dataframe.to_pickle(
                file_pattern.format('confusion_matrix', 'pickle'))
        if stats_as_txt:
            with open(file_pattern.format('stats', 'txt'), 'w') as handle:
                handle.write(str(confusion_matrix.stats()))
        if latex:
            with open(file_pattern.format('confusion_matrix', 'latex_table'),
                      'w') as handle:
                handle.write(dataframe.to_latex())

    def get_classifier(self,
                       activation=None,
                       loss_function=None,
                       optimizer=None,
                       use_last_dim_as_classifier_dim=None,
                       classifier_dim=None):
        """Return an ``EncoderWithClassifier`` built on the trained encoder.

        Arguments left at ``None`` are not forwarded, so the defaults of
        ``EncoderWithClassifier`` apply to them. When
        ``use_last_dim_as_classifier_dim`` is left unset and a
        ``classifier_dim`` is given, the explicit size is used.

        Returns:
            The classifier wrapper, or ``None`` (after logging a message)
            when the autoencoder has not been trained and validated yet.
        """
        if not (self.__trained and self.__validated):
            logging.info(
                "impossible to create a classifier. Autoencoder isn't trained or validated!"
            )
            return None

        if use_last_dim_as_classifier_dim is None and classifier_dim is not None:
            # An unset flag used to be falsy downstream, which selected the
            # explicit size; keep that outcome.
            use_last_dim_as_classifier_dim = False

        candidates = {
            'activation': activation,
            'loss_function': loss_function,
            'optimizer': optimizer,
            'use_last_dim_as_classifier_dim': use_last_dim_as_classifier_dim,
            'classifier_dim': classifier_dim,
        }
        # Forwarding None would override the constructor defaults.
        options = {
            key: value
            for key, value in candidates.items() if value is not None
        }
        return EncoderWithClassifier(self.__encoder_model,
                                     name=self.__name + '_classifier',
                                     **options)

    def eval(self, feature=None):
        """Store ``feature`` and the predictions made for it.

        NOTE(review): nothing visible in this class assigns the classifier
        attribute used here; verify that this method is usable.
        """
        self.__eval_feature = feature
        self.__classifier_predictions = self.__classifier.predict(feature)

    def eval_stats(self):
        """Write the confusion-matrix report files."""
        self.__stats()

    @property
    def encoder_model(self):
        """Model mapping the input to the encoder output."""
        return self.__encoder_model

    @property
    def decoder_model(self):
        """Stand-alone decoder model.

        NOTE(review): the visible code never assigns this attribute.
        """
        return self.__decoder_model

    @property
    def autoencoder(self):
        """The compiled autoencoder model."""
        return self.__autoencoder

    @property
    def training_history(self):
        """Object returned by ``fit``; set only when history was stored."""
        return self.__history

    @property
    def summary(self):
        """The value stored from ``summary()`` when the model was compiled."""
        return self.__summary