示例#1
0
    def make_hist_top1(self, hist_pattern):
        """Save overlaid histograms of top-1 probabilities, known vs. unknown.

        Predictions are obtained through ``cm.get_preds_top1`` for the data in
        ``self.known_data_folder`` and ``self.unknown_data_folder``.

        Args:
            hist_pattern: Output file name pattern. It is formatted with two
                positional arguments, the generation timestamp
                (``%Y%m%d %H%M%S``) and the number of known classes, so it
                may contain up to two ``{}`` placeholders.
        """
        # Prepare data generators
        known_data_iterator = cm.get_data_iterator(self.known_data_folder,
                                                   self.target_size,
                                                   is_categorical=True)
        print("self.unknown_data_folder: {}".format(self.unknown_data_folder))
        unknown_data_iterator = cm.get_data_iterator(self.unknown_data_folder,
                                                     self.target_size,
                                                     is_categorical=False)

        # Get top 1 probabilities
        print("Getting predictions...")
        started = datetime.now()
        preds_known = cm.get_preds_top1(self.model, known_data_iterator)
        preds_unknown = cm.get_preds_top1(self.model, unknown_data_iterator)
        print("Predictions done in {} sec".format(
            (datetime.now() - started).total_seconds()))

        # Shift probabilities that are (numerically) 1 by 0.01 so that they
        # land in a bin of their own instead of merging with the 0.99x bin.
        eps = 1e-7
        preds_known = np.asarray(preds_known, dtype=float)
        preds_known = np.where(preds_known > 1 - eps, preds_known + 0.01,
                               preds_known)
        preds_unknown = np.asarray(preds_unknown, dtype=float)
        preds_unknown = np.where(preds_unknown > 1 - eps,
                                 preds_unknown + 0.01, preds_unknown)

        # Resolve the output name first so a bad pattern fails before any
        # figure is created.
        hist_file = hist_pattern.format(
            datetime.now().strftime("%Y%m%d %H%M%S"),
            known_data_iterator.num_classes)

        # Draw on a fresh figure: without it the bars are added to whatever
        # figure happens to be current (e.g. one left open by another plot).
        plt.figure()
        plt.hist(preds_known,
                 200,
                 alpha=0.5,
                 label='known ({} samples)'.format(len(preds_known)))
        plt.hist(preds_unknown,
                 200,
                 alpha=0.5,
                 label='unknown ({} samples)'.format(len(preds_unknown)))

        plt.title("Top1 probability distribution for Known vs. Unknown")
        plt.legend(loc='upper right')
        plt.savefig(hist_file)
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close()

        print("Hist at: {}".format(hist_file))
示例#2
0
    def calc_save_prelast_activations(self, prelast_activations_file_name):
        """Pickle the pre-last dense activations of the known data set.

        The pickled object is the tuple ``(known_classes, known_activations)``
        taken from the first and third values returned by
        ``cm.get_prelast_dense_activations``.

        Args:
            prelast_activations_file_name: Path of the pickle file to write.
                It is opened in ``'wb'`` mode, so an existing file is
                replaced.
        """
        # Only known-data activations are saved, so no iterator is built for
        # the unknown data folder.
        known_data_iterator = cm.get_data_iterator(self.known_data_folder,
                                                   self.target_size,
                                                   is_categorical=True)

        (known_classes, _,
         known_activations) = cm.get_prelast_dense_activations(
             self.model, known_data_iterator, is_categorical=True)

        # ``with`` closes the file even when pickling raises.
        with open(prelast_activations_file_name, 'wb') as act_file:
            pickle.dump((known_classes, known_activations), act_file)
        print("Results saved to file {}".format(prelast_activations_file_name))
示例#3
0
    def calc_save_last_activations(self, last_activations_file_name):
        """Pickle the model predictions for the known and unknown data sets.

        The pickled object is the tuple ``(known_preds, unknown_preds)``,
        each element being the value returned by ``cm.get_preds``.

        Args:
            last_activations_file_name: Path of the pickle file to write.
                It is opened in ``'wb'`` mode, so an existing file is
                replaced.
        """
        # make sure model is loaded
        self.__load_model()

        known_data_iterator = cm.get_data_iterator(self.known_data_folder,
                                                   self.target_size,
                                                   is_categorical=True)
        unknown_data_iterator = cm.get_data_iterator(self.unknown_data_folder,
                                                     self.target_size,
                                                     is_categorical=False)

        started = datetime.now()
        known_preds = cm.get_preds(self.model, known_data_iterator)
        print("Got known predictions in {} sec".format(
            (datetime.now() - started).total_seconds()))

        started = datetime.now()
        unknown_preds = cm.get_preds(self.model, unknown_data_iterator)
        print("Got unknown predictions in {} sec".format(
            (datetime.now() - started).total_seconds()))

        # ``with`` closes the file even when pickling raises.
        with open(last_activations_file_name, 'wb') as act_file:
            pickle.dump((known_preds, unknown_preds), act_file)
        print("Results saved to file {}".format(last_activations_file_name))
示例#4
0
    def __process_leaf_folder(self, meansigmas_dic, data_folder,
                              distances_file_name, is_categorical):
        """Append distances from every sample to every class to a CSV file.

        For each sample of ``data_folder`` and each class id in
        ``range(len(meansigmas_dic))`` one header-less row is appended:
        ``is_selected, actual, squared Euclidean distance,
        sigma-normalised squared distance, cosine distance``.

        Args:
            meansigmas_dic: Indexable by class id ``0 .. len - 1``; each
                entry is a ``(mus, sigmas)`` pair.
            data_folder: Folder handed to ``cm.get_data_iterator``.
            distances_file_name: CSV file the rows are appended to.
            is_categorical: Forwarded to the ``cm`` helpers. When false,
                ``is_selected`` is written as 0 for every row.
        """
        data_iterator = cm.get_data_iterator(data_folder,
                                             self.target_size,
                                             is_categorical=is_categorical)

        (actual, top1, prelast_activations) = cm.get_prelast_dense_activations(
            self.model, data_iterator, is_categorical=is_categorical)

        class_count = len(meansigmas_dic)
        # ``top1`` is not used for the rows but stays in the zip so that the
        # number of iterated samples is unchanged.
        for i, (sample_actual, _sample_top1,
                sample_prelast_activations) in enumerate(
                    zip(actual, top1, prelast_activations)):

            rows = []
            # Hypothetically, customer chooses each possible product
            for chosen_id in range(class_count):
                (chosen_mus, chosen_sigmas) = meansigmas_dic[chosen_id]
                deviation = sample_prelast_activations - chosen_mus

                # Squared Euclidean distance to the class mean
                dist = np.sum(np.square(deviation))
                # Same, but each dimension measured in sigmas (epsilon keeps
                # zero sigmas from dividing by zero); no square root is taken
                dist_mahalanobis = np.sum(
                    np.square(deviation / (chosen_sigmas + 1e-7)))
                # cosine distance
                dist_cosine = scipy.spatial.distance.cosine(
                    sample_prelast_activations, chosen_mus)

                is_selected = chosen_id == sample_actual if is_categorical else 0
                rows.append(
                    np.hstack([
                        is_selected, sample_actual, dist, dist_mahalanobis,
                        dist_cosine
                    ]))

            # One append per sample instead of one per row: same file
            # content, far fewer open/close cycles.
            if rows:
                pd.DataFrame(data=rows).to_csv(distances_file_name,
                                               header=False,
                                               index=False,
                                               mode='a')

            if i % 100 == 0:
                print("Processed {} files".format(i))
示例#5
0
    def __process_leaf_folder(self, meansigmas_dic, known_or_unknown,
                              data_folder, distances_file_name):
        """Append distances from every sample to its top-1 class to a CSV file.

        For each sample of ``data_folder`` one header-less row is appended:
        ``known_or_unknown, top1, squared Euclidean distance,
        sigma-normalised squared distance, cosine distance``, all measured
        against the statistics of the sample's predicted top-1 class.

        Args:
            meansigmas_dic: Indexable by top-1 class; each entry is a
                ``(mus, sigmas)`` pair.
            known_or_unknown: Value written unchanged into the first column.
            data_folder: Folder handed to ``cm.get_data_iterator``.
            distances_file_name: CSV file the rows are appended to.
        """
        data_iterator = cm.get_data_iterator(data_folder,
                                             self.target_size,
                                             is_categorical=False)

        (_, top1, prelast_activations) = cm.get_prelast_dense_activations(
            self.model, data_iterator, is_categorical=False)

        rows = []
        for i, (sample_top1, sample_prelast_activations) in enumerate(
                zip(top1, prelast_activations)):
            (top1_mus, top1_sigmas) = meansigmas_dic[sample_top1]
            deviation = sample_prelast_activations - top1_mus

            # Squared Euclidean distance to the class mean
            dist = np.sum(np.square(deviation))
            # Same, but each dimension measured in sigmas (epsilon keeps zero
            # sigmas from dividing by zero); no square root is taken
            dist_mahalanobis = np.sum(
                np.square(deviation / (top1_sigmas + 1e-7)))
            # cosine distance
            dist_cosine = scipy.spatial.distance.cosine(
                sample_prelast_activations, top1_mus)

            rows.append(
                np.hstack([
                    known_or_unknown, sample_top1, dist, dist_mahalanobis,
                    dist_cosine
                ]))

            if i % 100 == 0:
                print("Processed {} files".format(i))

        # A row is only five values, so all rows are collected and appended in
        # one call instead of reopening the file for every sample.
        if rows:
            pd.DataFrame(data=rows).to_csv(distances_file_name,
                                           header=False,
                                           index=False,
                                           mode='a')
示例#6
0
    def make_roc_top1(self, roc_file_pattern):
        """Save a ROC graph separating known from unknown data by top-1 probability.

        Known samples are the positive class (label 1), unknown samples the
        negative class (label 0); the score is the value returned by
        ``cm.get_preds_top1``. The threshold giving the best accuracy is
        printed and marked on the graph.

        Args:
            roc_file_pattern: Output file name pattern. It is formatted with
                two positional arguments, the generation timestamp
                (``%Y%m%d %H%M%S``) and the number of known classes, so it
                may contain up to two ``{}`` placeholders.
        """
        # Prepare data generators
        known_data_iterator = cm.get_data_iterator(self.known_data_folder,
                                                   self.target_size,
                                                   is_categorical=True)
        unknown_data_iterator = cm.get_data_iterator(self.unknown_data_folder,
                                                     self.target_size,
                                                     is_categorical=False)

        # Get top 1 probabilities
        print("Getting predictions...")
        started = datetime.now()
        preds_known = cm.get_preds_top1(self.model, known_data_iterator)
        preds_unknown = cm.get_preds_top1(self.model, unknown_data_iterator)
        print("Predictions done in {} sec".format(
            (datetime.now() - started).total_seconds()))

        # Combine known and unknown to same vector
        y_pred = np.concatenate((preds_known, preds_unknown))
        y_true = np.concatenate(
            (np.ones(len(preds_known)), np.zeros(len(preds_unknown))))

        # Calculate ROC
        (fpr, tpr, thresholds) = roc_curve(y_score=y_pred, y_true=y_true)
        roc_auc = auc(fpr, tpr)

        # Find best accuracy: fraction of samples whose "score > threshold"
        # decision matches the known/unknown label, for every ROC threshold.
        # NOTE(review): roc_curve defines its points with "score >= threshold"
        # while this comparison is strict, so the accuracy may belong to a
        # slightly different operating point than (fpr[i], tpr[i]) - confirm
        # which comparison the consumer of the printed threshold uses.
        is_known = y_true == 1
        accuracy_scores = [
            np.mean((y_pred > thresh) == is_known) for thresh in thresholds
        ]
        best_acc_ind = np.argmax(accuracy_scores)
        best_acc = accuracy_scores[best_acc_ind]
        best_threshold = thresholds[best_acc_ind]
        print("Threshold to use = {}".format(best_threshold))

        # Draw ROC
        plt.figure()
        plt.plot(fpr,
                 tpr,
                 color='green',
                 lw=2,
                 label='ROC AUC = %0.2f' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=0.5, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        # Share of unknown products for which recognition is attempted
        plt.xlabel('False Positive Rate')
        # Share of known products for which recognition is attempted
        plt.ylabel('True Positive Rate')
        plt.legend(loc="lower right")

        # Draw a point for best accuracy
        plt.plot(fpr[best_acc_ind], tpr[best_acc_ind], marker="s", color="red")
        plt.text(fpr[best_acc_ind] + 0.02, tpr[best_acc_ind] - 0.02,
                 "Best accuracy: {:.1f}%".format(best_acc * 100))

        # Save
        roc_file = roc_file_pattern.format(
            datetime.now().strftime("%Y%m%d %H%M%S"),
            known_data_iterator.num_classes)
        plt.savefig(roc_file)
        # Release the figure so it neither leaks nor receives later plots.
        plt.close()

        print("ROC at: {}".format(roc_file))
示例#7
0
# For every (version, hierarchy level, set) combination: load the matching
# classifier, build a data iterator over the set's folder, and dump both the
# last-layer and the pre-last-layer activations to their pattern-named files.
for version in Visible_versions:
    version_tag = "v" + str(version)
    for hier in Hier_lvls:
        hier_tag = "Ind-" + str(hier)
        for the_set in Extract_sets:

            # Classifier file is looked up under the key "v<version>_Ind-<hier>".
            # NOTE(review): the model depends only on (version, hier) yet is
            # reloaded for every set; hoisting is only safe if the calc_save_*
            # helpers leave the model untouched - confirm before changing.
            model_file_key = version_tag + "_" + hier_tag
            model_filename = os.path.join(models_path,
                                          tc.clsfs[model_file_key])
            model = load_model(model_filename)

            # Images live under <data_folder>/v<version>/Ind-<hier>/<set>
            set_data_folder = os.path.join(data_folder, version_tag, hier_tag,
                                           the_set)
            data_iter = cm.get_data_iterator(data_folder=set_data_folder,
                                             target_size=target_size,
                                             is_categorical=True,
                                             is_resnet=is_resnet)

            # Output names share the same (version, hier, set) arguments
            name_args = (version, hier, the_set)
            last_activations_filename = last_activations_filepattern.format(
                *name_args)
            prelast_activations_filename = prelast_activations_filepattern.format(
                *name_args)

            # Last-layer activations first, then pre-last, as before
            calc_save_last_activations(model, data_iter,
                                       last_activations_filename)
            calc_save_prelast_activations(model, data_iter,
                                          prelast_activations_filename)
示例#8
0
    def make_conf_mat(self, conf_mat_pattern, products_names_file):
        """Save an annotated confusion-matrix heatmap for ``self.data_folder``.

        Args:
            conf_mat_pattern: Output file name pattern. It is formatted with
                two positional arguments, the generation timestamp
                (``%Y%m%d %H%M%S``) and the number of classes, so it may
                contain up to two ``{}`` placeholders.
            products_names_file: ``None``, or a header-less CSV file with
                columns ``[name, barcode, ...]``. When given, tick labels show
                the product name (cut to 15 characters) instead of the
                barcode; a barcode missing from the file keeps the barcode as
                its label.
        """
        # Smallest figure side in inches; the size otherwise grows by one
        # inch per 15 classes.
        min_side_inches = 4.0

        # Prepare data generator
        data_iterator = cm.get_data_iterator(self.data_folder,
                                             self.target_size,
                                             is_categorical=True)

        # Predict highest classes
        (y_pred, y_true) = cm.get_pred_actual_classes(self.model,
                                                      data_iterator)

        # Class folder names are barcodes
        prod_names = list(data_iterator.class_indices.keys())
        print("sample barcodes {} (tot: {})".format(prod_names[:2],
                                                    len(prod_names)))

        # Replace barcodes with product names, if names passed
        if products_names_file is not None:
            df_products = pd.read_csv(products_names_file,
                                      header=None,
                                      dtype=str)
            # First occurrence of a barcode wins, as with the former
            # ``.values[0]`` lookup.
            name_by_barcode = {}
            for name, barcode in zip(df_products[0], df_products[1]):
                name_by_barcode.setdefault(barcode, name)
            prod_names = [
                name_by_barcode.get(barcode, barcode) for barcode in prod_names
            ]
            print("sample products {} (tot: {})".format(
                prod_names[:2], len(prod_names)))

            # Shorten to 15 characters
            prod_names = [prod[0:15] for prod in prod_names]

        class_count = len(prod_names)

        # result confusion matrix file
        conf_mat_file = conf_mat_pattern.format(
            datetime.now().strftime("%Y%m%d %H%M%S"),
            data_iterator.num_classes)

        # Passing every class index as ``labels`` keeps the matrix square even
        # when some class has no images, without adding made-up samples that
        # would show up as correct predictions.
        conf_mat = confusion_matrix(y_true=y_true,
                                    y_pred=y_pred,
                                    labels=np.arange(class_count))
        print("Shape: {}".format(conf_mat.shape))

        # Draw confusion matrix on a square figure; the lower bound avoids a
        # zero-sized figure when there are fewer than 15 classes.
        side = max(class_count / 15, min_side_inches)
        plt.figure(figsize=(side, side), dpi=80)
        ax = sns.heatmap(conf_mat,
                         annot=True,
                         cbar=False,
                         annot_kws={'size': 5},
                         fmt='g')

        # Ticks at cell centres, one per class
        tick_positions = np.arange(class_count) + 0.5
        ax.set_xticks(tick_positions)
        ax.set_yticks(tick_positions)

        ax.set_yticklabels(prod_names,
                           horizontalalignment='right',
                           rotation=0,
                           size=5)
        ax.set_xticklabels(prod_names,
                           horizontalalignment='right',
                           rotation=90,
                           size=5)

        ax.set_xlabel("PREDICTED", weight="bold")
        ax.set_ylabel("ACTUAL", weight="bold")
        plt.tight_layout()
        plt.savefig(conf_mat_file)
        plt.close()

        print("Conf mat at: {}".format(conf_mat_file))