Example No. 1
    def predict(self, texts, output_format='json', use_main_thread_only=False, batch_size=None):
        # transformer-based input preparation is needed whenever a transformer is configured
        bert_data = self.transformer_name is not None

        if batch_size is not None:
            self.model_config.batch_size = batch_size
            print("---")
            print("batch_size (prediction):", self.model_config.batch_size)
            print("---")

        if self.model_config.fold_number == 1:
            if self.model is not None:
                
                predict_generator = DataGenerator(texts, None, batch_size=self.model_config.batch_size, 
                    maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes, 
                    embeddings=self.embeddings, shuffle=False, bert_data=bert_data, transformer_tokenizer=self.model.transformer_tokenizer)

                result = self.model.predict(predict_generator, use_main_thread_only=use_main_thread_only)
            else:
                raise OSError('Could not find a model.')
        else:            
            if self.models is not None:

                # just a warning: n classifiers using a BERT layer for prediction might be heavy in terms of model size
                predict_generator = DataGenerator(texts, None, batch_size=self.model_config.batch_size, 
                    maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes, 
                    embeddings=self.embeddings, shuffle=False, bert_data=bert_data,
                    # self.model may be unset in the fold branch, so use the first fold model's tokenizer
                    transformer_tokenizer=self.models[0].transformer_tokenizer)

                result = predict_folds(self.models, 
                                       predict_generator, 
                                       self.model_config, 
                                       self.training_config, 
                                       use_main_thread_only=use_main_thread_only)
            else:
                raise OSError('Could not find nfolds models.')
        if output_format == 'json':
            res = {
                "software": "DeLFT",
                "date": datetime.datetime.now().isoformat(),
                "model": self.model_config.model_name,
                "classifications": []
            }
            for text, scores in zip(texts, result):
                classification = {"text": text}
                for cl, score in zip(self.model_config.list_classes, scores):
                    classification[cl] = float(score)
                res["classifications"].append(classification)
            return res
        else:
            return result
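For context, here is a minimal usage sketch for this predict() method. It is a sketch under assumptions: the Classifier import path, its load() method, and the 'toxic' model name reflect typical DeLFT usage and are not taken from the snippet above.

# Hypothetical usage sketch; the import path and model name are assumptions.
from delft.textClassification import Classifier

classifier = Classifier('toxic')    # assumed name of a previously trained model
classifier.load()                   # restore the trained weights
res = classifier.predict(["This is rubbish!", "Have a nice day."],
                         output_format='json', batch_size=32)
print(res["classifications"][0])    # per-class scores for the first text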
Example No. 2
    def predict(self, texts, output_format='json', use_main_thread_only=False):
        if self.model_config.fold_number == 1:
            if self.model is not None:
                # BERT model?
                if "bert" in self.model_config.model_type:
                    # be sure the input processor is instantiated
                    self.model.processor = BERT_classifier_processor(labels=self.model_config.list_classes)
                    result = self.model.predict(texts)
                else:
                    predict_generator = DataGenerator(texts, None, batch_size=self.model_config.batch_size, 
                        maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes, 
                        embeddings=self.embeddings, shuffle=False)

                    result = predict(self.model, predict_generator,
                                     use_ELMo=self.embeddings.use_ELMo,
                                     use_BERT=self.embeddings.use_BERT,
                                     use_main_thread_only=use_main_thread_only)
            else:
                raise OSError('Could not find a model.')
        else:            
            # BERT model?
            if "bert" in self.model_config.model_type:
                # we do not currently support n classifiers for BERT prediction
                # (loading the model from file 10 times for each batch would be too large and too slow;
                # for eval the models are loaded once for the complete dataset, not once per batch,
                # and we should do the same here)
                # be sure the input processor is instantiated
                self.model.processor = BERT_classifier_processor(labels=self.model_config.list_classes)
                #result = self.models[0].predict(texts)
                result = self.model.predict(texts)
            else:
                if self.models is not None: 
                    predict_generator = DataGenerator(texts, None, batch_size=self.model_config.batch_size, 
                        maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes, 
                        embeddings=self.embeddings, shuffle=False)

                    result = predict_folds(self.models, predict_generator,
                                           use_ELMo=self.embeddings.use_ELMo,
                                           use_BERT=self.embeddings.use_BERT,
                                           use_main_thread_only=use_main_thread_only)
                else:
                    raise OSError('Could not find nfolds models.')
        if output_format == 'json':
            res = {
                "software": "DeLFT",
                "date": datetime.datetime.now().isoformat(),
                "model": self.model_config.model_name,
                "classifications": []
            }
            for text, scores in zip(texts, result):
                classification = {"text": text}
                for cl, score in zip(self.model_config.list_classes, scores):
                    classification[cl] = float(score)
                res["classifications"].append(classification)
            return res
        else:
            return result
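For reference, the dict assembled by the 'json' branch has the following shape; the class names and score values below are illustrative, not real output:

# Illustrative result shape; class names and scores are made up.
{
    "software": "DeLFT",
    "date": "2019-01-01T12:00:00.000000",
    "model": "toxic",
    "classifications": [
        {"text": "This is rubbish!", "toxic": 0.92, "insult": 0.85},
        {"text": "Have a nice day.", "toxic": 0.03, "insult": 0.01}
    ]
}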
Example No. 3
    def predict(self, texts, output_format='json'):
        if self.model_config.fold_number == 1:
            if self.model is not None:
                predict_generator = DataGenerator(
                    texts,
                    None,
                    batch_size=self.model_config.batch_size,
                    maxlen=self.model_config.maxlen,
                    list_classes=self.model_config.list_classes,
                    embeddings=self.embeddings,
                    shuffle=False)

                result = predict(self.model,
                                 predict_generator,
                                 use_ELMo=self.embeddings.use_ELMo,
                                 use_BERT=self.embeddings.use_BERT)
            else:
                raise OSError('Could not find a model.')
        else:
            if self.models is not None:
                predict_generator = DataGenerator(
                    texts,
                    None,
                    batch_size=self.model_config.batch_size,
                    maxlen=self.model_config.maxlen,
                    list_classes=self.model_config.list_classes,
                    embeddings=self.embeddings,
                    shuffle=False)

                result = predict_folds(self.models,
                                       predict_generator,
                                       use_ELMo=self.embeddings.use_ELMo,
                                       use_BERT=self.embeddings.use_BERT)
            else:
                raise OSError('Could not find nfolds models.')
        if output_format == 'json':
            res = {
                "software": "DeLFT",
                "date": datetime.datetime.now().isoformat(),
                "model": self.model_config.model_name,
                "classifications": []
            }
            for text, scores in zip(texts, result):
                classification = {"text": text}
                for cl, score in zip(self.model_config.list_classes, scores):
                    classification[cl] = float(score)
                res["classifications"].append(classification)
            return res
        else:
            return result
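When output_format is anything other than 'json', the raw score matrix is returned as-is. A small sketch of consuming it directly; here classifier and texts are assumed to exist, and 'array' is just an arbitrary non-'json' value:

# Sketch: pairing the raw score matrix with the configured class names.
scores = classifier.predict(texts, output_format='array')
for text, row in zip(texts, scores):
    best_class = classifier.model_config.list_classes[row.argmax()]
    print(text, "->", best_class)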
Example No. 4
    def eval(self, x_test, y_test, use_main_thread_only=False):
        if self.model_config.fold_number == 1:
            if self.model is not None:
                # BERT model?
                if "bert" in self.model_config.model_type:
                    #self.model.eval(x_test, y_test)
                    result = self.model.predict(x_test)
                else:
                    test_generator = DataGenerator(x_test, None, batch_size=self.model_config.batch_size, 
                        maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes, 
                        embeddings=self.embeddings, shuffle=False)

                    result = predict(self.model, test_generator,
                                     use_ELMo=self.embeddings.use_ELMo,
                                     use_BERT=self.embeddings.use_BERT,
                                     use_main_thread_only=use_main_thread_only)
            else:
                raise OSError('Could not find a model.')
        else:
            if self.models is not None or ("bert" in self.model_config.model_type and self.model is not None):
                # BERT model?
                print(self.model_config.model_type)
                if "bert" in self.model_config.model_type:
                    result_list = []
                    for i in range(self.model_config.fold_number):
                        result = self.model.predict(x_test, i)
                        result_list.append(result)

                    # combine the fold predictions with a geometric mean
                    result = np.ones(result_list[0].shape)
                    for fold_result in result_list:
                        result *= fold_result

                    result **= (1. / len(result_list))
                else:
                    test_generator = DataGenerator(x_test, None, batch_size=self.model_config.batch_size, 
                        maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes, 
                        embeddings=self.embeddings, shuffle=False)
                    result = predict_folds(self.models, test_generator,
                                           use_ELMo=self.embeddings.use_ELMo,
                                           use_BERT=self.embeddings.use_BERT,
                                           use_main_thread_only=use_main_thread_only)
            else:
                raise OSError('Could not find nfolds models.')
        print("-----------------------------------------------")
        print("\nEvaluation on", x_test.shape[0], "instances:")

        total_accuracy = 0.0
        total_f1 = 0.0
        total_loss = 0.0
        total_roc_auc = 0.0

        '''
        def normer(t):
            if t < 0.5: 
                return 0 
            else: 
                return 1
        vfunc = np.vectorize(normer)
        result_binary = vfunc(result)
        '''
        # argmax class index per instance
        result_intermediate = np.asarray([np.argmax(line) for line in result])
        
        def vectorize(index, size):
            result = np.zeros(size)
            if index < size:
                result[index] = 1
            return result
        result_binary = np.array([vectorize(xi, len(self.model_config.list_classes))
                                  for xi in result_intermediate])

        precision, recall, fscore, support = precision_recall_fscore_support(y_test, result_binary, average=None)
        print('{:>14}  {:>12}  {:>12}  {:>12}  {:>12}'.format(" ", "precision", "recall", "f-score", "support"))
        for p, the_class in enumerate(self.model_config.list_classes):
            the_class = the_class[:14]
            print('{:>14}  {:>12}  {:>12}  {:>12}  {:>12}'.format(the_class, "{:10.4f}"
                .format(precision[p]), "{:10.4f}".format(recall[p]), "{:10.4f}".format(fscore[p]), support[p]))

        # macro-average (average of class scores)
        # we distinguish 1-class and multiclass problems 
        if len(self.model_config.list_classes) == 1:
            total_accuracy = accuracy_score(y_test, result_binary)
            total_f1 = f1_score(y_test, result_binary)
            total_loss = log_loss(y_test, result, labels=[0,1])
            if len(np.unique(y_test)) == 1:
                # sklearn's roc_auc_score does not work when y_test contains a single class
                # (it needs more balanced batches); a simple fix is to return the r2_score
                # instead (a regression score, not a loss), floored at 0
                total_roc_auc = r2_score(y_test, result)
                if total_roc_auc < 0:
                    total_roc_auc = 0 
            else:
                total_roc_auc = roc_auc_score(y_test, result)
        else:
            for j in range(len(self.model_config.list_classes)):
                accuracy = accuracy_score(y_test[:, j], result_binary[:, j])
                total_accuracy += accuracy
                f1 = f1_score(y_test[:, j], result_binary[:, j], average='micro')
                total_f1 += f1
                loss = log_loss(y_test[:, j], result[:, j], labels=[0,1])
                total_loss += loss
                if len(np.unique(y_test[:, j])) == 1:
                    # sklearn's roc_auc_score does not work when y_test contains a single class
                    # (it needs more balanced batches); a simple fix is to return the r2_score
                    # instead (a regression score, not a loss), floored at 0
                    roc_auc = r2_score(y_test[:, j], result[:, j])
                    if roc_auc < 0:
                        roc_auc = 0 
                else:
                    roc_auc = roc_auc_score(y_test[:, j], result[:, j])
                total_roc_auc += roc_auc
                '''
                print("\nClass:", self.model_config.list_classes[j])
                print("\taccuracy at 0.5 =", accuracy)
                print("\tf-1 at 0.5 =", f1)
                print("\tlog-loss =", loss)
                print("\troc auc =", roc_auc)
                '''

        total_accuracy /= len(self.model_config.list_classes)
        total_f1 /= len(self.model_config.list_classes)
        total_loss /= len(self.model_config.list_classes)
        total_roc_auc /= len(self.model_config.list_classes)

        '''
        if len(self.model_config.list_classes) != 1:
            print("\nMacro-average:")
        print("\taverage accuracy at 0.5 =", "{:10.4f}".format(total_accuracy))
        print("\taverage f-1 at 0.5 =", "{:10.4f}".format(total_f1))
        print("\taverage log-loss =","{:10.4f}".format( total_loss))
        print("\taverage roc auc =", "{:10.4f}".format(total_roc_auc))
        '''
        
        # micro-average (average of scores for each instance)
        # only makes sense if we have more than one class, otherwise it is
        # the same as the macro-average
        if len(self.model_config.list_classes) != 1:
            total_accuracy = 0.0
            total_f1 = 0.0
            total_loss = 0.0
            total_roc_auc = 0.0

            for i in range(result.shape[0]):
                accuracy = accuracy_score(y_test[i,:], result_binary[i,:])
                total_accuracy += accuracy
                f1 = f1_score(y_test[i,:], result_binary[i,:], average='micro')
                total_f1 += f1
                loss = log_loss(y_test[i,:], result[i,:])
                total_loss += loss
                roc_auc = roc_auc_score(y_test[i,:], result[i,:])
                total_roc_auc += roc_auc

            total_accuracy /= result.shape[0]
            total_f1 /= result.shape[0]
            total_loss /= result.shape[0]
            total_roc_auc /= result.shape[0]

            '''
            print("\nMicro-average:")
            print("\taverage accuracy at 0.5 =", "{:10.4f}".format(total_accuracy))
            print("\taverage f-1 at 0.5 =", "{:10.4f}".format(total_f1))
            print("\taverage log-loss =", "{:10.4f}".format(total_loss))
            print("\taverage roc auc =", "{:10.4f}".format(total_roc_auc))
            '''
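The fold branch above combines the per-fold probability matrices with a geometric mean: multiply the matrices elementwise, then take the n-th root. A standalone numpy sketch of that combination step, on made-up data:

import numpy as np

# Three hypothetical fold predictions for one instance with two classes.
fold_results = [np.array([[0.9, 0.1]]),
                np.array([[0.6, 0.4]]),
                np.array([[0.8, 0.2]])]

combined = np.ones(fold_results[0].shape)
for fold_result in fold_results:
    combined *= fold_result               # elementwise product over folds
combined **= 1.0 / len(fold_results)      # n-th root -> geometric mean
print(combined)                           # approx. [[0.756 0.2]]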
Example No. 5
    def eval(self, x_test, y_test):
        if self.model_config.fold_number == 1:
            if self.model is not None:
                test_generator = DataGenerator(
                    x_test,
                    None,
                    batch_size=self.model_config.batch_size,
                    maxlen=self.model_config.maxlen,
                    list_classes=self.model_config.list_classes,
                    embeddings=self.embeddings,
                    shuffle=False)

                result = predict(self.model,
                                 test_generator,
                                 use_ELMo=self.embeddings.use_ELMo,
                                 use_BERT=self.embeddings.use_BERT)
            else:
                raise OSError('Could not find a model.')
        else:
            if self.models is not None:
                test_generator = DataGenerator(
                    x_test,
                    None,
                    batch_size=self.model_config.batch_size,
                    maxlen=self.model_config.maxlen,
                    list_classes=self.model_config.list_classes,
                    embeddings=self.embeddings,
                    shuffle=False)

                result = predict_folds(self.models,
                                       test_generator,
                                       use_ELMo=self.embeddings.use_ELMo,
                                       use_BERT=self.embeddings.use_BERT)
            else:
                raise OSError('Could not find nfolds models.')
        print("-----------------------------------------------")
        print("\nEvaluation on", x_test.shape[0], "instances:")

        total_accuracy = 0.0
        total_f1 = 0.0
        total_loss = 0.0
        total_roc_auc = 0.0

        # binarize the scores at the 0.5 threshold
        result_binary = (result >= 0.5).astype(int)

        # macro-average (average of class scores)
        # we distinguish 1-class and multiclass problems
        if len(self.model_config.list_classes) == 1:
            total_accuracy = accuracy_score(y_test, result_binary)
            total_f1 = f1_score(y_test, result_binary)
            total_loss = log_loss(y_test, result)
            total_roc_auc = roc_auc_score(y_test, result)
        else:
            for j in range(len(self.model_config.list_classes)):
                accuracy = accuracy_score(y_test[:, j], result_binary[:, j])
                total_accuracy += accuracy
                f1 = f1_score(y_test[:, j],
                              result_binary[:, j],
                              average='micro')
                total_f1 += f1
                loss = log_loss(y_test[:, j], result[:, j])
                total_loss += loss
                roc_auc = roc_auc_score(y_test[:, j], result[:, j])
                total_roc_auc += roc_auc
                print("\nClass:", self.model_config.list_classes[j])
                print("\taccuracy at 0.5 =", accuracy)
                print("\tf-1 at 0.5 =", f1)
                print("\tlog-loss =", loss)
                print("\troc auc =", roc_auc)

        total_accuracy /= len(self.model_config.list_classes)
        total_f1 /= len(self.model_config.list_classes)
        total_loss /= len(self.model_config.list_classes)
        total_roc_auc /= len(self.model_config.list_classes)

        if len(self.model_config.list_classes) != 1:
            print("\nMacro-average:")
        print("\taverage accuracy at 0.5 =", "{:10.4f}".format(total_accuracy))
        print("\taverage f-1 at 0.5 =", "{:10.4f}".format(total_f1))
        print("\taverage log-loss =", "{:10.4f}".format(total_loss))
        print("\taverage roc auc =", "{:10.4f}".format(total_roc_auc))

        # micro-average (average of scores for each instance)
        # only makes sense if we have more than one class, otherwise it is
        # the same as the macro-average
        if len(self.model_config.list_classes) != 1:
            total_accuracy = 0.0
            total_f1 = 0.0
            total_loss = 0.0
            total_roc_auc = 0.0

            for i in range(result.shape[0]):
                accuracy = accuracy_score(y_test[i, :], result_binary[i, :])
                total_accuracy += accuracy
                f1 = f1_score(y_test[i, :],
                              result_binary[i, :],
                              average='micro')
                total_f1 += f1
                loss = log_loss(y_test[i, :], result[i, :])
                total_loss += loss
                roc_auc = roc_auc_score(y_test[i, :], result[i, :])
                total_roc_auc += roc_auc

            total_accuracy /= result.shape[0]
            total_f1 /= result.shape[0]
            total_loss /= result.shape[0]
            total_roc_auc /= result.shape[0]

            print("\nMicro-average:")
            print("\taverage accuracy at 0.5 =",
                  "{:10.4f}".format(total_accuracy))
            print("\taverage f-1 at 0.5 =", "{:10.4f}".format(total_f1))
            print("\taverage log-loss =", "{:10.4f}".format(total_loss))
            print("\taverage roc auc =", "{:10.4f}".format(total_roc_auc))