Example #1
    def _train_model(self,
                     model: Sequential = None,
                     train_dataset: Dataset = None,
                     validation_dataset: Dataset = None,
                     n_epochs: int = None):
        model = model or self.model
        train_dataset = train_dataset or self.train_dataset
        validation_dataset = validation_dataset or self.validation_dataset
        n_epochs = n_epochs or self.config.NUM_EPOCHS
        validate_variables(model, train_dataset, validation_dataset, n_epochs)

        if isinstance(model, TFDistilBertForSequenceClassification):
            model.fit(train_dataset.batch(self.config.BATCH_SIZE),
                      validation_data=validation_dataset.batch(
                          self.config.BATCH_SIZE),
                      epochs=n_epochs)
        else:
            model.fit(train_dataset,
                      validation_data=validation_dataset,
                      epochs=n_epochs)

        self.model = model

        self.logger.info("Successfully trained tf model")
        return model
Example #2
    def get_modeling_pipeline(self,
                              img_size: Tuple[int, int] = None,
                              learning_rate: float = None,
                              metrics: List[str] = None,
                              n_epochs: int = None,
                              train_dataset: Dataset = None,
                              validation_dataset: Dataset = None):
        img_size = img_size or self.img_size
        learning_rate = learning_rate or self.config.LEARNING_RATE
        metrics = metrics or self.config.METRICS
        n_epochs = n_epochs or self.config.NUM_EPOCHS
        train_dataset = train_dataset or self.train_dataset
        validation_dataset = validation_dataset or self.validation_dataset

        processing_pipeline = self.get_processing_pipeline()
        model = self.get_model()

        validate_variables(img_size, learning_rate, metrics, n_epochs,
                           processing_pipeline, model)

        modeling_pipeline = tf.keras.Sequential([
            tf.keras.Input(shape=img_size + (3, )),
            processing_pipeline,
            model,
            tf.keras.layers.GlobalAveragePooling2D(),
            tf.keras.layers.Dense(len(self.config.DISEASES),
                                  activation="softmax")
        ])

        # TODO: Add weighted Adam
        model = self._compile_model(modeling_pipeline, learning_rate, metrics)
        modeling_pipeline = self._train_model(model, train_dataset,
                                              validation_dataset, n_epochs)

        self.modeling_pipeline = modeling_pipeline
        self.logger.info("Successfully loaded modeling pipeline")
        return modeling_pipeline
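_compile_model is called above but not shown among these examples. A minimal sketch of what it might do, assuming integer class labels and a plain Adam optimizer (the TODO above notes that weighted Adam is still pending); the loss choice is an assumption, not the repo's confirmed implementation:

    def _compile_model(self, model: Sequential, learning_rate: float,
                       metrics: List[str]) -> Sequential:
        # Sketch only: optimizer and loss are assumptions
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            loss="sparse_categorical_crossentropy",
            metrics=metrics)
        return model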
Example #3
    def get_best_modeling_pipeline_type(self,
                                        transformer_modeling_pipeline: TransformersModelingPipeline = None,
                                        sklearn_modeling_pipeline: Pipeline = None,
                                        x_test: DataFrame = None,
                                        y_test: Series = None,
                                        test_dataset: Dataset = None)\
            -> Union[Pipeline, TransformersModelingPipeline]:
        x_test, y_test = self._set_dfs_test(x_test, y_test)
        test_dataset = test_dataset or self.test_dataset
        validate_variables(transformer_modeling_pipeline,
                           sklearn_modeling_pipeline, x_test, y_test,
                           test_dataset)

        transformer_results_metric = (transformer_modeling_pipeline.evaluate(
            test_dataset, batch_size=self.config.BATCH_SIZE))[1]

        # TODO: Fix this to use specified metric
        sklearn_predictions = sklearn_modeling_pipeline.predict(x_test)
        sklearn_results_metric = self.scoring_function(y_test,
                                                       sklearn_predictions)

        if transformer_results_metric > sklearn_results_metric:
            modeling_pipeline = transformer_modeling_pipeline
        else:
            modeling_pipeline = sklearn_modeling_pipeline

        self.modeling_pipeline = modeling_pipeline

        self.logger.info("Successfully found best modeling pipeline type")
        return modeling_pipeline
Example #4
    def _load_dataset(self,
                      batch_size: int = None,
                      data_path: Path = None) -> Tuple[Dataset, Dataset]:
        """
        Utility function for loading a tensorflow dataset from the provided directory. The data is split into train and validation subsets
        :param batch_size: A batch size for the datasets
        :param data_path: Path to data directory. Files should be organized for tensorflow's text_dataset_from_directory
        :return: Returns a tuple with train and validation datasets
        """
        batch_size = batch_size or self.config.BATCH_SIZE
        data_path = data_path or self.config.DATA_PATH
        validate_variables(batch_size, data_path)

        train_dataset = tf.keras.preprocessing.text_dataset_from_directory(
            directory=data_path,
            validation_split=self.config.TEST_SIZE,
            batch_size=batch_size,
            subset="training",
            seed=self.config.SEED,
            shuffle=True)

        validation_dataset = tf.keras.preprocessing.text_dataset_from_directory(
            directory=data_path,
            validation_split=self.config.TEST_SIZE,
            batch_size=batch_size,
            subset="validation",
            seed=self.config.SEED,
            shuffle=True)

        self.train_dataset = train_dataset
        self.validation_dataset = validation_dataset

        self.logger.info(f"Successfully loaded train and validation datasets ")
        return train_dataset, validation_dataset
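The tensorflow preprocessing base class below (Example #24) carries a `prefetch` flag, which suggests the loaded datasets may later be wrapped with tf.data prefetching. A hedged sketch of such a step, assuming AUTOTUNE-based buffering; `maybe_prefetch` is a hypothetical helper name:

import tensorflow as tf


def maybe_prefetch(dataset: tf.data.Dataset, prefetch: bool) -> tf.data.Dataset:
    # Hypothetical helper: cache decoded examples and prefetch batches so
    # input I/O overlaps with training
    if prefetch:
        dataset = dataset.cache().prefetch(tf.data.AUTOTUNE)
    return dataset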
Example #5
    def _split_train_test_structured(self,
                                     x: DataFrame = None, y: Series = None,
                                     test_size: float = 0.2, random_state: int = 42)\
            -> Tuple[DataFrame, DataFrame, Series, Series]:
        # Explicit None checks: pandas objects do not support plain truthiness,
        # so `x or self.x` would raise for a DataFrame
        if x is None:
            x = self.x
        if y is None:
            y = self.y
        test_size = test_size or self.config.TEST_SIZE
        random_state = random_state or self.config.SEED
        validate_variables(x, y, test_size, random_state)

        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=test_size, random_state=random_state)

        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test

        self.logger.info("Successfully splat train and test data")
        return x_train, x_test, y_train, y_test
Example #6
    def _get_avg_img_size(self, path: Path = None) -> Tuple[int, int]:
        """
        Utility function to get the average image size of your data, needed to choose the proper EfficientNet version
        :param path: Path to data files
        :return: Returns a tuple with mean image size from provided image data
        """
        path = path or self.config.DATA_PATH
        validate_variables(path)

        height_list = []
        width_list = []
        for subclass_dir in path.iterdir():
            for img_path in subclass_dir.iterdir():
                img = cv2.imread(str(img_path))
                height, width, _ = img.shape
                height_list.append(height)
                width_list.append(width)
        mean_height = int(sum(height_list) / len(height_list))
        mean_width = int(sum(width_list) / len(width_list))

        self.img_size = (mean_height, mean_width)

        self.logger.info(
            f"Mean height is: {mean_height}, mean width is: {mean_width}")
        return self.img_size
Example #7
    def _get_efficientnet_and_size(
            self,
            img_size: Tuple[int, int] = None
    ) -> Tuple[Tuple[int, int], Sequential]:
        """
        Utility function to get proper type of EfficientNet, the version is chosen based on the mean image size
        More on: https://keras.io/examples/vision/image_classification_efficientnet_fine_tuning/
        :param img_size: Mean input size of image files
        :return: Returns a tuple of the image size snapped to the matching static input size and the EfficientNet class
        """
        img_size = img_size or self.img_size
        validate_variables(img_size)

        # Collapse (height, width) into a single mean dimension
        mean_size = (img_size[0] + img_size[1]) / 2

        # Thresholds are midpoints between EfficientNet input sizes:
        # (600 + 528) / 2 = 564 and (528 + 456) / 2 = 492
        if mean_size > 564:
            img_size = (600, 600)
            model = tf.keras.applications.EfficientNetB7
        elif mean_size > 492:
            img_size = (528, 528)
            model = tf.keras.applications.EfficientNetB6
        else:
            img_size = (456, 456)
            model = tf.keras.applications.EfficientNetB5

        self.img_size = img_size
        self.model = model

        self.logger.info(f"Chosen model is {model} with img_size {img_size}")
        return img_size, model
Example #8
    def _make_sklearn_prediction(self, data: dict) -> Tuple[np.ndarray, str]:
        """
        Utility function for making predictions with sklearn
        :param data: Data to make predictions on
        :return: Returns a tuple of class probabilities and str with class prediction
        """
        modeling_pipeline = self.modeling_pipeline or self.load_pipeline(
            self.backend, self.path)
        validate_variables(modeling_pipeline, data)

        prediction = modeling_pipeline.predict_proba(data)[0]
        max_pred_idx = np.argmax(prediction)
        prediction_proba = prediction[max_pred_idx]

        try:
            map_ = self.config.LABEL_MAPPING
        except AttributeError:
            map_ = self.config.DISEASES
        prediction_string = map_[max_pred_idx]

        self.logger.info(
            f"Made predictions with model version: {dermclass_models_version} "
            f"Inputs: {data} "
            f"Prediction: {prediction_string} "
            f"Probability: {prediction_proba}")
        return prediction_proba, prediction_string
Example #9
    def _hyper_param_optimization(trial, model_name: str, trial_func: Trial,
                                  max_overfit: float, cv: int,
                                  x_train: DataFrame, x_test: DataFrame,
                                  y_train: Series, y_test: Series):
        validate_variables(trial, model_name, trial_func, max_overfit, cv,
                           x_train, x_test, y_train, y_test)

        # Resolve the model class from its name; this assumes the class is
        # importable in this module's namespace
        model_obj = eval(model_name)
        cv_score = np.mean(
            cross_val_score(model_obj(**trial_func(trial)),
                            x_train,
                            y_train,
                            scoring="accuracy",
                            cv=cv))

        model = model_obj(**trial_func(trial))
        model.fit(x_train, y_train)

        train_score = accuracy_score(y_train, model.predict(x_train))
        test_score = accuracy_score(y_test, model.predict(x_test))
        if abs(train_score - test_score) > max_overfit:
            output = 0
        else:
            output = cv_score
        return output
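This objective takes an Optuna Trial as its first argument, so it is presumably wired into a study via functools.partial. A minimal, self-contained sketch under that assumption (the function is shown without self above, so it is called directly here); `xgb_trial_params` and the synthetic data are illustrative only:

from functools import partial

import optuna
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split


def xgb_trial_params(trial):
    # Hypothetical search space matching the trial_func parameter above
    return {"n_estimators": trial.suggest_int("n_estimators", 50, 300),
            "max_depth": trial.suggest_int("max_depth", 2, 8)}


x, y = make_classification(n_samples=500, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

# Note: eval(model_name) inside the objective requires XGBClassifier to be
# importable in the module that defines _hyper_param_optimization
objective = partial(_hyper_param_optimization,
                    model_name="XGBClassifier",
                    trial_func=xgb_trial_params,
                    max_overfit=0.1,
                    cv=5,
                    x_train=x_train, x_test=x_test,
                    y_train=y_train, y_test=y_test)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)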
Example #10
    def __init__(self, config: StructuredConfig = StructuredConfig):
        """
        A class for preprocessing structured data
        :param config: Config object for the class
        """
        validate_variables(config)
        super().__init__(config)
Example #11
    def _make_tf_prediction(self,
                            data: np.ndarray,
                            diseases: List[str] = None) -> Tuple[np.ndarray, str]:
        """
        Utility function for making predictions with tensorflow
        :param data: Data to make predictions in numpy ndarray format
        :param diseases: A list of diseases names in proper order
        :return: Returns a tuple of class probabilities and str with class prediction
        """
        modeling_pipeline = self.modeling_pipeline or self.load_pipeline(
            self.backend, self.path)
        diseases = diseases or self.config.DISEASES
        validate_variables(modeling_pipeline, data, diseases)

        prediction = modeling_pipeline.predict(data)[0]
        max_pred_idx = np.argmax(prediction)
        prediction_proba = prediction[max_pred_idx]

        try:
            map_ = self.config.LABEL_MAPPING
        except AttributeError:
            map_ = self.config.DISEASES
        prediction_string = map_[max_pred_idx]

        self.logger.info(
            f"Made predictions with model version: {dermclass_models_version} "
            f"Inputs: {data} "
            f"Prediction: {prediction_string} "
            f"Probability: {prediction_proba}")
        return prediction_proba, prediction_string
Example #12
    def save_pipeline(self,
                      pipeline_object: Union[TransformersModelingPipeline,
                                             SklearnPipeline, Sequential],
                      backend: str = None,
                      path: Path = None):
        """
        A function for saving pipeline using provided backend to given path
        :param pipeline_object: A pipeline object to save
        :param backend: Type of backend used for saving given pipeline, has to be one of ["joblib", "tf", "tfm"]
        :param path: Path to save file or directory
        """
        if backend not in ["joblib", "tf", "tfm"]:
            raise ValidationError(
                "Please choose proper backend from ['joblib', 'tf', 'tfm']")
        path = path or self.config.PICKLE_DIR / f"{self.config.PIPELINE_TYPE}_{self.pipeline_version}"
        validate_variables(pipeline_object, backend, path)

        self.remove_old_pipelines()

        if backend == "joblib":
            joblib.dump(pipeline_object, str(path) + ".joblib")
        if backend == "tf":
            pipeline_object.save(path)
        if backend == "tfm":
            pipeline_object.processing_pipeline.tokenizer.save_pretrained(path)
            pipeline_object.model.save_pretrained(path)

        self.logger.info(
            f"Saved pipeline {str(pipeline_object)}, to path {path}")
Example #13
    def load_pipeline(self, backend: str = None, path: Path = None)\
            -> Union[TransformersModelingPipeline, SklearnPipeline, Sequential]:
        """
        Function for loading pipeline from given path using provided backend. Can be used either with set params or
        params from the config
        :param backend: Type of backend used for loading given pipeline, has to be one of ["joblib", "tf", "tfm"]
        :param path: Path to loaded file or directory
        :return: Returns a pipeline for making predictions
        """
        if backend not in ["joblib", "tf", "tfm"]:
            raise ValidationError(
                "Please choose proper backend from ['joblib', 'tf', 'tfm']")
        path = path or self.config.PICKLE_DIR / f"{self.config.PIPELINE_TYPE}_{self.pipeline_version}"
        validate_variables(backend, path)
        if backend == "joblib":
            pipeline = joblib.load(str(path) + '.joblib')
        elif backend == "tf":
            pipeline = load_model(path)
        elif backend == "tfm":
            pipeline = TransformersModelingPipeline.load_from_pretrained(path)
        else:
            pipeline = None

        self.logger.info(f"{path.name} loaded")
        return pipeline
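A short usage sketch of the save/load round trip; `persister` (an instance of the class above) and `fitted_pipeline` are illustrative names, and the default path from the config is used on reload:

persister.save_pipeline(fitted_pipeline, backend="joblib")
reloaded_pipeline = persister.load_pipeline(backend="joblib")
predictions = reloaded_pipeline.predict(x_test)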
Example #14
    def set_img_size_and_model_obj(self, img_size: Tuple[int, int],
                                   model_obj: Sequential):
        validate_variables(img_size, model_obj)

        self.img_size = img_size
        self.model_obj = model_obj
        self.logger.info("Successfully set img size and model obj")
Example #15
    def fit_datasets(self, train_dataset: Dataset, validation_dataset: Dataset,
                     test_dataset: Dataset):
        validate_variables(train_dataset, validation_dataset, test_dataset)

        self.train_dataset = train_dataset
        self.validation_dataset = validation_dataset
        self.test_dataset = test_dataset
Example #16
def test_validate_variables(structured_training_df):
    args_with_none = ["test", 1, None]
    args_pd = ["test2", 2, structured_training_df]

    with pytest.raises(TypeError):
        validate_variables(*args_with_none)
    # Check that this call does not raise
    validate_variables(*args_pd)
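validate_variables itself does not appear among these examples; based on the test above (a None argument raises TypeError, populated values pass), a minimal sketch could be:

def validate_variables(*variables):
    # Reject None explicitly rather than relying on truthiness, which pandas
    # objects do not support
    for idx, variable in enumerate(variables):
        if variable is None:
            raise TypeError(f"Argument at position {idx} must not be None")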
Example #17
    @classmethod
    def load_from_pretrained(cls, path: Path):
        model = TFDistilBertForSequenceClassification.from_pretrained(path)
        tokenizer = DistilBertTokenizerFast.from_pretrained(path)
        processing_pipeline = TransformersProcessingPipeline(
            TextPipeline.encode_dataset, tokenizer)
        validate_variables(model, tokenizer, processing_pipeline)

        return cls(model=model, processing_pipeline=processing_pipeline)
Example #18
    def __init__(self, config):
        """
        Class for saving and loading pipeline objects.
        :param config: Config object for the class
        """
        validate_variables(config)

        self.config = config
        self.pipeline_version = _version
        self.logger = logging.getLogger(__name__)
Example #19
    def __init__(self, config):
        """
        Abstract base class for training pipeline and saving it
        :param config: Config object for the class
        """
        validate_variables(config)

        self.config = config
        self.logger = logging.getLogger(__name__)
        self.modeling_pipeline = None
Example #20
    def make_prediction(self, input_data: dict) -> Tuple[np.ndarray, str]:
        """
        Function to make prediction on given data
        :param input_data: Input data to make prediction on
        :return: Returns a tuple of class probabilities and str with class prediction
        """
        validate_variables(input_data)

        data = self._prepare_data(input_data)
        prediction_probabilities, prediction_string = self._make_sklearn_prediction(
            data)
        return prediction_probabilities, prediction_string
Example #21
    def _load_structured_data(self, path: Path = None) -> DataFrame:
        """
        Utility function to load structured data from the csv
        :param path: Path to data file
        :return: Returns a pandas DataFrame with data loaded
        """
        path = path or self.config.DATA_PATH
        validate_variables(path)

        df = pd.read_csv(path)
        self.df = df
        self.logger.info("Successfully loaded data from csv")
        return df
Example #22
    def _prepare_data(self, input_data: dict) -> DataFrame:
        """
        Utility function to prepare data to format and validate data which can be used in modeling pipeline
        :param input_data: Input data to make prediction on
        :return: Returns a pandas DataFrame with data ready for making predictions using modeling pipeline
        """
        validate_variables(input_data)
        if not self.validator:
            raise RuntimeError("No validator object fitted")

        df = pd.DataFrame(input_data, index=[0])
        df_validated = self.validator.validate(df)
        return df_validated
Example #23
    def __init__(self, config):
        """
        Abstract base class used for making prediction
        :param config: Config object for the class
        """
        validate_variables(config)

        self.config = config
        self.logger = logging.getLogger(__name__)
        self.persister = BasePersistence(config)
        self.modeling_pipeline = None
        self.backend = None
        self.path = None
Example #24
    def __init__(self, config):
        """
        An abstract base class for preprocessing data with tensorflow
        :param config: Config object for the class
        """
        validate_variables(config)
        self.config = config
        self.logger = logging.getLogger(__name__)

        # Placeholders until the datasets are loaded or fitted
        self.train_dataset: Dataset = None
        self.validation_dataset: Dataset = None
        self.test_dataset: Dataset = None

        self.prefetch = False
Example #25
    def load_data(
            self,
            path: Path = None) -> Tuple[DataFrame, DataFrame, Series, Series]:
        """
        Function to load structured data using sklearn
        :param path: Path to data directory
        :return: Returns a tuple with x_train, x_test, y_train, y_test data
        """
        path = path or self.config.DATA_PATH
        validate_variables(path)

        df = self._load_structured_data(path)
        x_train, x_test, y_train, y_test = self._load_data_structured(df)
        return x_train, x_test, y_train, y_test
Example #26
    def get_model(self,
                  x_train: DataFrame = None,
                  x_test: DataFrame = None,
                  y_train: Series = None,
                  y_test: Series = None):
        x_train, x_test, y_train, y_test = self._set_dfs(
            x_train, x_test, y_train, y_test)
        validate_variables(x_train, x_test, y_train, y_test)

        model = self._get_sklearn_model(x_train, x_test, y_train, y_test)
        self.model = model

        self.logger.info("Successfully loaded structured model")
        return model
Example #27
    def load_pipeline(self, backend: str = None, path: Path = None):
        """Function to load pipeline using persister and fit it as a modeling pipeline
        :param backend: Type of backend used for loading given pipeline, has to be one of ["joblib", "tf", "tfm"]
        :param path: Path to loaded file or directory
        :return: Returns a modeling pipeline to make predictions with
        """
        backend = backend or self.backend
        validate_variables(backend)
        if not self.persister:
            raise RuntimeError("No persister object fitted")

        modeling_pipeline = self.persister.load_pipeline(backend=backend,
                                                         path=path)
        self.modeling_pipeline = modeling_pipeline
        return modeling_pipeline
Example #28
    def get_model(self, model_obj=None):
        model_obj = model_obj or self.model_obj
        validate_variables(model_obj)

        model = model_obj(include_top=False,
                          weights='imagenet',
                          classes=len(self.config.DISEASES))
        model.trainable = False

        self.model = model

        self.logger.warning(
            "Warning! get_model function in ImagePipeline returns an unfitted model"
        )
        return model
Example #29
    def _prepare_data(self, input_data: dict,
                      img_shape: Tuple[int, int, int]) -> np.ndarray:
        """
        Utility function to prepare data to format which can be used in modeling pipeline
        :param input_data: Input data to make prediction on
        :param img_shape: Target image shape to resize the data to
        :return: An array with data ready for making predictions using modeling pipeline
        """
        img_shape = img_shape or self.img_shape
        validate_variables(input_data, img_shape)

        data = input_data["img_array"]
        data = np.resize(data, img_shape)
        data = np.expand_dims(data, 0)

        return data
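Note that np.resize repeats or truncates the flattened array instead of interpolating pixels, so the call above may not perform true image resizing. A hedged alternative sketch using cv2.resize (cv2 is already used elsewhere in this repo); `prepare_image` is a hypothetical helper:

import cv2
import numpy as np


def prepare_image(img_array: np.ndarray, img_shape: tuple) -> np.ndarray:
    # cv2.resize interpolates pixel values; dsize is given as (width, height)
    resized = cv2.resize(img_array, (img_shape[1], img_shape[0]))
    return np.expand_dims(resized, 0)  # add a batch dimension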
Example #30
    def _get_img_shape(self, modeling_pipeline: Sequential) -> Tuple[int, int, int]:
        """
        Utility function to get image shape, necessary for resizing input data
        :param modeling_pipeline: A tensorflow model object to get image shape from
        :return: A tuple with image shape
        """
        validate_variables(modeling_pipeline)
        if modeling_pipeline.layers[1].name == "efficientnetb7":
            img_size = (600, 600, 3)
        elif modeling_pipeline.layers[1].name == "efficientnetb6":
            img_size = (528, 528, 3)
        else:
            img_size = (456, 456, 3)

        self.img_size = img_size
        return img_size