예제 #1
0
class PropertyCheckerLogger(PropertyChecker):
    """PropertyChecker variant that reports violations through the logger."""

    def __init__(self):
        super().__init__()
        # Shared application logger instance.
        self.log = Logger().logger

    def _talk_to_logger(self, error_message):
        # Property violations are treated as fatal, hence critical level.
        self.log.critical(error_message)
예제 #2
0
파일: manual.py 프로젝트: SietsmaRJ/capice
class ManualAnnotator:
    """
    Dynamically discovers the VEP annotator modules and applies every usable
    one of them to an input dataset.
    """

    def __init__(self):
        self.log = Logger().logger
        self.vep_annotators = []
        self.location = os.path.join(
            get_project_root_dir(), 'src', 'main', 'python', 'resources',
            'annotaters', 'vep')
        self._load_vep_annotators()

    def _load_vep_annotators(self):
        """Discover, import and filter the usable VEP annotator modules."""
        modules_found = load_modules(self.location)
        self._check_n_modules(modules_found)
        modules_imported = importer(modules_found, path=self.location)
        self._check_n_modules(modules_imported)
        self.vep_annotators.extend(
            module for module in modules_imported
            if 'name' in dir(module) and module.usable
        )
        self._check_n_modules(self.vep_annotators)

    def _check_n_modules(self, modules_list):
        """Raise FileNotFoundError when no annotator modules were found."""
        if not modules_list:
            error_message = 'Unable to locate VEP Processors at {}, ' \
                            'was the directory moved?'.format(self.location)
            self.log.critical(error_message)
            raise FileNotFoundError(error_message)

    def process(self, dataset: pd.DataFrame):
        """
        Apply each usable annotator to the dataset, dropping the source
        column afterwards when the annotator requests it.
        :param dataset: pandas DataFrame
        :return: annotated pandas DataFrame
        """
        for annotator in self.vep_annotators:
            if annotator.name not in dataset.columns or not annotator.usable:
                self.log.warning(
                    'Could not use processor {} on input dataset!'.format(
                        annotator.name
                    )
                )
                continue
            dataset = annotator.process(dataset)
            if annotator.drop:
                dataset.drop(columns=annotator.name, inplace=True)
        return dataset
예제 #3
0
class TrainChecker:
    """
    Class specific to the train_model.py to check certain parts within it's
    process.
    """
    def __init__(self):
        self.log = Logger().logger

    def check_specified_defaults(self, loaded_defaults: dict):
        """
        Function to check if
            "learning_rate",
            "n_estimators" and
            "max_depth"
        are present within the specified defaults file.
        Also check if the variable type match the expected type.
        :param loaded_defaults: dict
        :raises KeyError: when a required argument is absent
        :raises TypeError: when an argument has the wrong type
        """
        expected_types = {
            'learning_rate': float,
            'n_estimators': int,
            'max_depth': int
        }
        for argument, expected_type in expected_types.items():
            if argument not in loaded_defaults:
                error_message = 'Argument {} is not found in the ' \
                                'specified defaults file!'.format(argument)
                self.log.critical(error_message)
                raise KeyError(error_message)
            if not isinstance(loaded_defaults[argument], expected_type):
                error_message = """
                For argument expected type: {}, but got: {}
                """.format(expected_type,
                           type(loaded_defaults[argument]))
                self.log.critical(error_message)
                raise TypeError(error_message)

    def check_labels(self, dataset: pd.DataFrame, include_balancing=False):
        """
        Function to check if
            "binarized_label" and
            "sample_weight"
        are present within the columns of a given dataset.
        Set include_balancing to true if user wants to perform balancing
        algorithm.
        :param dataset: pandas DataFrame
        :param include_balancing: bool
        :raises KeyError: when a required column is missing
        """
        required_columns = ['binarized_label', 'sample_weight']
        if include_balancing:
            required_columns = required_columns + ['Consequence', 'MAX_AF']
        for col_name in required_columns:
            if col_name not in dataset.columns:
                error_message = """
                Error locating label {} within dataset!
                """.format(col_name)
                self.log.critical(error_message)
                raise KeyError(error_message)
예제 #4
0
class TemplateImputeValues(metaclass=ABCMeta):
    """
    Abstract template class for new imputing files.

    Subclasses implement _json_name() to point at the JSON file holding the
    impute (default) values for their VEP version / genome build.
    """
    def __init__(self, name, usable, vep_version, grch_build):
        self.log = Logger().logger
        # Validates the types of the properties assigned below.
        self.property_checker = PropertyCheckerLogger()
        self.name = name
        self.usable = usable
        self.supported_vep_version = vep_version
        self.supported_grch_build = grch_build
        self.impute_data = self._get_impute_data()

    @property
    def name(self):
        """str: name of this imputing file."""
        return self._name

    @name.setter
    def name(self, value='Template'):
        self.property_checker.check_property(value=value, expected_type=str)
        self._name = value

    @property
    def usable(self):
        """bool: whether this imputing file may be used."""
        return self._usable

    @usable.setter
    def usable(self, value=False):
        self.property_checker.check_property(value=value, expected_type=bool)
        self._usable = value

    @property
    def supported_vep_version(self):
        """float: VEP version this imputing file supports."""
        return self._vep_version

    @supported_vep_version.setter
    def supported_vep_version(self, value):
        self.property_checker.check_property(value=value, expected_type=float)
        self._vep_version = value

    @property
    def supported_grch_build(self):
        """int: genome build this imputing file supports."""
        return self._grch_build

    @supported_grch_build.setter
    def supported_grch_build(self, value):
        self.property_checker.check_property(value=value, expected_type=int)
        self._grch_build = value

    def _get_impute_data(self):
        """Read the impute JSON and return its contents as a dict."""
        with open(self._json_loc()) as json_file:
            loaded = json.load(json_file)
        return loaded

    def _json_loc(self):
        """Build the full path to the impute JSON, raising when undefined."""
        directory = os.path.join(get_project_root_dir(), 'src', 'main',
                                 'python', 'resources', 'data_files',
                                 'json_data')
        json_name = self._json_name()
        # The abstract default 'none' means the subclass forgot to override.
        if json_name == 'none':
            error_message = 'Location of JSON must be specified!'
            self.log.critical(error_message)
            raise FileNotFoundError(error_message)
        return os.path.join(directory, json_name)

    @staticmethod
    @abstractmethod
    def _json_name():
        """
        Abstract template function to define the location of where the imputing
        JSON is stored, containing the required columns for the input datafile.
        :return: path-like
        """
        return 'none'

    @property
    def annotation_features(self):
        """
        Property getter annotation_feature.
        Get the annotation features defined within the impute file.

        :return: list
        """
        return list(self.impute_data.keys())

    @property
    def impute_values(self):
        """
        Property impute_values getter. Get the default / impute values as
        defined within an impute file, skipping features without a default.

        :return: dict
        """
        return {feature: default
                for feature, default in self.impute_data.items()
                if default is not None}
예제 #5
0
class PreProcessor:
    """
    Class to dynamically load in all model files for preprocessing and choosing
    the correct preprocessing file according to the given config arguments or
    parsed VEP file header. (or the --overwrite_model_file argument)
    """
    def __init__(self, is_train: bool = False):
        self.manager = CapiceManager()
        self.log = Logger().logger
        self.log.info('Preprocessor started.')
        # Globals set earlier in the run by the input/version checkers.
        self.overrule = self.manager.overwrite_model
        self.vep_version = self.manager.vep_version
        self.grch_build = self.manager.grch_build
        self.train = is_train
        self.preprocessors = []
        self.preprocessor = None
        self._prepare_preprocessor()

    def _prepare_preprocessor(self):
        """
        Function to see if the training protocol should be used or the
        preprocessors should be loaded in.
        """
        if self.train:
            # Local import: only required when running the training protocol.
            from src.main.python.resources.models.training_preprocessor import \
                TrainPreprocessor
            self.preprocessor = TrainPreprocessor()
        else:
            self._load_preprocessors()
            self._load_correct_preprocessor()
        self._check_preprocessor_is_applied()

    def _load_preprocessors(self):
        """
        Function to dynamically load in the preprocessors modules,
        but must have the following properties:
            name,
            supported_vep_version and
            supported_genomebuild_version.
        """
        self.log.info('Identifying preprocessing files.')
        directory = os.path.join(
            get_project_root_dir(), 'src', 'main', 'python', 'resources',
            'models')
        usable_modules = load_modules(directory)
        if not usable_modules:
            self._raise_no_module_found_error()
        imported_modules = importer(
            usable_modules=usable_modules,
            path=directory
        )
        required_attributes = ('name', 'supported_vep_version',
                               'supported_grch_build')
        for module in imported_modules:
            attributes = dir(module)
            if all(attr in attributes for attr in required_attributes):
                self.preprocessors.append(module)
        if not self.preprocessors:
            self._raise_no_module_found_error()
        self.log.info(
            'Succesfully loaded {} preprocessors.'.format(
                len(self.preprocessors)
            )
        )

    def _raise_no_module_found_error(self):
        """
        Specialized function to be used into
            _load_preprocessors() and
            _load_correct_preprocessor()
        to be raised when no preprocessing files can be found.
        """
        error_message = ('No usable python files are '
                         'found within the model directory!')
        self.log.critical(error_message)
        raise FileNotFoundError(error_message)

    def _load_correct_preprocessor(self):
        """
        Function to check the dynamically loaded preprocessors to match either
        the overrule argument or the vep version and genome build.
        """
        for candidate in self.preprocessors:
            if self.overrule and candidate.name == self.overrule:
                self.log.info(
                    'Overrule successful for: {} , located at: {}'.format(
                        self.overrule,
                        inspect.getfile(candidate.__class__)
                    )
                )
                self.preprocessor = candidate
                break
            else:
                module_vep = candidate.supported_vep_version
                module_grch = candidate.supported_grch_build
                if module_vep == self.vep_version and \
                        module_grch == self.grch_build:
                    self.log.info("""
                    Preprocessing and model file successfully found: {} , 
                    Located at: {}
                    """.format(
                        candidate.name,
                        inspect.getfile(candidate.__class__)
                    ).strip()
                    )
                    self.preprocessor = candidate
                    break

    def _check_preprocessor_is_applied(self):
        """Raise FileNotFoundError when no preprocessor could be matched."""
        if self.preprocessor is None:
            if self.overrule:
                error_message = 'No model data file found for overrule: ' \
                                '{}'.format(self.overrule)
            else:
                error_message = """
                No model data file found for 
                VEP version: {} and 
                genome build: {}""".format(
                    self.vep_version,
                    self.grch_build
                ).strip()
            self.log.critical(error_message)
            raise FileNotFoundError(error_message)

    def preprocess(self, datafile: pd.DataFrame):
        """
        Callable function for external modules to start call the preprocessor
        of the correctly chosen module.
        :param datafile: unprocessed pandas DataFrame
        :return: processed pandas Dataframe
        """
        return self.preprocessor.preprocess(
            dataset=datafile,
            is_train=self.train
        )

    def predict(self, datafile: pd.DataFrame):
        """
        Callable function for external modules to start the call to the
        predict of the correctly chosen module.
        :param datafile: preprocessed pandas DataFrame
        :return: predicted pandas DataFrame
        """
        return self.preprocessor.predict(data=datafile)

    def get_model_features(self):
        """Return the chosen module's model features, raising if unset."""
        if self.preprocessor is None:
            error_message = "Preprocessor has to be initialized before " \
                            "model features can be requested."
            self.log.critical(error_message)
            raise InitializationError(error_message)
        return self.preprocessor.model_features
예제 #6
0
class CapiceImputing:
    """
    Class to dynamically load in all imputing files and identify the file
    suitable for the run's use case.
    """
    def __init__(self):
        self.manager = CapiceManager()
        # VEP version and GRCh build were set globally earlier in the run.
        self.vep_version = self.manager.vep_version
        self.grch_build = self.manager.grch_build
        self.log = Logger().logger
        self.log.info('Imputer started.')
        self.overrule = self.manager.overwrite_impute
        self.modules = []
        self.module = None
        self._load_modules()
        self._is_correct_datafile_present()
        self._check_if_imputer_is_applied()
        # The attributes below are populated by _load_values() when impute()
        # is called.
        self.columns = []
        self.annotation_columns_present = []
        self.impute_values = {}
        self.pre_dtypes = {}
        self.dtypes = {}

    def _load_modules(self):
        """
        Method to dynamically load in all python files containing a class that
        contains the properties
            name and
            _json_name.
        If at the end of this function, the list of impute files is empty,
        will throw the module not found error.
        """
        self.log.info('Identifying imputing files.')
        directory = os.path.join(get_project_root_dir(), 'src', 'main',
                                 'python', 'resources', 'data_files',
                                 'imputing')
        usable_modules = load_modules(directory)
        if len(usable_modules) < 1:
            self._raise_no_module_found_error()
        loaded_modules = importer(usable_modules=usable_modules,
                                  path=directory)
        for module in loaded_modules:
            if "name" in dir(module) and "_json_name" in dir(module):
                self.modules.append(module)
        if len(self.modules) < 1:
            self._raise_no_module_found_error()
        self.log.info(
            'Identified {} files available for usage in imputing.'.format(
                len(self.modules)))

    def _raise_no_module_found_error(self):
        """
        Function to raise when no suitable impute files are found.
        Put into a function since 2 other functions within this module will use
        it.
        """
        error_message = 'No usable python files are found ' \
                        'within the imputing directory!'
        self.log.critical(error_message)
        raise FileNotFoundError(error_message)

    def _is_correct_datafile_present(self):
        """
        Function to check the VEP version and GRCh build
        (or --overwrite_impute_file)
        match the impute file.
        """
        for module in self.modules:
            if self.overrule and module.name == self.overrule:
                self.log.info(
                    'Overrule successful for: {} , located at: {}'.format(
                        self.overrule, inspect.getfile(module.__class__)))
                self.module = module
                break
            else:
                module_vep_version = module.supported_vep_version
                module_grch_build = module.supported_grch_build
                if module_vep_version == self.vep_version and \
                        module_grch_build == self.grch_build:
                    self.log.info('Impute data file successfully found: {} , '
                                  'located at: {}'.format(
                                      module.name,
                                      inspect.getfile(module.__class__)))
                    self.module = module
                    break

    def _check_if_imputer_is_applied(self):
        # Checking if self.module was assigned by _is_correct_datafile_present
        if self.module is None:
            if self.overrule:
                error_message = 'No imputing data file found for overrule: ' \
                                '{}'.format(self.overrule)
            else:
                error_message = 'No imputing data file found for ' \
                                'VEP version: {} and ' \
                                'GRCh build: {}'.format(self.vep_version,
                                                        self.grch_build
                                                        )
            self.log.critical(error_message)
            raise FileNotFoundError(error_message)

    def _load_values(self, dataset: pd.DataFrame):
        """
        Function to be called right when impute() is called,
        gets the input datafile features,
        imputes values from the impute file and
        saves the datafile features to the manager.
        """
        self.columns = self.module.annotation_features
        for col in self.columns:
            if col in dataset.columns:
                self.annotation_columns_present.append(col)
        self.manager.annotation_features = self.columns
        self.impute_values = self.module.impute_values

    def impute(self, datafile: pd.DataFrame):
        """
        Function to call the CapiceImputing to start imputing.
        :param datafile: not imputed pandas DataFrame
        :return: pandas DataFrame
        """
        self._load_values(datafile)
        datafile = self._check_chrom_pos(datafile)
        self._get_nan_ratio_per_column(dataset=datafile)
        self._get_full_nan_row(dataset=datafile)
        # Fix: previously the dropna() result was discarded (dropna without
        # inplace returns a new frame), making the statement a no-op. Keep
        # the returned frame so fully-NaN annotation rows are removed as
        # intended. (The CAPICE_drop_out filter below removes the same rows,
        # so the end result stays consistent with prior behavior.)
        datafile = datafile.dropna(how='all',
                                   subset=self.annotation_columns_present)
        datafile = datafile[~datafile['CAPICE_drop_out']]
        datafile.drop(columns=['CAPICE_drop_out'], inplace=True)
        self._correct_dtypes(datafile=datafile)
        datafile.fillna(self.impute_values, inplace=True)
        # Two-step cast: int columns go through float first (see
        # _save_dtypes) before being narrowed to their final dtype.
        # NOTE(review): presumably because values like '5.0' cannot be cast
        # straight to int — confirm.
        datafile = datafile.astype(dtype=self.pre_dtypes, copy=False)
        datafile = datafile.astype(dtype=self.dtypes, copy=False)
        datafile = self._add_missing_columns(datafile)
        self.log.info('Imputing successfully performed.')
        return datafile

    @deprecated
    def _add_missing_columns(self, datafile: pd.DataFrame):
        """Add any impute-file column absent from datafile with its default."""
        for key, value in self.impute_values.items():
            if key not in datafile.columns:
                datafile[key] = value
        return datafile

    def _correct_dtypes(self, datafile: pd.DataFrame):
        """
        Function to correct the dtypes that originate from the lookup annotator
        according to the dtypes specified within the data json.
        """
        for key, item in self.impute_values.items():
            if key in datafile.columns:
                self._save_dtypes(key=key, item=item)

    def _save_dtypes(self, key, item):
        # Int columns are routed through float in the first cast pass.
        if isinstance(item, int):
            self.pre_dtypes[key] = float
        else:
            self.pre_dtypes[key] = type(item)
        self.dtypes[key] = type(item)

    def _check_chrom_pos(self, dataset: pd.DataFrame):
        """
        Function to check if all values of the columns Chr and Pos are present.
        :param dataset: not imputed pandas DataFrame
        :return: pandas DataFrame
            containing no NaN or gaps for Chr and Pos columns.
        """
        chrom_is_float = False
        if dataset['Chr'].isnull().values.any():
            if dataset.dtypes['Chr'] == np.float64:
                chrom_is_float = True
            n_delete = dataset['Chr'].isnull().values.sum()
            self.log.warning('Detected NaN in the Chromosome column! '
                             'Deleting {} row(s).'.format(n_delete))
            dataset = dataset[~dataset['Chr'].isnull()]
        if dataset['Pos'].isnull().values.any():
            n_delete = dataset['Pos'].isnull().values.sum()
            self.log.warning('Detected NaN is the Position column! '
                             'Deleting {} row(s).'.format(n_delete))
            dataset = dataset[~dataset['Pos'].isnull()]
        dataset.index = range(0, dataset.shape[0])
        if chrom_is_float:
            # Cast float Chr back through int so string values read '1'
            # instead of '1.0'.
            dataset['Chr'] = dataset['Chr'].astype(int)
            dataset['Chr'] = dataset['Chr'].astype(str)
        dataset['Pos'] = dataset['Pos'].astype(int)
        return dataset

    def _get_nan_ratio_per_column(self, dataset: pd.DataFrame):
        """
        Generic function to get the percentage of gaps per column
        :param dataset: not imputed pandas DataFrame
        """
        for column in dataset.columns:
            series = dataset[column]
            self._calculate_percentage_nan(column=series)

    def _calculate_percentage_nan(self, column):
        """Log (debug level) the NaN percentage of a single pandas Series."""
        n_nan = column.isnull().sum()
        if n_nan > 0:
            n_samples = column.size
            p_nan = round((n_nan / n_samples) * 100, ndigits=2)
            self.log.debug(
                'NaN detected in column {}, percentage: {}%.'.format(
                    column.name, p_nan))

    def _get_full_nan_row(self, dataset: pd.DataFrame):
        """
        Function to get the samples of which absolutely no prediction is
        possible due to all non chr pos ref alt rows being gaps.
        :param dataset: not imputed pandas DataFrame
        """
        n_samples = dataset.shape[0]
        # NOTE(review): the index is set to start at 1 here, while the
        # warning below claims python-based (0-start) indexing — confirm
        # which is intended before relying on the reported row numbers.
        dataset.index = range(1, n_samples + 1)
        dataset['CAPICE_drop_out'] = dataset[
            self.annotation_columns_present].isnull().values.all(axis=1)
        samples_dropped_out = dataset[dataset['CAPICE_drop_out']]
        if samples_dropped_out.shape[0] > 0:
            self.log.warning(
                'The following samples are filtered out due to missing values: '
                '(indexing is python based, '
                'so the index starts at 0). \n {}'.format(samples_dropped_out[[
                    'Chr', 'Pos', 'Ref', 'Alt', 'FeatureID'
                ]]))
        else:
            self.log.info(
                'No samples are filtered out due to too many NaN values.')
예제 #7
0
class InputVersionChecker:
    """
    Class to check the given VEP config argument and file VEP version match.

    Class is self running.
    """
    def __init__(self, config_vep_version: float, file_vep_version: float,
                 config_grch_build: int, file_grch_build: int):
        """
        Class to check the given VEP config argument and
        the header of the VEP file match.
        :param config_vep_version: float,
            config argument for the used VEP version
        :param file_vep_version: float,
            VEP version as parsed from the input file header
        :param config_grch_build: int,
            config argument for the used GRCh build
        :param file_grch_build: int,
            GRCh build as parsed from the input file header
        """
        self.config_vep_version = config_vep_version
        self.file_vep_version = file_vep_version
        self.config_grch_build = config_grch_build
        self.file_grch_build = file_grch_build
        self.manager = CapiceManager()
        self.export_vep_version = None
        self.export_grch_build = None
        self.check_match = []
        self.unable_check = []
        self.check_overrule = False
        self.log = Logger().logger
        self._check_all_present()
        if self.check_overrule:
            self._check_overrule()
        self._check_version_match()
        self._set_global_vep_version()
        self._set_global_grch_build()

    def _set_global_vep_version(self):
        """
        Function to provide the CapiceManager with the VEP version to be used
        globally later on in CAPICE.
        """
        self.manager.vep_version = self.export_vep_version
        self.log.info('VEP version set to: {}'.format(self.export_vep_version))

    def _set_global_grch_build(self):
        """
        Function to provide the CapiceManager with the Genome Build version to
        be used globally later on in CAPICE.
        """
        self.manager.grch_build = self.export_grch_build
        self.log.info('GRCh build set to: {}'.format(self.export_grch_build))

    def _check_overrule(self):
        """
        Function called when either the
            VEP version or
            GRCh build
        can not be determined.
        Overrule must be present for both impute and model,
        since it can not determine what file to use without VEP or
        GRCh argument.
        """
        if self.manager.overwrite_impute is False and \
                self.manager.overwrite_model is False:
            error_message = """
            VEP version or GRCh build not specified and both overwrites are not 
            set! Not able to find a correct impute or processing file!
            """.strip()
            self.log.critical(error_message)
            raise InputError(error_message)

    def _check_all_present(self):
        """
        Function to check if both the VEP version and GRCh build are present
        within either the config arguments or within the file.
        """
        dict_of_all_present = {
            'VEP': [self.file_vep_version, self.config_vep_version],
            'GRCh': [self.file_grch_build, self.config_grch_build]
        }
        for type_of_check in dict_of_all_present.keys():
            to_check = dict_of_all_present[type_of_check]
            self._check_individual_argument(to_check=to_check,
                                            type_of_check=type_of_check)

    def _check_individual_argument(self, to_check, type_of_check):
        """
        Function belonging to _check_all_present to check if a VEP version and
        GRCh build can be set globally.
        :param to_check: list
        :param type_of_check: string
        """
        if False in to_check:
            if to_check.count(False) == len(to_check):
                self._turn_on_check_overrule(type_of_check=type_of_check)
            for argument in to_check:
                self._apply_export_version(argument=argument,
                                           type_of_check=type_of_check)
        else:
            self.check_match.append(type_of_check)

    def _turn_on_check_overrule(self, type_of_check):
        """
        Function to turn on the overrule check if no VEP or GRCh arguments are
        passed.
        """
        # Fix: a previous assignment of type_of_check to check_overrule was
        # immediately overwritten by True below — removed the dead store.
        self.log.warning(
            'Unable to obtain {} version from file or config file!'.format(
                type_of_check))
        self.check_overrule = True

    def _apply_export_version(self, argument, type_of_check):
        """
        Function to set the global VEP version or GRCh build.
        :param argument: int or float
        :param type_of_check: string, 'VEP' or 'GRCh'
        """
        if argument is not False:
            if type_of_check == 'VEP':
                self.export_vep_version = argument
            else:
                self.export_grch_build = argument

    def _check_version_match(self):
        """
        Function to check if the Config Argument and the file header specified
        VEP versions match.
        If not: use the config argument as form of "overwrite" and warn.
        """
        if len(self.check_match) > 0:
            for check_match in self.check_match:
                if check_match == 'VEP':
                    self._check_vep_match(check_match=check_match)
                elif check_match == 'GRCh':
                    self._check_grch_match(check_match=check_match)

    def _check_vep_match(self, check_match):
        """Warn on a CLA/file VEP version mismatch; export the file version."""
        # Fix: version_cla and version_file were swapped here (compare
        # _check_grch_match), making the mismatch warning report the CLA and
        # file versions the wrong way around.
        if self.file_vep_version != self.config_vep_version:
            self._raise_version_mismatch(type_of_mismatch=check_match,
                                         version_cla=self.config_vep_version,
                                         version_file=self.file_vep_version)
        else:
            self._raise_version_mismatch(type_of_mismatch=check_match,
                                         match_successful=True)
        self.export_vep_version = self.file_vep_version

    def _check_grch_match(self, check_match):
        """Warn on a CLA/file GRCh build mismatch; export the file build."""
        if self.config_grch_build != self.file_grch_build:
            self._raise_version_mismatch(type_of_mismatch=check_match,
                                         version_cla=self.config_grch_build,
                                         version_file=self.file_grch_build)
        else:
            self._raise_version_mismatch(type_of_mismatch=check_match,
                                         match_successful=True)
        self.export_grch_build = self.file_grch_build

    def _raise_version_mismatch(self,
                                type_of_mismatch,
                                version_cla=None,
                                version_file=None,
                                match_successful=False):
        """
        Log a successful match, or warn (warnings module + logger) that the
        CLA-supplied version differs from the file version.
        """
        if match_successful:
            self.log.info(
                'Successfully matched CLA and file versions for {}.'.format(
                    type_of_mismatch))
        else:
            warning_message = """
            Warning matching {} versions. 
            CLA version supplied: 
            {} does not match file version: {} !""".format(
                type_of_mismatch, version_cla, version_file).strip()
            warnings.warn(warning_message)
            self.log.warning(warning_message)