예제 #1
0
class Annotator:
    """
    Adds annotation columns to a dataset: first the reference sequence
    surrounding every position, then whatever the manual annotator adds.
    """

    def __init__(self, dataset: pd.DataFrame):
        """
        :param dataset: pandas DataFrame to annotate. Its 'Chr' and 'Pos'
            columns are read when the sequence column is added.
        """
        self.log = Logger().logger
        self.manager = CapiceManager()
        self.fasta_lookup = FastaLookupAnnotator()
        self.manual_annotater = ManualAnnotator()
        self.dataset = dataset

    def annotate(self):
        """
        Start the annotation process.
        :return: pandas dataframe
        """
        self.log.info('Starting manual annotation process.')
        self._add_sequence()
        self.dataset = self.manual_annotater.process(dataset=self.dataset)
        self.log.info('Annotation successful.')
        self.log.debug(
            'Final shape of the annotated data: {}'.format(self.dataset.shape)
        )
        return self.dataset

    def _add_sequence(self):
        """
        Add the 'Seq' column: for every row the reference sequence is
        requested from 75 positions before 'Pos' up to 75 positions after it.
        The Fasta connection is closed afterwards, also when a lookup raises.
        """
        self.log.debug('Annotation addition: sequence')
        try:
            self.dataset['Seq'] = self.dataset.apply(
                lambda x: self.fasta_lookup.get_reference_sequence(
                    chromosome=x['Chr'],
                    start=x['Pos'] - 75,
                    end=x['Pos'] + 75
                ), axis=1
            )
        finally:
            # Release the Fasta handle even if one of the lookups failed,
            # otherwise the handle would stay open on the error path.
            self.fasta_lookup.close_connection()
예제 #2
0
class FastaLookupAnnotator:
    """
    Thin wrapper around a pysam FastaFile opened on the reference genome
    location stored in the CapiceManager.
    """

    def __init__(self):
        self.log = Logger().logger
        self.manager = CapiceManager()
        self.fasta_loc = self.manager.reference_genome
        self.fasta = None
        self._load_fasta()

    def _load_fasta(self):
        """
        Open the Fasta file at self.fasta_loc and keep the handle on
        self.fasta.
        """
        self.log.info('Loading in Fasta file, this may take a moment.')
        self.fasta = pysam.FastaFile(self.fasta_loc)
        self.log.info('Succesfully loaded Fasta file at: {}'.format(
            self.fasta_loc))

    def get_reference_sequence(self, chromosome: str, start: int, end: int):
        """
        Function to obtain a sequence from the reference Fasta file.

        A negative start is clamped to 0 and the returned sequence is
        left-padded with one 'N' per position that fell before 0.

        :param chromosome: string,
            chromosome to get the reference sequence from.
        :param start: Chromosomal position at what point the sequence
        should be obtained.

        :param end: Chromosomal position at what point the obtained sequence
        should end.

        :return: string, obtained reference sequence. None when the lookup
        raises a KeyError (a warning is logged in that case).
        """
        try:
            self.log.debug('Obtaining reference sequence for: '
                           '[Chromosome: {}], [start: {}], [stop: {}]'.format(
                               chromosome, start, end))
            # Keep the requested start untouched so that the warning below
            # reports what the caller asked for, not the clamped value.
            n_padding = abs(start) if start < 0 else 0
            fetch_start = max(start, 0)
            return_sequence = self.fasta.fetch(chromosome, fetch_start, end)
            if n_padding:
                return_sequence = '{}{}'.format('N' * n_padding,
                                                return_sequence)
            return return_sequence
        except KeyError:
            self.log.warning(
                'Unable to obtain sequence for: '
                '[Chromosome: {}], [start: {}], [stop: {}], '
                'did you supply a reference with contigs 1-22 + x,y,mt?'.
                format(chromosome, start, end))
            return None

    def close_connection(self):
        """
        Function to tell pysam to close the connection to the Fasta file
        """
        # Identity check instead of truthiness: whether a handle was opened
        # must not depend on how the handle object evaluates as a boolean.
        if self.fasta is not None:
            self.fasta.close()
예제 #3
0
class LoadFilePreProcessor:
    """
    Normalizes the column names of a freshly loaded input file: strips the
    leading '%' or '#' marker and renames a fixed set of columns.
    """

    def __init__(self, dataset: pd.DataFrame):
        """
        :param dataset: pandas DataFrame whose columns are corrected. The
            frame is modified in place and also returned by process().
        """
        self.dataset = dataset
        self.log = Logger().logger

    def process(self):
        """
        Function to start the LoadFilePreProcessor to correct the input file of
        each column starting with % and the renaming of certain columns,
        like #CHROM to Chr.

        Returns
        -------
        dataset :   pandas.DataFrame
                    Processed dataset with corrected % sign and renamed columns.
        """
        self.log.debug('Starting correcting % sign.')
        self._correct_percentage_sign()
        self.log.debug('% sign corrected, starting renaming of columns.')
        self._col_renamer()
        self.dataset['Chr'] = self.dataset['Chr'].astype(str)
        self.log.info('LoadFilePreProcessor successful.')
        return self.dataset

    def _correct_percentage_sign(self):
        """
        Remove a single leading '%' or '#' from every column name.

        Only the first character is removed, so a marker that occurs again
        later in the name (e.g. '%odd%name') stays part of the name.
        """
        self.dataset.columns = [
            column[1:] if column.startswith(('%', '#')) else column
            for column in self.dataset.columns
        ]

    def _col_renamer(self):
        """
        Function to rename "CHROM, POS, REF, ALT, SYMBOL_SOURCE, Feature,
        SYMBOL, INTRON and EXON" to
        "Chr, Pos, Ref, Alt, SourceID, FeatureID, GeneName, Intron and Exon".
        """
        self.dataset.rename(columns={
            'CHROM': 'Chr',
            'POS': 'Pos',
            'REF': 'Ref',
            'ALT': 'Alt',
            'SYMBOL_SOURCE': 'SourceID',
            'Feature': 'FeatureID',
            'SYMBOL': 'GeneName',
            'INTRON': 'Intron',
            'EXON': 'Exon'
        },
                            inplace=True)
예제 #4
0
class InputParser:
    """
    Reads the (VEP) input file into a pandas DataFrame using a configurable
    separator, tab by default.
    """

    def __init__(self):
        self.sep = '\t'
        self.log = Logger().logger

    def set_separator(self, sep: str):
        """
        Replace the separator that parse() hands to pandas.read_csv.

        :param sep: str, separator to use instead of the default tab
        """
        self.sep = sep

    def parse(self, input_file_loc: str, skip_rows: int):
        """
        Load the input file, skipping the leading comment lines, and remove
        rows that are entirely empty as well as duplicated rows.
        A '.' in the file is read as a missing value.

        :param input_file_loc: str, location of the input file
        :param skip_rows: int,
            the amount of lines pandas has to skip at the top of the file
        :return: pandas DataFrame holding the loaded rows
        """
        # Only used for the log line: a literal tab is unreadable in a log.
        separator_label = 'Tab' if self.sep == '\t' else self.sep
        self.log.info('Reading VEP file from: {} using separator: {}'.format(
            input_file_loc, separator_label))
        loaded = pd.read_csv(
            input_file_loc,
            sep=self.sep,
            skiprows=skip_rows,
            na_values='.',
            low_memory=False,
        )
        loaded = loaded.dropna(how='all')
        loaded = loaded.drop_duplicates()
        n_samples = loaded.shape[0]
        self.log.info('Input file at {} loaded with {} samples.'.format(
            input_file_loc, n_samples))
        return loaded
예제 #5
0
class PreProcessor:
    """
    Class to dynamically load in all model files for preprocessing and choosing
    the correct preprocessing file according to the given config arguments or
    parsed VEP file header. (or the --overwrite_model_file argument)
    """
    def __init__(self, is_train: bool = False):
        """
        :param is_train: when True the training preprocessor is used instead
            of selecting one of the dynamically loaded model modules.
        """
        self.manager = CapiceManager()
        self.log = Logger().logger
        self.log.info('Preprocessor started.')
        self.overrule = self.manager.overwrite_model
        self.vep_version = self.manager.vep_version
        self.grch_build = self.manager.grch_build
        self.train = is_train
        self.preprocessors = []
        self.preprocessor = None
        self._prepare_preprocessor()

    def _prepare_preprocessor(self):
        """
        Function to see if the training protocol should be used or the
        preprocessors should be loaded in.

        :raises FileNotFoundError: when no preprocessor ends up selected.
        """
        if self.train:
            # Imported here so the training module is only loaded when the
            # training pathway is actually requested.
            from src.main.python.resources.models.training_preprocessor import \
                TrainPreprocessor
            self.preprocessor = TrainPreprocessor()
        else:
            self._load_preprocessors()
            self._load_correct_preprocessor()
        self._check_preprocessor_is_applied()

    def _load_preprocessors(self):
        """
        Function to dynamically load in the preprocessors modules,
        but must have the following properties:
            name,
            supported_vep_version and
            supported_grch_build.

        :raises FileNotFoundError: when the model directory yields no usable
            module at all, or none that exposes the three properties.
        """
        self.log.info('Identifying preprocessing files.')
        directory = os.path.join(get_project_root_dir(),
                                 'src',
                                 'main',
                                 'python',
                                 'resources',
                                 'models')
        usable_modules = load_modules(directory)
        if len(usable_modules) < 1:
            self._raise_no_module_found_error()
        imported_modules = importer(
            usable_modules=usable_modules,
            path=directory
        )
        required_properties = (
            "name", "supported_vep_version", "supported_grch_build"
        )
        for module in imported_modules:
            available = dir(module)
            if all(prop in available for prop in required_properties):
                self.preprocessors.append(module)
        if len(self.preprocessors) < 1:
            self._raise_no_module_found_error()
        self.log.info(
            'Succesfully loaded {} preprocessors.'.format(
                len(self.preprocessors)
            )
        )

    def _raise_no_module_found_error(self):
        """
        Specialized function to be used into
            _load_preprocessors() and
            _load_correct_preprocessor()
        to be raised when no preprocessing files can be found.

        :raises FileNotFoundError: always.
        """
        error_message = 'No usable python files are ' \
                        'found within the model directory!'
        self.log.critical(error_message)
        raise FileNotFoundError(error_message)

    def _load_correct_preprocessor(self):
        """
        Function to check the dynamically loaded preprocessors to match either
        the overrule argument or the vep version and genome build.

        The overrule is searched for first over ALL loaded preprocessors, so
        an explicitly requested module wins regardless of the order in which
        the modules were loaded. Only when no module carries the overrule
        name (or no overrule is set) the first module matching the VEP
        version and genome build is selected.
        """
        if self.overrule:
            for preprocessor in self.preprocessors:
                if preprocessor.name == self.overrule:
                    self.log.info(
                        'Overrule successful for: {} , located at: {}'.format(
                            self.overrule,
                            inspect.getfile(preprocessor.__class__)
                        )
                    )
                    self.preprocessor = preprocessor
                    return
        for preprocessor in self.preprocessors:
            module_vep = preprocessor.supported_vep_version
            module_grch = preprocessor.supported_grch_build
            if module_vep == self.vep_version and \
                    module_grch == self.grch_build:
                self.log.info("""
                    Preprocessing and model file successfully found: {} , 
                    Located at: {}
                    """.format(
                    preprocessor.name,
                    inspect.getfile(preprocessor.__class__)
                ).strip()
                )
                self.preprocessor = preprocessor
                return

    def _check_preprocessor_is_applied(self):
        """
        Verify that a preprocessor has been selected.

        :raises FileNotFoundError: when self.preprocessor is still None; the
            message names the overrule or the VEP version and genome build.
        """
        if self.preprocessor is None:
            if self.overrule:
                error_message = 'No model data file found for overrule: ' \
                                '{}'.format(self.overrule)
            else:
                error_message = """
                No model data file found for 
                VEP version: {} and 
                genome build: {}""".format(
                    self.vep_version,
                    self.grch_build
                ).strip()
            self.log.critical(error_message)
            raise FileNotFoundError(error_message)

    def preprocess(self, datafile: pd.DataFrame):
        """
        Callable function for external modules to start call the preprocessor
        of the correctly chosen module.
        :param datafile: unprocessed pandas DataFrame
        :return: processed pandas Dataframe
        """
        processed_data = self.preprocessor.preprocess(
            dataset=datafile,
            is_train=self.train
        )
        return processed_data

    def predict(self, datafile: pd.DataFrame):
        """
        Callable function for external modules to start the call to the
        predict of the correctly chosen module.
        :param datafile: preprocessed pandas DataFrame
        :return: predicted pandas DataFrame
        """
        predicted_data = self.preprocessor.predict(data=datafile)
        return predicted_data

    def get_model_features(self):
        """
        Return the model_features attribute of the selected preprocessor.

        :raises InitializationError: when no preprocessor is selected.
        """
        if self.preprocessor is None:
            error_message = "Preprocessor has to be initialized before " \
                            "model features can be requested."
            self.log.critical(error_message)
            raise InitializationError(error_message)
        return self.preprocessor.model_features
예제 #6
0
class Exporter:
    """
    Writes predictions, training datasets and models to disk and picks a
    filename that does not clash with an existing file unless forced.
    """
    def __init__(self, file_path):
        """
        :param file_path: directory in which the exported files are placed
        """
        self.log = Logger().logger
        self.force = CapiceManager().force
        self.now = CapiceManager().now
        self.capice_filename = CapiceManager().output_filename
        self.file_path = file_path
        # Columns (and their order) written by export_capice_prediction().
        exported_members = (
            Column.chr_pos_ref_alt,
            Column.GeneName,
            Column.FeatureID,
            Column.Consequence,
            Column.probabilities,
        )
        self.export_cols = [member.value for member in exported_members]

    def export_capice_prediction(self, datafile: pd.DataFrame):
        """
        Write the export columns of the prediction dataset as a gzipped,
        tab separated file named after the configured output filename.
        :param datafile: prediction pandas DataFrame
        """
        filename = self._export_filename_ready(
            file_name=self.capice_filename,
            check_extension=False,
        )
        selection = datafile[self.export_cols]
        selection.to_csv(filename, sep='\t', compression='gzip', index=False)
        self.log.info(
            'Successfully exported CAPICE datafile to: {}'.format(filename))

    def export_capice_training_dataset(self, datafile: pd.DataFrame, name: str,
                                       feature: str):
        """
        Write a (split) dataset from the training pathway as a gzipped,
        tab separated file.
        :param datafile: pandas DataFrame
        :param name: Name of the export file
        :param feature: Description of what is exported, used in the log
        """
        filename = self._export_filename_ready(file_name=name)
        datafile.to_csv(filename, sep='\t', compression='gzip', index=False)
        message = 'Exported {} with shape {} to: {}'.format(
            feature, datafile.shape, filename)
        self.log.info(message)

    def export_capice_model(self, model, model_type):
        """
        Pickle a newly created CAPICE model to a timestamped file.
        :param model: RandomizedSearchCV or XGBClassifier instance
        :param model_type: either "XGBClassifier" or "RandomizedSearchCV";
            any other value results in an empty name prefix.
        """
        prefixes = {
            'XGBClassifier': 'xgb_classifier',
            'RandomizedSearchCV': 'randomized_search_cv',
        }
        export_name = prefixes.get(model_type, "")
        timestamp = self.now.strftime("%H%M%S%f_%d%m%Y")
        filename = self._export_filename_ready(
            file_name='{}_{}'.format(export_name, timestamp),
            type_export='model',
        )
        with open(filename, 'wb') as model_dump:
            pickle.dump(model, model_dump)

    def _export_filename_ready(self,
                               file_name,
                               type_export='dataset',
                               check_extension=True):
        """
        Build the full export path, avoiding an existing file unless force
        is turned on (in which case the existing file is removed).
        :param file_name: Name of the to be created file
        :param type_export: "dataset" for the export of datasets or
            "model" for the export of models. Only looked at when
            check_extension is True.
        :param check_extension: Boolean if the extension belonging to
            type_export should be appended when file_name lacks it.
        :return: full export path
        """
        # Directory is derived from the name as given, before any extension
        # is appended.
        basedir = os.path.dirname(os.path.join(self.file_path, file_name))
        if check_extension:
            required_extension = {
                'dataset': '.tsv.gz',
                'model': '.pickle.dat'
            }[type_export]
            if not file_name.endswith(required_extension):
                file_name = file_name + required_extension
        full_path = os.path.join(self.file_path, file_name)

        if not check_file_exists(full_path):
            self.log.info(
                'No file found at {}, save to create.'.format(full_path))
            return full_path

        if self.force and check_file_exists(full_path):
            self.log.warning(
                'Found existing file at {}, '
                'removing file for overwriting.'.format(full_path))
            os.remove(full_path)
            return full_path

        self.log.info('Found existing file at {}, '
                      'not able to overwrite. '
                      'Creating new filename.'.format(full_path))
        filename, extension = get_filename_and_extension(full_path)
        # Try <name>_1.<ext>, <name>_2.<ext>, ... until one is free.
        counter = 1
        while True:
            candidate = os.path.join(
                basedir,
                filename + '_' + str(counter) + '.' + extension)
            if not check_file_exists(candidate):
                self.log.info('Able to create {}'.format(candidate))
                return candidate
            counter += 1
예제 #7
0
class CapiceImputing:
    """
    Class to dynamically load in all imputing files and identify the file
    suitable for the run's use case.
    """
    def __init__(self):
        self.manager = CapiceManager()
        self.vep_version = self.manager.vep_version
        self.grch_build = self.manager.grch_build
        self.log = Logger().logger
        self.log.info('Imputer started.')
        self.overrule = self.manager.overwrite_impute
        self.modules = []
        self.module = None
        self._load_modules()
        self._is_correct_datafile_present()
        self._check_if_imputer_is_applied()
        # State below is (re)filled on every impute() call.
        self.columns = []
        self.annotation_columns_present = []
        self.impute_values = {}
        self.pre_dtypes = {}
        self.dtypes = {}

    def _load_modules(self):
        """
        Method to dynamically load in all python files containing a class that
        contains the properties
            name and
            _json_name.
        If at the end of this function, the list of impute files is empty,
        will throw the module not found error.
        """
        self.log.info('Identifying imputing files.')
        directory = os.path.join(get_project_root_dir(), 'src', 'main',
                                 'python', 'resources', 'data_files',
                                 'imputing')
        usable_modules = load_modules(directory)
        if len(usable_modules) < 1:
            self._raise_no_module_found_error()
        loaded_modules = importer(usable_modules=usable_modules,
                                  path=directory)
        for module in loaded_modules:
            if "name" in dir(module) and "_json_name" in dir(module):
                self.modules.append(module)
        if len(self.modules) < 1:
            self._raise_no_module_found_error()
        self.log.info(
            'Identified {} files available for usage in imputing.'.format(
                len(self.modules)))

    def _raise_no_module_found_error(self):
        """
        Function to raise when no suitable impute files are found.
        Put into a function since 2 other functions within this module will use
        it.

        :raises FileNotFoundError: always.
        """
        error_message = 'No usable python files are found ' \
                        'within the imputing directory!'
        self.log.critical(error_message)
        raise FileNotFoundError(error_message)

    def _is_correct_datafile_present(self):
        """
        Function to check the VEP version and GRCh build
        (or --overwrite_impute_file)
        match the impute file.

        The overrule is searched for first over ALL loaded modules, so an
        explicitly requested impute file wins regardless of the order in
        which the modules were loaded. Only when no module carries the
        overrule name (or no overrule is set) the first module matching the
        VEP version and GRCh build is selected.
        """
        if self.overrule:
            for module in self.modules:
                if module.name == self.overrule:
                    self.log.info(
                        'Overrule successful for: {} , located at: {}'.format(
                            self.overrule, inspect.getfile(module.__class__)))
                    self.module = module
                    return
        for module in self.modules:
            module_vep_version = module.supported_vep_version
            module_grch_build = module.supported_grch_build
            if module_vep_version == self.vep_version and \
                    module_grch_build == self.grch_build:
                self.log.info('Impute data file successfully found: {} , '
                              'located at: {}'.format(
                                  module.name,
                                  inspect.getfile(module.__class__)))
                self.module = module
                return

    def _check_if_imputer_is_applied(self):
        """
        Verify that an impute module has been selected.

        :raises FileNotFoundError: when self.module is still None.
        """
        if self.module is None:
            if self.overrule:
                error_message = 'No imputing data file found for overrule: ' \
                                '{}'.format(self.overrule)
            else:
                error_message = 'No imputing data file found for ' \
                                'VEP version: {} and ' \
                                'GRCh build: {}'.format(self.vep_version,
                                                        self.grch_build
                                                        )
            self.log.critical(error_message)
            raise FileNotFoundError(error_message)

    def _load_values(self, dataset: pd.DataFrame):
        """
        Function to be called right when impute() is called,
        gets the input datafile features,
        imputes values from the impute file and
        saves the datafile features to the manager.
        """
        self.columns = self.module.annotation_features
        # Rebuilt from scratch so a second impute() call does not keep (or
        # duplicate) the columns found in an earlier dataset.
        self.annotation_columns_present = [
            col for col in self.columns if col in dataset.columns
        ]
        self.manager.annotation_features = self.columns
        self.impute_values = self.module.impute_values

    def impute(self, datafile: pd.DataFrame):
        """
        Function to call the CapiceImputing to start imputing.

        Rows without Chr or Pos and rows in which every present annotation
        column is missing are removed; remaining gaps are filled with the
        impute values of the selected module.

        Note that the helper steps assign the index, the 'Pos' column and a
        temporary 'CAPICE_drop_out' column on the frame they receive.

        :param datafile: not imputed pandas DataFrame
        :return: pandas DataFrame
        """
        self._load_values(datafile)
        datafile = self._check_chrom_pos(datafile)
        self._get_nan_ratio_per_column(dataset=datafile)
        self._get_full_nan_row(dataset=datafile)
        # Remove the rows flagged by _get_full_nan_row(), then the helper
        # column itself. Reassignment instead of in-place modification, as
        # the filtered frame is derived from another frame.
        datafile = datafile[~datafile['CAPICE_drop_out']]
        datafile = datafile.drop(columns=['CAPICE_drop_out'])
        self._correct_dtypes(datafile=datafile)
        datafile = datafile.fillna(self.impute_values)
        # Two casts: first to the intermediate dtype (float for integer
        # impute values), then to the dtype of the impute value itself.
        datafile = datafile.astype(dtype=self.pre_dtypes, copy=False)
        datafile = datafile.astype(dtype=self.dtypes, copy=False)
        datafile = self._add_missing_columns(datafile)
        self.log.info('Imputing successfully performed.')
        return datafile

    @deprecated
    def _add_missing_columns(self, datafile: pd.DataFrame):
        """
        Add every impute key that is not a column yet, filled with its
        impute value.
        :param datafile: pandas DataFrame
        :return: pandas DataFrame
        """
        for key, value in self.impute_values.items():
            if key not in datafile.columns:
                datafile[key] = value
        return datafile

    def _correct_dtypes(self, datafile: pd.DataFrame):
        """
        Function to correct the dtypes that originate from the lookup annotator
        according to the dtypes specified within the data json.
        """
        # Start clean: a key left over from an earlier dataset that is not a
        # column of this one would make the astype() calls in impute() fail.
        self.pre_dtypes = {}
        self.dtypes = {}
        for key, item in self.impute_values.items():
            if key in datafile.columns:
                self._save_dtypes(key=key, item=item)

    def _save_dtypes(self, key, item):
        """
        Register the dtypes for one column: the final dtype is the type of
        the impute value, the intermediate dtype is float for integer values
        and the type of the impute value otherwise.
        """
        if isinstance(item, int):
            self.pre_dtypes[key] = float
        else:
            self.pre_dtypes[key] = type(item)
        self.dtypes[key] = type(item)

    def _check_chrom_pos(self, dataset: pd.DataFrame):
        """
        Function to check if all values of the columns Chr and Pos are present.
        :param dataset: not imputed pandas DataFrame
        :return: pandas DataFrame
            containing no NaN or gaps for Chr and Pos columns.
        """
        chrom_is_float = False
        if dataset['Chr'].isnull().values.any():
            # Remembered before the rows are dropped, so Chr can be turned
            # back into integer-like strings afterwards.
            if dataset.dtypes['Chr'] == np.float64:
                chrom_is_float = True
            n_delete = dataset['Chr'].isnull().values.sum()
            self.log.warning('Detected NaN in the Chromosome column! '
                             'Deleting {} row(s).'.format(n_delete))
            dataset = dataset[~dataset['Chr'].isnull()]
        if dataset['Pos'].isnull().values.any():
            n_delete = dataset['Pos'].isnull().values.sum()
            self.log.warning('Detected NaN is the Position column! '
                             'Deleting {} row(s).'.format(n_delete))
            dataset = dataset[~dataset['Pos'].isnull()]
        dataset.index = range(0, dataset.shape[0])
        if chrom_is_float:
            dataset['Chr'] = dataset['Chr'].astype(int)
            dataset['Chr'] = dataset['Chr'].astype(str)
        dataset['Pos'] = dataset['Pos'].astype(int)
        return dataset

    def _get_nan_ratio_per_column(self, dataset: pd.DataFrame):
        """
        Generic function to get the percentage of gaps per column
        :param dataset: not imputed pandas DataFrame
        """
        for column in dataset.columns:
            series = dataset[column]
            self._calculate_percentage_nan(column=series)

    def _calculate_percentage_nan(self, column):
        """
        Log, at debug level, the percentage of missing values of a column
        when it has at least one missing value.
        :param column: pandas Series
        """
        n_nan = column.isnull().sum()
        if n_nan > 0:
            n_samples = column.size
            p_nan = round((n_nan / n_samples) * 100, ndigits=2)
            self.log.debug(
                'NaN detected in column {}, percentage: {}%.'.format(
                    column.name, p_nan))

    def _get_full_nan_row(self, dataset: pd.DataFrame):
        """
        Function to get the samples of which absolutely no prediction is
        possible due to all non chr pos ref alt rows being gaps.

        Adds the boolean column 'CAPICE_drop_out' to the dataset; impute()
        uses it to remove the flagged rows.
        :param dataset: not imputed pandas DataFrame
        """
        n_samples = dataset.shape[0]
        # NOTE(review): the index is set to start at 1 here while the warning
        # below states the index starts at 0 - confirm which is intended.
        dataset.index = range(1, n_samples + 1)
        dataset['CAPICE_drop_out'] = dataset[
            self.annotation_columns_present].isnull().values.all(axis=1)
        samples_dropped_out = dataset[dataset['CAPICE_drop_out']]
        if samples_dropped_out.shape[0] > 0:
            self.log.warning(
                'The following samples are filtered out due to missing values: '
                '(indexing is python based, '
                'so the index starts at 0). \n {}'.format(samples_dropped_out[[
                    'Chr', 'Pos', 'Ref', 'Alt', 'FeatureID'
                ]]))
        else:
            self.log.info(
                'No samples are filtered out due to too many NaN values.')
예제 #8
0
class TemplateSetup(metaclass=ABCMeta):
    """
    Abstract class to act as template for new models that might be
    added in future patches of CAPICE.
    Contains the necessary steps for preprocessing as well.

    Subclasses have to implement _get_model_loc().
    """

    def __init__(self, name, usable, vep_version, grch_build):
        # The logger and property checker must exist before the validated
        # properties below are assigned, since their setters use them.
        self.log = Logger().logger
        self.property_checker = PropertyCheckerLogger()
        self.name = name
        self.usable = usable
        self.supported_vep_version = vep_version
        self.supported_grch_build = grch_build
        self.annotation_features = CapiceManager().annotation_features
        self.train = False
        self.model = None
        # Names of the categorical annotation features found in the dataset.
        self.annotation_object = []
        self.model_features = None

    @property
    def name(self):
        """
        Property getter name, to get the init defined name of the model module.

        :return: str
        """
        return self._name

    @name.setter
    def name(self, value='Template'):
        """
        Property setter name, to set a name for a model module.
        The value is validated by the property checker against type str.

        :param value: str
        """
        self.property_checker.check_property(value=value, expected_type=str)
        self._name = value

    @property
    def usable(self):
        """
        Property getter usable, to get the boolean value of a model module
        whenever it can be used for preprocessing and prediction.

        :return: bool
        """
        return self._usable

    @usable.setter
    def usable(self, value=False):
        """
        Property setter usable, to set the boolean value of a model module
        whenever it should be used for preprocessing and prediction.
        The value is validated by the property checker against type bool.

        :param value: bool
        """
        self.property_checker.check_property(value=value, expected_type=bool)
        self._usable = value

    @property
    def supported_vep_version(self):
        """
        Property getter supported_vep_version,
        to get the float VEP version value of a model/prediction file
        that is supported within the module.

        :return: float or None
        """
        return self._vep_version

    @supported_vep_version.setter
    def supported_vep_version(self, value):
        """
        Property setter supported_vep_version,
        to set the float VEP version value of a model/prediction file
        that is supported within the module.
        The value is validated by the property checker against type float,
        with None being allowed.

        :param value: float or None
        """
        self.property_checker.check_property(
            value=value,
            expected_type=float,
            include_none=True
        )
        self._vep_version = value

    @property
    def supported_grch_build(self):
        """
        Property getter supported_grch_build,
        to get the integer grch_build value that defines what genome build
        is supported by the model/prediction module.

        :return: integer or None
        """
        return self._grch_build

    @supported_grch_build.setter
    def supported_grch_build(self, value):
        """
        Property setter supported_grch_build,
        to set the integer value grch_build that defines what genome build
        is supported by the model/prediction module.
        The value is validated by the property checker against type int,
        with None being allowed.

        :param value: integer or None
        """
        self.property_checker.check_property(
            value=value,
            expected_type=int,
            include_none=True
        )
        self._grch_build = value

    def preprocess(self, dataset: pd.DataFrame, is_train: bool):
        """
        Callable function to start the preprocessing of a dataset.
        :param dataset: imputed pandas DataFrame
        :param is_train: boolean
        :return: processed pandas DataFrame
        """
        self.train = is_train
        self._load_model()
        if not self.train:
            self._load_model_features()
        dataset = self._duplicate_chr_pos_ref_alt(dataset=dataset)
        self._get_categorical_columns(dataset=dataset)
        processed_dataset = self._process_objects(dataset=dataset)
        if not self.train:
            processed_dataset = self._check_all_model_features_present(
                processed_dataset
            )
        self.log.info('Successfully preprocessed data.')
        return processed_dataset

    @deprecated
    def _check_all_model_features_present(self, dataset: pd.DataFrame):
        """
        Add every model feature that is missing from the dataset as a column
        filled with 0.
        :param dataset: processed pandas DataFrame
        :return: pandas DataFrame containing all model features as columns
        """
        for feature in self.model_features:
            if feature not in dataset.columns:
                dataset[feature] = 0
        return dataset

    def _get_categorical_columns(self, dataset: pd.DataFrame):
        """
        Function to get the categorical columns that are within the supplied
        annotation features of the imputing file.
        Found columns are collected in self.annotation_object.
        :param dataset: pandas DataFrame
        """
        for feature in dataset.select_dtypes(include=["O"]).columns:
            # Skip features that were already registered, so that calling
            # preprocess() more than once does not pile up duplicates.
            if (feature in self.annotation_features
                    and feature not in self.annotation_object):
                self.annotation_object.append(feature)
        self.log.debug(
            'Converting the categorical columns: {}.'.format(
                ", ".join(self.annotation_object)
            )
        )

    @staticmethod
    def _duplicate_chr_pos_ref_alt(dataset):
        """
        Function to create the chr_pos_ref_alt column so that it doesn't get
        lost in preprocessing.
        :param dataset: unprocessed pandas DataFrame
        :return: unprocessed pandas DataFrame
            containing column 'chr_pos_ref_alt'
        """
        dataset['chr_pos_ref_alt'] = dataset[
            ['Chr', 'Pos', 'Ref', 'Alt']].astype(str).agg('_'.join, axis=1)
        return dataset

    @property
    def model_features(self):
        """
        Property getter model_features, the features the model works with.

        :return: the value last assigned to model_features (None at init)
        """
        return self._model_features

    @model_features.setter
    def model_features(self, value):
        """
        Property setter model_features. The value is stored unvalidated.

        :param value: the features the model works with
        """
        self._model_features = value

    def _process_objects(self, dataset: pd.DataFrame):
        """
        (If train) will create a dictionary telling the processor how many
        categories are within a certain column.
        If not train: Will look up each annotation feature from the impute file
        within the model features (a model feature belongs to an annotation
        feature when it starts with that feature followed by '_').
        This dictionary is then passed to the actual processor.
        :param dataset: unprocessed pandas DataFrame
        :return: processed pandas DataFrame
        """
        annotation_feats_dict = {}
        if self.train:
            self.log.info(
                'Training protocol, '
                'creating new categorical conversion identifiers.'
            )
            # 'Ref', 'Alt' and 'Domain' are always converted, followed by
            # every detected categorical column; each keeps 5 levels.
            hardcoded_features = ['Ref', 'Alt', 'Domain']
            for feature in hardcoded_features + self.annotation_object:
                annotation_feats_dict.setdefault(feature, 5)
        else:
            for feature in self.annotation_object:
                annotation_feats_dict = self._process_objects_no_train(
                    feature=feature,
                    annotation_features_dict=annotation_feats_dict
                )
        processed_data = self._process_categorical_vars(
            dataset=dataset,
            annotation_feats_dict=annotation_feats_dict
        )
        return processed_data

    def _process_objects_no_train(self, feature: str,
                                  annotation_features_dict: dict):
        """
        Collect the levels the model knows for a single categorical feature.

        A model feature named '<feature>_<level>' contributes '<level>'.
        Model features that only share a prefix with the feature (no '_'
        separator) or that equal the feature name are ignored.
        :param feature: name of the categorical annotation feature
        :param annotation_features_dict: dictionary of feature name to list of
            levels, updated in place
        :return: the updated annotation_features_dict
        """
        prefix = ''.join([feature, '_'])
        for model_feature in self.model_features:
            if model_feature.startswith(prefix):
                # Slice instead of split so that a level which itself
                # contains the prefix is kept intact.
                extension = model_feature[len(prefix):]
                annotation_features_dict.setdefault(
                    feature, []).append(extension)
        return annotation_features_dict

    def _load_model_features(self):
        """
        Function to access the protected member of the XGBoost _Booster class
        to get the features that the model is trained on.
        The result is stored in self.model_features.
        """
        self.log.info('Using features saved within the model.')
        self.model_features = self.model._Booster.feature_names

    def _process_categorical_vars(self,
                                  dataset: pd.DataFrame,
                                  annotation_feats_dict: dict):
        """
        Processor of categorical columns. Every value that is not one of the
        kept levels is replaced by 'other', after which the columns are
        converted to dummy columns.
        :param dataset: unprocessed pandas DataFrame
        :param annotation_feats_dict:
            dictionary that is to contain the levels for each categorical
            feature (when training: the number of levels to keep)
        :return: processed pandas DataFrame
        """
        for annotation_feature, levels in annotation_feats_dict.items():
            if self.train:
                # When training, 'levels' is the number of levels to keep.
                feature_names = self._get_top10_or_less_cats(
                    column=dataset[annotation_feature],
                    return_num=levels
                )
            else:
                feature_names = levels
                self.log.debug('For feature: {} loaded {} levels: {}'.format(
                    annotation_feature,
                    len(feature_names),
                    feature_names
                ))
            dataset[annotation_feature] = np.where(
                dataset[annotation_feature].isin(feature_names),
                dataset[annotation_feature],
                'other'
            )
        dataset = pd.get_dummies(
            dataset,
            columns=list(annotation_feats_dict.keys())
        )

        # Checking if all annotation features are processed.
        # If not, add a column containing all "false" (0)
        for annotation_feature in annotation_feats_dict:
            dataset = self._check_all_annotation_features_processed(
                current_annotation_feature=annotation_feature,
                dataset=dataset,
                annotation_features_dict=annotation_feats_dict
            )

        return dataset

    def _check_all_annotation_features_processed(self,
                                                 current_annotation_feature,
                                                 dataset: pd.DataFrame,
                                                 annotation_features_dict):
        """
        When not training: make sure a dummy column exists for every known
        level of the given annotation feature, adding a column of 0 (with a
        warning) for each level that is absent. Does nothing when training.
        :param current_annotation_feature: name of the categorical feature
        :param dataset: pandas DataFrame after dummy conversion
        :param annotation_features_dict: dictionary of feature name to levels
        :return: pandas DataFrame
        """
        if self.train:
            return dataset
        for processed_feature in annotation_features_dict[
                current_annotation_feature]:
            col_be_present = "_".join(
                [current_annotation_feature, processed_feature])
            if col_be_present not in dataset.columns:
                self.log.warning(
                    'Of annotation feature {},'
                    ' detected {} not present in columns.'.format(
                        current_annotation_feature, processed_feature))
                dataset[col_be_present] = 0
        return dataset

    def _get_top10_or_less_cats(self, column: pd.Series, return_num: int):
        """
        Function for when a training file is preprocessed to get the top
        return_num quantity values within a categorical column.
        Some converting is done for the logger to be able to print them.
        :param column: pandas Series
        :param return_num: integer
        :return: numpy array of the most frequent values
        """
        value_counts = column.value_counts().index[:return_num].values
        # Values are stringified for logging only; the returned array keeps
        # the original values.
        printable_value_counts = [str(value) for value in value_counts]
        self.log.info('For feature: {} saved the following values: {}'.format(
            column.name,
            ', '.join(printable_value_counts)
        ))
        return value_counts

    # Model stuff

    def predict(self, data: pd.DataFrame):
        """
        Function to load the model and predict the CAPICE scores.
        Can be overwritten in case of legacy support.
        The scores are added to the data as column 'probabilities'.
        :param data: preprocessed pandas DataFrame
        :return: pandas DataFrame
        """
        self.log.info('Predicting for {} samples.'.format(data.shape[0]))
        self._load_model()
        self._load_model_features()
        data['probabilities'] = self._predict(
            self._create_input_matrix(dataset=data))
        self.log.info('Predicting successful.')
        return data

    def _predict(self, predict_data):
        """
        Further down defined prediction function, which is different for
        XGBoost 0.72.1 and current XGBoost version.
        :param predict_data: preprocessed pandas DataFrame
        :return: numpy array
        """
        # Column 1 of predict_proba is used as the score.
        return self.model.predict_proba(predict_data)[:, 1]

    def _create_input_matrix(self, dataset: pd.DataFrame):
        """
        Also a template function, which can be overwritten to be compatible
        with first generation CAPICE.
        :param dataset: pandas DataFrame
        :return: XGBoost workable data
        """
        return dataset[self.model_features]

    def _load_model(self):
        """
        Template method to load in the model once supported values are correct.
        When training no model is loaded and self.model is set to None.
        """
        model = None
        if not self.train:
            model_loc = self._get_model_loc()
            # NOTE(security): pickle.load can execute arbitrary code, so
            # the model file must come from a trusted location.
            with open(model_loc, 'rb') as model_file:
                model = pickle.load(model_file)
            self.log.info('Successfully loaded model at: {}'.format(
                model_loc))
        self.model = model

    @staticmethod
    @abstractmethod
    def _get_model_loc():
        """
        Template to mark the directory where the model is located.
        Use of os.path.join is required.
        You may use the get_project_root_dir() from utilities if the model is
        within this project directory.
        :return: path-like or None if no model has been created yet.
        """
        pass
예제 #9
0
class InputVersionChecker:
    """
    Class to check the given VEP config argument and file VEP version match.

    Class is self running.
    """
    def __init__(self, config_vep_version: float, file_vep_version: float,
                 config_grch_build: int, file_grch_build: int):
        """
        Class to check the given VEP config argument and
        the header of the VEP file match.
        A value of False marks an argument as not available.
        :param config_vep_version: float,
            config argument for the used VEP version
        :param file_vep_version: float,
            VEP version belonging to the input file
        :param config_grch_build: int,
            config argument for the used GRCh build
        :param file_grch_build: int,
            GRCh build belonging to the input file
        """
        self.config_vep_version = config_vep_version
        self.file_vep_version = file_vep_version
        self.config_grch_build = config_grch_build
        self.file_grch_build = file_grch_build
        self.manager = CapiceManager()
        self.export_vep_version = None
        self.export_grch_build = None
        # Types ('VEP' / 'GRCh') for which both the config and the file
        # supplied a value, so that they can be compared.
        self.check_match = []
        self.unable_check = []
        self.check_overrule = False
        self.log = Logger().logger
        self._check_all_present()
        if self.check_overrule:
            self._check_overrule()
        self._check_version_match()
        self._set_global_vep_version()
        self._set_global_grch_build()

    def _set_global_vep_version(self):
        """
        Function to provide the CapiceManager with the VEP version to be used
        globally later on in CAPICE.
        """
        self.manager.vep_version = self.export_vep_version
        self.log.info('VEP version set to: {}'.format(self.export_vep_version))

    def _set_global_grch_build(self):
        """
        Function to provide the CapiceManager with the Genome Build version to
        be used globally later on in CAPICE.
        """
        self.manager.grch_build = self.export_grch_build
        self.log.info('GRCh build set to: {}'.format(self.export_grch_build))

    def _check_overrule(self):
        """
        Function called when either the
            VEP version or
            GRCh build
        can not be determined.
        Raises InputError when neither the impute overwrite nor the model
        overwrite is set.
        """
        # NOTE(review): only the case where BOTH overwrites are False is
        # rejected; confirm that a single overwrite is really sufficient.
        if self.manager.overwrite_impute is False and \
                self.manager.overwrite_model is False:
            error_message = """
            VEP version or GRCh build not specified and both overwrites are not 
            set! Not able to find a correct impute or processing file!
            """.strip()
            self.log.critical(error_message)
            raise InputError(error_message)

    def _check_all_present(self):
        """
        Function to check if both the VEP version and GRCh build are present
        within either the config arguments or within the file.
        """
        dict_of_all_present = {
            'VEP': [self.file_vep_version, self.config_vep_version],
            'GRCh': [self.file_grch_build, self.config_grch_build]
        }
        for type_of_check, to_check in dict_of_all_present.items():
            self._check_individual_argument(to_check=to_check,
                                            type_of_check=type_of_check)

    def _check_individual_argument(self, to_check, type_of_check):
        """
        Function belonging to _check_all_present to check if a VEP version and
        GRCh build can be set globally.
        If all values are present the type is queued for a match check,
        otherwise the value that is present (if any) is exported.
        :param to_check: list
        :param type_of_check: string
        """
        if False in to_check:
            if to_check.count(False) == len(to_check):
                self._turn_on_check_overrule(type_of_check=type_of_check)
            for argument in to_check:
                self._apply_export_version(argument=argument,
                                           type_of_check=type_of_check)
        else:
            self.check_match.append(type_of_check)

    def _turn_on_check_overrule(self, type_of_check):
        """
        Function to turn on the overrule check if no VEP or GRCh arguments are
        passed.
        :param type_of_check: string, used in the warning only
        """
        self.log.warning(
            'Unable to obtain {} version from file or config file!'.format(
                type_of_check))
        self.check_overrule = True

    def _apply_export_version(self, argument, type_of_check):
        """
        Function to set the global VEP version or GRCh build.
        Arguments that are False are ignored.
        :param argument: int or float
        :param type_of_check: string, 'VEP' sets the VEP version, anything
            else sets the GRCh build
        """
        if argument is not False:
            if type_of_check == 'VEP':
                self.export_vep_version = argument
            else:
                self.export_grch_build = argument

    def _check_version_match(self):
        """
        Function to check if the Config Argument and the file header specified
        versions match, for every type that had both values present.
        On a mismatch a warning is issued; the value belonging to the file
        is exported in both cases.
        """
        for check_match in self.check_match:
            if check_match == 'VEP':
                self._check_vep_match(check_match=check_match)
            elif check_match == 'GRCh':
                self._check_grch_match(check_match=check_match)

    def _check_vep_match(self, check_match):
        """
        Compare the config and file VEP version, report the outcome and
        export the file VEP version.
        :param check_match: string, type of the check ('VEP')
        """
        if self.file_vep_version != self.config_vep_version:
            # The config value is the CLA one, the file value the file one
            # (consistent with _check_grch_match).
            self._raise_version_mismatch(type_of_mismatch=check_match,
                                         version_cla=self.config_vep_version,
                                         version_file=self.file_vep_version)
        else:
            self._raise_version_mismatch(type_of_mismatch=check_match,
                                         match_successful=True)
        self.export_vep_version = self.file_vep_version

    def _check_grch_match(self, check_match):
        """
        Compare the config and file GRCh build, report the outcome and
        export the file GRCh build.
        :param check_match: string, type of the check ('GRCh')
        """
        if self.config_grch_build != self.file_grch_build:
            self._raise_version_mismatch(type_of_mismatch=check_match,
                                         version_cla=self.config_grch_build,
                                         version_file=self.file_grch_build)
        else:
            self._raise_version_mismatch(type_of_mismatch=check_match,
                                         match_successful=True)
        self.export_grch_build = self.file_grch_build

    def _raise_version_mismatch(self,
                                type_of_mismatch,
                                version_cla=None,
                                version_file=None,
                                match_successful=False):
        """
        Report the outcome of a version comparison. Despite its name no
        exception is raised: a successful match is logged at info level,
        a mismatch is logged as warning and emitted through warnings.warn.
        :param type_of_mismatch: string, type of the check
        :param version_cla: version supplied through the config argument
        :param version_file: version belonging to the file
        :param match_successful: boolean, whether the versions matched
        """
        if match_successful:
            self.log.info(
                'Successfully matched CLA and file versions for {}.'.format(
                    type_of_mismatch))
        else:
            warning_message = """
            Warning matching {} versions. 
            CLA version supplied: 
            {} does not match file version: {} !""".format(
                type_of_mismatch, version_cla, version_file).strip()
            warnings.warn(warning_message)
            self.log.warning(warning_message)
예제 #10
0
class InputHeaderParser:
    """
    Autonomous class to parse just the header of the input file to get the
    amount of comment lines that pandas should skip when reading.
    While doing so the VEP version and GRCh build are read from the
    '##VEP="' header line, if that line is present.
    """
    def __init__(self, is_gzipped: bool, input_file_loc: str):
        """
        :param is_gzipped: boolean, whether the input file is gzipped
        :param input_file_loc: string, location of the input file
        """
        self.log = Logger().logger
        self.manager = CapiceManager()
        self.log.info('Starting to parse input file header.')
        self.is_gzipped = is_gzipped
        self.input_file_loc = input_file_loc
        # First '##' line of the file, empty if there is none.
        self.header = ''
        # False marks "not found in the header".
        self.header_build = False
        self.header_version = False
        self.header_present = False
        self.file_type = None
        self.skip_rows = 0
        self._parse_header()
        if self.header_present:
            self.log.info(
                "Input file header successfully identified: {}".format(
                    self.header.strip()))
            self._get_file_type()
        else:
            self.log.warning(
                'Unable to parse input file header, header not located. '
                'Does the header start with "##"?')

    def _parse_header(self):
        """
        Function to read the leading lines of the input file that start with
        '##'. Reading stops at the first line that does not.
        """
        opener = gzip.open if self.is_gzipped else open
        # The context manager closes the file even if parsing a line raises.
        with opener(self.input_file_loc, mode='rt') as file_handle:
            for line in file_handle:
                if not line.startswith('##'):
                    break
                self._check_vep_version(line=line)
                self._add_skip_row(line=line)

    def _add_skip_row(self, line):
        """
        Count a header line; the first header line is stored as the header.
        :param line: string, a line starting with '##'
        """
        if self.skip_rows == 0:
            self.header_present = True
            self.header = line
        self.skip_rows += 1

    def _check_vep_version(self, line):
        """
        Parse the VEP version and GRCh build if the line is the VEP line.
        :param line: string, a line starting with '##'
        """
        if line.startswith('##VEP="'):
            self._parse_vep_version(line)

    def _parse_vep_version(self, line):
        """
        Extract the VEP version and the GRCh build from the space separated
        annotations of the VEP header line.
        :param line: string, the line starting with '##VEP="'
        """
        for annotation in line.split(' '):
            if annotation.startswith('##VEP'):
                # Takes the text between the 'v' and the closing quote.
                self.header_version = float(
                    annotation.split('v')[1].split('"')[0])
                self.log.info('Header VEP version identified: {}'.format(
                    self.header_version))
            elif annotation.startswith('assembly'):
                # Takes the text after the 'h' up to an optional patch
                # suffix ('.p13'). Stripping whitespace and the quote keeps
                # this working when there is no patch suffix or when the
                # annotation is the last one on the line.
                build = annotation.split('h')[1].split('.')[0]
                self.header_build = int(build.strip().strip('"'))
                self.log.info('Header GRCh build identified: {}'.format(
                    self.header_build))

    def _get_file_type(self):
        """
        Warn (log and warnings.warn) if the header does not start with the
        line the VEP VCF to CAPICE tsv converter writes.
        """
        if not self.header.startswith('## VEP VCF to CAPICE tsv converter'):
            warning_message = 'Unable to recognize origin of input file.'
            self.log.warning(warning_message)
            warnings.warn(warning_message)

    def get_skip_rows(self):
        """
        Function to return the integer value of how many rows pandas.read_csv()
        should skip to reach the data.

        :return: int
        """
        return self.skip_rows

    def get_vep_version(self):
        """
        Function to return the float value of the VEP version used to generate
        the input file.

        :return: float, or False if no VEP version was found in the header
        """
        return self.header_version

    def get_grch_build(self):
        """
        Function to return the integer value of the GRCh build used to
        generate the input file.

        :return: int, or False if no GRCh build was found in the header
        """
        return self.header_build