Пример #1
0
class Annotator:
    """
    Adds annotation columns to a dataset: first a reference sequence column
    ('Seq'), then whatever the ManualAnnotator adds in its process() step.
    """

    def __init__(self, dataset: pd.DataFrame):
        """
        :param dataset: pandas DataFrame to annotate. The columns 'Chr' and
            'Pos' are read when the reference sequence is looked up.
        """
        self.log = Logger().logger
        self.manager = CapiceManager()
        self.fasta_lookup = FastaLookupAnnotator()
        self.manual_annotater = ManualAnnotator()
        self.dataset = dataset

    def annotate(self):
        """
        Start the annotation process.
        :return: pandas dataframe
        """
        self.log.info('Starting manual annotation process.')
        self._add_sequence()
        self.dataset = self.manual_annotater.process(dataset=self.dataset)
        self.log.info('Annotation successful.')
        self.log.debug(
            'Final shape of the annotated data: {}'.format(self.dataset.shape)
        )
        return self.dataset

    def _add_sequence(self):
        """
        Add the column 'Seq', holding for every row the reference sequence
        requested from Pos - 75 up to Pos + 75 on chromosome Chr.

        The Fasta connection is closed afterwards, also when a lookup raises,
        so a failing row does not leave the file handle open.
        """
        self.log.debug('Annotation addition: sequence')
        try:
            self.dataset['Seq'] = self.dataset.apply(
                lambda row: self.fasta_lookup.get_reference_sequence(
                    chromosome=row['Chr'],
                    start=row['Pos'] - 75,
                    end=row['Pos'] + 75
                ), axis=1
            )
        finally:
            self.fasta_lookup.close_connection()
Пример #2
0
class FastaLookupAnnotator:
    """
    Thin wrapper around a pysam FastaFile that returns reference sequences
    for a chromosome and a start/end pair.
    """

    def __init__(self):
        self.log = Logger().logger
        self.manager = CapiceManager()
        self.fasta_loc = self.manager.reference_genome
        self.fasta = None
        self._load_fasta()

    def _load_fasta(self):
        """
        Open the Fasta file at self.fasta_loc and keep the handle in
        self.fasta.
        """
        self.log.info('Loading in Fasta file, this may take a moment.')
        self.fasta = pysam.FastaFile(self.fasta_loc)
        self.log.info('Succesfully loaded Fasta file at: {}'.format(
            self.fasta_loc))

    def get_reference_sequence(self, chromosome: str, start: int, end: int):
        """
        Function to obtain a sequence from the reference Fasta file.

        A negative start is clamped to 0 and the returned sequence is padded
        on the left with one 'N' per position that fell before 0.

        :param chromosome: string,
            chromosome to get the reference sequence from.
        :param start: Chromosomal position at what point the sequence
            should be obtained.
        :param end: Chromosomal position at what point the obtained sequence
            should end.

        :return: string, obtained reference sequence, or None when the
            lookup raised a KeyError.
        """
        self.log.debug('Obtaining reference sequence for: '
                       '[Chromosome: {}], [start: {}], [stop: {}]'.format(
                           chromosome, start, end))
        # Number of positions requested before the start of the contig.
        n_padding = 0
        if start < 0:
            n_padding = -start
            start = 0
        try:
            sequence = self.fasta.fetch(chromosome, start, end)
        except KeyError:
            self.log.warning(
                'Unable to obtain sequence for: '
                '[Chromosome: {}], [start: {}], [stop: {}],'
                'did you supply a reference with contigs 1-22 + x,y,mt?'.
                format(chromosome, start, end))
            return None
        return '{}{}'.format('N' * n_padding, sequence)

    def close_connection(self):
        """
        Function to tell pysam to close the connection to the Fasta file
        """
        # Compare against None explicitly: a plain truth test on the handle
        # goes through its __len__, so a handle reporting length 0 would never
        # be closed and a handle whose __len__ raises would make this fail.
        if self.fasta is not None:
            self.fasta.close()
Пример #3
0
class LoadFilePreProcessor:
    """
    Normalises the column names of a freshly loaded dataset: strips a leading
    '%' or '#' and renames a fixed set of columns (for example CHROM to Chr).
    """

    def __init__(self, dataset: pd.DataFrame):
        self.dataset = dataset
        self.log = Logger().logger

    def process(self):
        """
        Function to start the LoadFilePreProcessor to correct the input file of
        each column starting with % or # and the renaming of certain columns,
        like #CHROM to Chr. The 'Chr' column is cast to string afterwards.

        Returns
        -------
        dataset :   pandas.DataFrame
                    Processed dataset with corrected % sign and renamed columns.
        """
        self.log.debug('Starting correcting % sign.')
        self._correct_percentage_sign()
        self.log.debug('% sign corrected, starting renaming of columns.')
        self._col_renamer()
        self.dataset['Chr'] = self.dataset['Chr'].astype(str)
        self.log.info('LoadFilePreProcessor successful.')
        return self.dataset

    def _correct_percentage_sign(self):
        """
        Function to remove a single leading '%' or '#' from every column name.

        Only the first character is removed, so a name that contains the
        marker again further on (e.g. '%max%AF') keeps the rest intact.
        """
        self.dataset.columns = [
            column[1:] if column.startswith(('%', '#')) else column
            for column in self.dataset.columns
        ]

    def _col_renamer(self):
        """
        Function to rename "CHROM, POS, REF, ALT, SYMBOL_SOURCE, Feature,
        SYMBOL, INTRON and EXON" to
        "Chr, Pos, Ref, Alt, SourceID, FeatureID, GeneName, Intron and Exon".
        Columns that are not present are left alone.
        """
        self.dataset.rename(
            columns={
                'CHROM': 'Chr',
                'POS': 'Pos',
                'REF': 'Ref',
                'ALT': 'Alt',
                'SYMBOL_SOURCE': 'SourceID',
                'Feature': 'FeatureID',
                'SYMBOL': 'GeneName',
                'INTRON': 'Intron',
                'EXON': 'Exon',
            },
            inplace=True
        )
Пример #4
0
class CapiceImputing:
    """
    Class to dynamically load in all imputing files and identify the file
    suitable for the run's use case.
    """
    def __init__(self):
        self.manager = CapiceManager()
        self.vep_version = self.manager.vep_version
        self.grch_build = self.manager.grch_build
        self.log = Logger().logger
        self.log.info('Imputer started.')
        self.overrule = self.manager.overwrite_impute
        self.modules = []
        self.module = None
        self._load_modules()
        self._is_correct_datafile_present()
        self._check_if_imputer_is_applied()
        # Filled in by _load_values() / _correct_dtypes() on every impute().
        self.columns = []
        self.annotation_columns_present = []
        self.impute_values = {}
        self.pre_dtypes = {}
        self.dtypes = {}

    def _load_modules(self):
        """
        Method to dynamically load in all python files containing a class that
        contains the properties
            name and
            _json_name.
        If at the end of this function, the list of impute files is empty,
        will throw the module not found error.
        """
        self.log.info('Identifying imputing files.')
        directory = os.path.join(get_project_root_dir(), 'src', 'main',
                                 'python', 'resources', 'data_files',
                                 'imputing')
        usable_modules = load_modules(directory)
        if len(usable_modules) < 1:
            self._raise_no_module_found_error()
        loaded_modules = importer(usable_modules=usable_modules,
                                  path=directory)
        for module in loaded_modules:
            if "name" in dir(module) and "_json_name" in dir(module):
                self.modules.append(module)
        if not self.modules:
            self._raise_no_module_found_error()
        self.log.info(
            'Identified {} files available for usage in imputing.'.format(
                len(self.modules)))

    def _raise_no_module_found_error(self):
        """
        Function to raise when no suitable impute files are found.
        Put into a function since it is needed at more than one point while
        loading the modules.

        :raises FileNotFoundError: always.
        """
        error_message = 'No usable python files are found ' \
                        'within the imputing directory!'
        self.log.critical(error_message)
        raise FileNotFoundError(error_message)

    def _is_correct_datafile_present(self):
        """
        Function to check the VEP version and GRCh build
        (or --overwrite_impute_file)
        match the impute file.

        An overrule, when given, is matched against all modules first, so it
        cannot be pre-empted by a module that merely matches the VEP version
        and GRCh build and happens to be earlier in the list. Only when no
        module carries the overrule name is the version/build match used.
        """
        if self.overrule:
            for module in self.modules:
                if module.name == self.overrule:
                    self.log.info(
                        'Overrule successful for: {} , located at: {}'.format(
                            self.overrule,
                            inspect.getfile(module.__class__)))
                    self.module = module
                    return
            self.log.warning(
                'Overrule {} does not match any imputing file, trying to '
                'match on VEP version and GRCh build instead.'.format(
                    self.overrule))
        for module in self.modules:
            if module.supported_vep_version == self.vep_version and \
                    module.supported_grch_build == self.grch_build:
                self.log.info('Impute data file successfully found: {} , '
                              'located at: {}'.format(
                                  module.name,
                                  inspect.getfile(module.__class__)))
                self.module = module
                return

    def _check_if_imputer_is_applied(self):
        """
        Function to check that an imputing module has been selected.

        :raises FileNotFoundError: if self.module is still None.
        """
        if self.module is not None:
            return
        if self.overrule:
            error_message = 'No imputing data file found for overrule: ' \
                            '{}'.format(self.overrule)
        else:
            error_message = 'No imputing data file found for ' \
                            'VEP version: {} and ' \
                            'GRCh build: {}'.format(self.vep_version,
                                                    self.grch_build
                                                    )
        self.log.critical(error_message)
        raise FileNotFoundError(error_message)

    def _load_values(self, dataset: pd.DataFrame):
        """
        Function to be called right when impute() is called,
        gets the input datafile features,
        loads the impute values from the impute file and
        saves the datafile features to the manager.

        :param dataset: not imputed pandas DataFrame
        """
        self.columns = self.module.annotation_features
        # Rebuilt on every call so repeated impute() calls do not pile up
        # duplicate column names.
        self.annotation_columns_present = [
            col for col in self.columns if col in dataset.columns
        ]
        self.manager.annotation_features = self.columns
        self.impute_values = self.module.impute_values

    def impute(self, datafile: pd.DataFrame):
        """
        Function to call the CapiceImputing to start imputing.

        Rows without Chr or Pos and rows where every present annotation
        column is NaN are removed; remaining gaps are filled with the impute
        values of the selected module.

        :param datafile: not imputed pandas DataFrame
        :return: pandas DataFrame
        """
        self._load_values(datafile)
        datafile = self._check_chrom_pos(datafile)
        self._get_nan_ratio_per_column(dataset=datafile)
        # Adds the boolean column 'CAPICE_drop_out' to datafile in place.
        self._get_full_nan_row(dataset=datafile)
        # Non-inplace operations: the filtered frame is a new object, so
        # nothing is written into a slice of another frame.
        datafile = datafile.loc[~datafile['CAPICE_drop_out']].drop(
            columns=['CAPICE_drop_out'])
        self._correct_dtypes(datafile=datafile)
        datafile = datafile.fillna(self.impute_values)
        datafile = datafile.astype(dtype=self.pre_dtypes, copy=False)
        datafile = datafile.astype(dtype=self.dtypes, copy=False)
        datafile = self._add_missing_columns(datafile)
        self.log.info('Imputing successfully performed.')
        return datafile

    @deprecated
    def _add_missing_columns(self, datafile: pd.DataFrame):
        """
        Function to add every impute key that is not yet a column of datafile,
        filled with its impute value.

        :param datafile: pandas DataFrame
        :return: pandas DataFrame
        """
        for key, value in self.impute_values.items():
            if key not in datafile.columns:
                datafile[key] = value
        return datafile

    def _correct_dtypes(self, datafile: pd.DataFrame):
        """
        Function to correct the dtypes that originate from the lookup annotator
        according to the dtypes specified within the data json.

        The dtype mappings are rebuilt from scratch, so keys of an earlier
        impute() call that are not columns of this datafile do not end up in
        the astype() calls.
        """
        self.pre_dtypes = {}
        self.dtypes = {}
        for key, item in self.impute_values.items():
            if key in datafile.columns:
                self._save_dtypes(key=key, item=item)

    def _save_dtypes(self, key, item):
        """
        Function to store the intermediate and final dtype for a column.

        :param key: column name
        :param item: impute value of that column; its type is the final dtype.
            Integer values get float as intermediate dtype.
        """
        self.pre_dtypes[key] = float if isinstance(item, int) else type(item)
        self.dtypes[key] = type(item)

    def _check_chrom_pos(self, dataset: pd.DataFrame):
        """
        Function to check if all values of the columns Chr and Pos are present.
        :param dataset: not imputed pandas DataFrame
        :return: pandas DataFrame
            containing no NaN or gaps for Chr and Pos columns, with a fresh
            0..n-1 index. The input frame itself is not modified.
        """
        chrom_is_float = False
        chrom_missing = dataset['Chr'].isnull()
        if chrom_missing.values.any():
            chrom_is_float = dataset['Chr'].dtype == np.float64
            self.log.warning('Detected NaN in the Chromosome column! '
                             'Deleting {} row(s).'.format(
                                 chrom_missing.values.sum()))
            dataset = dataset[~chrom_missing]
        pos_missing = dataset['Pos'].isnull()
        if pos_missing.values.any():
            self.log.warning('Detected NaN is the Position column! '
                             'Deleting {} row(s).'.format(
                                 pos_missing.values.sum()))
            dataset = dataset[~pos_missing]
        # reset_index returns a new frame, so the column assignments below
        # never write into a slice of the caller's frame.
        dataset = dataset.reset_index(drop=True)
        if chrom_is_float:
            # Via int, so that 1.0 becomes '1' rather than '1.0'.
            dataset['Chr'] = dataset['Chr'].astype(int).astype(str)
        dataset['Pos'] = dataset['Pos'].astype(int)
        return dataset

    def _get_nan_ratio_per_column(self, dataset: pd.DataFrame):
        """
        Generic function to get the percentage of gaps per column
        :param dataset: not imputed pandas DataFrame
        """
        for column in dataset.columns:
            self._calculate_percentage_nan(column=dataset[column])

    def _calculate_percentage_nan(self, column):
        """
        Function to log, at debug level, the percentage of NaN in a column.
        Nothing is logged for a column without NaN.

        :param column: pandas Series
        """
        n_nan = column.isnull().sum()
        if n_nan > 0:
            p_nan = round((n_nan / column.size) * 100, ndigits=2)
            self.log.debug(
                'NaN detected in column {}, percentage: {}%.'.format(
                    column.name, p_nan))

    def _get_full_nan_row(self, dataset: pd.DataFrame):
        """
        Function to get the samples of which absolutely no prediction is
        possible due to all non chr pos ref alt rows being gaps.

        Modifies dataset in place: the index is set to 1..n and the boolean
        column 'CAPICE_drop_out' is added, which is True for rows in which
        every present annotation column is NaN.

        :param dataset: not imputed pandas DataFrame
        """
        n_samples = dataset.shape[0]
        # NOTE(review): the index set here starts at 1, while the warning
        # below tells the user it starts at 0 - confirm which is intended.
        dataset.index = range(1, n_samples + 1)
        dataset['CAPICE_drop_out'] = dataset[
            self.annotation_columns_present].isnull().values.all(axis=1)
        samples_dropped_out = dataset[dataset['CAPICE_drop_out']]
        if samples_dropped_out.shape[0] > 0:
            self.log.warning(
                'The following samples are filtered out due to missing values: '
                '(indexing is python based, '
                'so the index starts at 0). \n {}'.format(samples_dropped_out[[
                    'Chr', 'Pos', 'Ref', 'Alt', 'FeatureID'
                ]]))
        else:
            self.log.info(
                'No samples are filtered out due to too many NaN values.')
Пример #5
0
class TemplateSetup(metaclass=ABCMeta):
    """
    Abstract class to act as template for new models that might be
    added in future patches of CAPICE.
    Contains the necessary steps for preprocessing as well.
    """

    def __init__(self, name, usable, vep_version, grch_build):
        """
        :param name: str, name of the model module.
        :param usable: bool, whether the model module may be used.
        :param vep_version: float or None, supported VEP version.
        :param grch_build: int or None, supported GRCh build.
        """
        self.log = Logger().logger
        # Has to exist before the validated properties below are assigned,
        # since every one of their setters calls it.
        self.property_checker = PropertyCheckerLogger()
        self.name = name
        self.usable = usable
        self.supported_vep_version = vep_version
        self.supported_grch_build = grch_build
        # NOTE(review): read once at construction; a later change of the
        # manager's annotation_features is not picked up - confirm that
        # instances are only created after that value has been set.
        self.annotation_features = CapiceManager().annotation_features
        self.train = False
        self.model = None
        self.annotation_object = []
        self.model_features = None

    @property
    def name(self):
        """
        Property getter name, to get the init defined name of the model module.

        :return: str
        """
        return self._name

    @name.setter
    def name(self, value='Template'):
        """
        Property setter name, to set a name for a model module.
        Raises TypeError if not supplied with a string.

        :param value: str
        """
        self.property_checker.check_property(value=value, expected_type=str)
        self._name = value

    @property
    def usable(self):
        """
        Property getter usable, to get the boolean value of a model module
        whenever it can be used for preprocessing and prediction.

        :return: bool
        """
        return self._usable

    @usable.setter
    def usable(self, value=False):
        """
        Property setter usable, to set the boolean value of a model module
        whenever it should be used for preprocessing and prediction.
        Raises TypeError if not supplied with a boolean.

        :param value: bool
        """
        self.property_checker.check_property(value=value, expected_type=bool)
        self._usable = value

    @property
    def supported_vep_version(self):
        """
        Property getter supported_vep_version,
        to get the float VEP version value of a model/prediction file
        that is supported within the module.

        :return: float or None
        """
        return self._vep_version

    @supported_vep_version.setter
    def supported_vep_version(self, value):
        """
        Property setter supported_vep_version,
        to set the float VEP version value of a model/prediction file
        that is supported within the module.
        Raises TypeError if not supplied with a float or None.

        :param value: float or None
        """
        self.property_checker.check_property(
            value=value,
            expected_type=float,
            include_none=True
        )
        self._vep_version = value

    @property
    def supported_grch_build(self):
        """
        Property getter supported_grch_build,
        to get the integer grch_build value that defines what genome build
        is supported by the model/prediction module.

        :return: integer or None
        """
        return self._grch_build

    @supported_grch_build.setter
    def supported_grch_build(self, value):
        """
        Property setter supported_grch_build,
        to set the integer value grch_build that defines what genome build
        is supported by the model/prediction module.
        Raises TypeError if not supplied with an integer or None.

        :param value: integer or None
        """
        self.property_checker.check_property(
            value=value,
            expected_type=int,
            include_none=True
        )
        self._grch_build = value

    @property
    def model_features(self):
        """
        Property getter model_features, the feature names the model uses
        (None until they are loaded from the model).
        """
        return self._model_features

    @model_features.setter
    def model_features(self, value):
        """
        Property setter model_features, stored without validation.
        """
        self._model_features = value

    def preprocess(self, dataset: pd.DataFrame, is_train: bool):
        """
        Callable function to start the preprocessing of a dataset.
        :param dataset: imputed pandas DataFrame
        :param is_train: boolean
        :return: processed pandas DataFrame
        """
        self.train = is_train
        self._load_model()
        if not self.train:
            self._load_model_features()
        dataset = self._duplicate_chr_pos_ref_alt(dataset=dataset)
        self._get_categorical_columns(dataset=dataset)
        processed_dataset = self._process_objects(dataset=dataset)
        if not self.train:
            processed_dataset = self._check_all_model_features_present(
                processed_dataset
            )
        self.log.info('Successfully preprocessed data.')
        return processed_dataset

    @deprecated
    def _check_all_model_features_present(self, dataset: pd.DataFrame):
        """
        Function to add every model feature that is not a column of the
        dataset as a column filled with 0.
        :param dataset: processed pandas DataFrame
        :return: processed pandas DataFrame
        """
        for feature in self.model_features:
            if feature not in dataset.columns:
                dataset[feature] = 0
        return dataset

    def _get_categorical_columns(self, dataset: pd.DataFrame):
        """
        Function to get the categorical columns that are within the supplied
        annotation features of the imputing file.

        The list is rebuilt on every call, so calling preprocess() more than
        once on the same instance does not collect each column several times.
        :param dataset: pandas DataFrame
        """
        self.annotation_object = [
            feature
            for feature in dataset.select_dtypes(include=["O"]).columns
            if feature in self.annotation_features
        ]
        self.log.debug(
            'Converting the categorical columns: {}.'.format(
                ", ".join(self.annotation_object)
            )
        )

    @staticmethod
    def _duplicate_chr_pos_ref_alt(dataset):
        """
        Function to create the chr_pos_ref_alt column so that it doesn't get
        lost in preprocessing.
        :param dataset: unprocessed pandas DataFrame
        :return: unprocessed pandas DataFrame
            containing column 'chr_pos_ref_alt'
        """
        dataset['chr_pos_ref_alt'] = dataset[
            ['Chr', 'Pos', 'Ref', 'Alt']].astype(str).agg('_'.join, axis=1)
        return dataset

    def _process_objects(self, dataset: pd.DataFrame):
        """
        (If train) will create a dictionary telling the processor how many
        categories are to be kept for a certain column.
        If not train: Will look up each annotation feature from the impute file
        within the features of the model (either in full name or the model
        feature starts with the feature from the impute file followed by an
        underscore).
        This dictionary is then passed to the actual processor.
        :param dataset: unprocessed pandas DataFrame
        :return: processed pandas DataFrame
        """
        if self.train:
            self.log.info(
                'Training protocol, '
                'creating new categorical conversion identifiers.'
            )
            # Ref, Alt and Domain are always converted; every feature keeps
            # its 5 most frequent values.
            annotation_feats_dict = dict.fromkeys(
                ['Ref', 'Alt', 'Domain'] + list(self.annotation_object), 5
            )
        else:
            annotation_feats_dict = {}
            for feature in self.annotation_object:
                annotation_feats_dict = self._process_objects_no_train(
                    feature=feature,
                    annotation_features_dict=annotation_feats_dict
                )
        return self._process_categorical_vars(
            dataset=dataset,
            annotation_feats_dict=annotation_feats_dict
        )

    def _process_objects_no_train(self, feature: str,
                                  annotation_features_dict: dict):
        """
        Function to collect, for one categorical feature, the levels the
        model knows, derived from the model features named
        '<feature>_<level>'.

        Matching is done on '<feature>_' so that a feature whose name is the
        beginning of another feature's name does not pick up that other
        feature, and the level is everything after that prefix so levels that
        contain the prefix again are kept whole.
        :param feature: name of the categorical column
        :param annotation_features_dict: dictionary of feature to levels,
            updated in place
        :return: the updated dictionary
        """
        prefix = ''.join([feature, '_'])
        for model_feature in self.model_features:
            if model_feature == feature:
                # A model feature equal to the bare feature name is recorded
                # under its own name.
                extension = feature
            elif model_feature.startswith(prefix):
                extension = model_feature[len(prefix):]
            else:
                continue
            annotation_features_dict.setdefault(feature, []).append(extension)
        return annotation_features_dict

    def _load_model_features(self):
        """
        Function to access the protected member of the XGBoost _Booster class
        to get the features that the model is trained on and store them in
        self.model_features.
        """
        self.log.info('Using features saved within the model.')
        self.model_features = self.model._Booster.feature_names

    def _process_categorical_vars(self,
                                  dataset: pd.DataFrame,
                                  annotation_feats_dict: dict):
        """
        Processor of categorical columns. Every value that is not one of the
        levels of its column is replaced by 'other', after which the columns
        are converted to dummy columns.
        :param dataset: unprocessed pandas DataFrame
        :param annotation_feats_dict:
            dictionary that contains per categorical feature either the
            number of levels to keep (train) or the levels themselves
        :return: processed pandas DataFrame
        """
        for annotation_feature, levels in annotation_feats_dict.items():
            if self.train:
                levels = self._get_top10_or_less_cats(
                    column=dataset[annotation_feature],
                    return_num=levels
                )
            else:
                self.log.debug('For feature: {} loaded {} levels: {}'.format(
                    annotation_feature,
                    len(levels),
                    levels
                ))
            dataset[annotation_feature] = np.where(
                dataset[annotation_feature].isin(levels),
                dataset[annotation_feature],
                'other'
            )
        dataset = pd.get_dummies(
            dataset,
            columns=list(annotation_feats_dict)
        )

        # Checking if all annotation features are processed.
        # If not, add a column containing all "false" (0)
        for annotation_feature in annotation_feats_dict:
            dataset = self._check_all_annotation_features_processed(
                current_annotation_feature=annotation_feature,
                dataset=dataset,
                annotation_features_dict=annotation_feats_dict
            )

        return dataset

    def _check_all_annotation_features_processed(self,
                                                 current_annotation_feature,
                                                 dataset: pd.DataFrame,
                                                 annotation_features_dict):
        """
        Function to add, when not training, a column filled with 0 for every
        level of the feature that did not get a dummy column.
        :param current_annotation_feature: name of the categorical feature
        :param dataset: pandas DataFrame after the dummy conversion
        :param annotation_features_dict: dictionary of feature to levels
        :return: pandas DataFrame
        """
        if self.train:
            return dataset
        for level in annotation_features_dict[current_annotation_feature]:
            expected_column = "_".join([current_annotation_feature, level])
            if expected_column not in dataset.columns:
                self.log.warning(
                    'Of annotation feature {},'
                    ' detected {} not present in columns.'.format(
                        current_annotation_feature, level))
                dataset[expected_column] = 0
        return dataset

    def _get_top10_or_less_cats(self, column: pd.Series, return_num: int):
        """
        Function for when a training file is preprocessed to get the top
        return_num quantity values within a categorical column.
        Some converting is done for the logger to be able to print them.
        :param column: pandas Series
        :param return_num: integer
        :return: numpy array of the most frequent values
        """
        top_values = column.value_counts().index[:return_num].values
        self.log.info('For feature: {} saved the following values: {}'.format(
            column.name,
            ', '.join(str(value) for value in top_values)
        ))
        return top_values

    # Model stuff

    def predict(self, data: pd.DataFrame):
        """
        Function to load the model and predict the CAPICE scores.
        Can be overwritten in case of legacy support.
        :param data: preprocessed pandas DataFrame; the column
            'probabilities' is added to it.
        :return: pandas DataFrame
        """
        self.log.info('Predicting for {} samples.'.format(data.shape[0]))
        self._load_model()
        self._load_model_features()
        data['probabilities'] = self._predict(
            self._create_input_matrix(dataset=data))
        self.log.info('Predicting successful.')
        return data

    def _predict(self, predict_data):
        """
        Template prediction function, which can be overwritten when a model
        needs a different prediction call.
        :param predict_data: preprocessed pandas DataFrame
        :return: numpy array, the second column of predict_proba
        """
        return self.model.predict_proba(predict_data)[:, 1]

    def _create_input_matrix(self, dataset: pd.DataFrame):
        """
        Also a template function, which can be overwritten to be compatible
        with first generation CAPICE.
        :param dataset: pandas DataFrame
        :return: the model feature columns of the dataset
        """
        return dataset[self.model_features]

    def _load_model(self):
        """
        Template method to load in the model once supported values are correct.
        Stores the unpickled model in self.model, or None when training.
        """
        model = None
        if not self.train:
            model_loc = self._get_model_loc()
            # NOTE: pickle executes arbitrary code on load; only model files
            # from a trusted location must be used here.
            with open(model_loc, 'rb') as model_file:
                model = pickle.load(model_file)
            self.log.info('Successfully loaded model at: {}'.format(
                model_loc))
        self.model = model

    @staticmethod
    @abstractmethod
    def _get_model_loc():
        """
        Template to mark the directory where the model is located.
        Use of os.path.join is required.
        You may use the get_project_root_dir() from utilities if the model is
        within this project directory.
        :return: path-like or None if no model has been created yet.
        """
        pass