class Annotator:
    def __init__(self, dataset: pd.DataFrame):
        self.log = Logger().logger
        self.manager = CapiceManager()
        self.fasta_lookup = FastaLookupAnnotator()
        self.manual_annotater = ManualAnnotator()
        self.dataset = dataset

    def annotate(self):
        """
        Start the annotation process.
        :return: pandas DataFrame
        """
        self.log.info('Starting manual annotation process.')
        self._add_sequence()
        self.dataset = self.manual_annotater.process(dataset=self.dataset)
        self.log.info('Annotation successful.')
        self.log.debug(
            'Final shape of the annotated data: {}'.format(self.dataset.shape)
        )
        return self.dataset

    def _add_sequence(self):
        self.log.debug('Annotation addition: sequence')
        self.dataset['Seq'] = self.dataset.apply(
            lambda x: self.fasta_lookup.get_reference_sequence(
                chromosome=x['Chr'],
                start=x['Pos'] - 75,
                end=x['Pos'] + 75
            ), axis=1
        )
        self.fasta_lookup.close_connection()
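# Usage sketch (hypothetical, not part of the module): annotating a tiny
# variant table. It assumes the CapiceManager singleton has already been
# given a reference genome location earlier in the pipeline, and that the
# columns follow the LoadFilePreProcessor output (Chr, Pos, Ref, Alt).
if __name__ == '__main__':
    variants = pd.DataFrame({
        'Chr': ['1', '2'],
        'Pos': [1558792, 47600124],
        'Ref': ['A', 'C'],
        'Alt': ['G', 'T']
    })
    annotated = Annotator(dataset=variants).annotate()
    # 'annotated' now holds the original columns plus 'Seq': the reference
    # sequence in a 75 bp window on either side of each position.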
class FastaLookupAnnotator:
    def __init__(self):
        self.log = Logger().logger
        self.manager = CapiceManager()
        self.fasta_loc = self.manager.reference_genome
        self.fasta = None
        self._load_fasta()

    def _load_fasta(self):
        self.log.info('Loading in Fasta file, this may take a moment.')
        self.fasta = pysam.FastaFile(self.fasta_loc)
        self.log.info('Successfully loaded Fasta file at: {}'.format(
            self.fasta_loc))

    def get_reference_sequence(self, chromosome: str, start: int, end: int):
        """
        Function to obtain a sequence from the reference Fasta file.
        :param chromosome: string, chromosome to get the reference sequence
            from.
        :param start: Chromosomal position at which the obtained sequence
            should start.
        :param end: Chromosomal position at which the obtained sequence
            should end.
        :return: string, obtained reference sequence, or None if the contig
            could not be found.
        """
        try:
            self.log.debug('Obtaining reference sequence for: '
                           '[Chromosome: {}], [start: {}], [stop: {}]'.format(
                               chromosome, start, end))
            append_ns = False
            if start < 0:
                # The requested window starts before the contig; clip the
                # start to 0 and remember how many 'N's to pad with.
                append_ns = abs(start)
                start = 0
            return_sequence = self.fasta.fetch(chromosome, start, end)
            if append_ns:
                return_sequence = '{}{}'.format('N' * append_ns,
                                                return_sequence)
            return return_sequence
        except KeyError:
            self.log.warning(
                'Unable to obtain sequence for: '
                '[Chromosome: {}], [start: {}], [stop: {}], '
                'did you supply a reference with contigs 1-22 + X, Y, MT?'
                .format(chromosome, start, end))
            return None

    def close_connection(self):
        """
        Function to tell pysam to close the connection to the Fasta file.
        """
        if self.fasta:
            self.fasta.close()
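# Usage sketch (hypothetical): fetching a window that starts before the
# beginning of a contig. Assumes CapiceManager().reference_genome points to an
# indexed FASTA whose contig '1' is at least 75 bases long. The negative start
# is clipped to 0 and the result is left-padded with 'N's, so the window keeps
# its full width.
if __name__ == '__main__':
    lookup = FastaLookupAnnotator()
    sequence = lookup.get_reference_sequence(chromosome='1', start=-25, end=75)
    if sequence is not None:
        print(len(sequence))  # 100: 25 padding 'N's plus 75 reference bases
    lookup.close_connection()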
class LoadFilePreProcessor:
    def __init__(self, dataset: pd.DataFrame):
        self.dataset = dataset
        self.log = Logger().logger

    def process(self):
        """
        Function to start the LoadFilePreProcessor: strips the leading % sign
        from each column that starts with it and renames certain columns,
        like #CHROM to Chr.

        Returns
        -------
        dataset : pandas.DataFrame
            Processed dataset with corrected % sign and renamed columns.
        """
        self.log.debug('Starting correcting % sign.')
        self._correct_percentage_sign()
        self.log.debug('% sign corrected, starting renaming of columns.')
        self._col_renamer()
        self.dataset['Chr'] = self.dataset['Chr'].astype(str)
        self.log.info('LoadFilePreProcessor successful.')
        return self.dataset

    def _correct_percentage_sign(self):
        new_columns = []
        for column in self.dataset.columns:
            if column.startswith('%'):
                new_columns.append(column.split('%')[1])
            elif column.startswith('#'):
                new_columns.append(column.split('#')[1])
            else:
                new_columns.append(column)
        self.dataset.columns = new_columns

    def _col_renamer(self):
        """
        Function to rename the VEP-style columns CHROM, POS, REF, ALT,
        SYMBOL_SOURCE, Feature, SYMBOL, INTRON and EXON to Chr, Pos, Ref,
        Alt, SourceID, FeatureID, GeneName, Intron and Exon.
        """
        self.dataset.rename(columns={
            'CHROM': 'Chr',
            'POS': 'Pos',
            'REF': 'Ref',
            'ALT': 'Alt',
            'SYMBOL_SOURCE': 'SourceID',
            'Feature': 'FeatureID',
            'SYMBOL': 'GeneName',
            'INTRON': 'Intron',
            'EXON': 'Exon'
        }, inplace=True)
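# Usage sketch (hypothetical column set): the '#' and '%' prefixes are
# stripped, the VEP-style names are mapped to the internal ones, and 'Chr' is
# cast to string afterwards.
if __name__ == '__main__':
    raw = pd.DataFrame({
        '#CHROM': [1, 2],
        '%POS': [100, 200],
        '%REF': ['A', 'C'],
        '%ALT': ['G', 'T'],
        '%SYMBOL': ['BRCA2', 'TP53']
    })
    processed = LoadFilePreProcessor(dataset=raw).process()
    print(list(processed.columns))  # ['Chr', 'Pos', 'Ref', 'Alt', 'GeneName']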
class CapiceImputing:
    """
    Class to dynamically load in all imputing files and identify the file
    suitable for the run's use case.
    """

    def __init__(self):
        self.manager = CapiceManager()
        self.vep_version = self.manager.vep_version
        self.grch_build = self.manager.grch_build
        self.log = Logger().logger
        self.log.info('Imputer started.')
        self.overrule = self.manager.overwrite_impute
        self.modules = []
        self.module = None
        self._load_modules()
        self._is_correct_datafile_present()
        self._check_if_imputer_is_applied()
        self.columns = []
        self.annotation_columns_present = []
        self.impute_values = {}
        self.pre_dtypes = {}
        self.dtypes = {}

    def _load_modules(self):
        """
        Method to dynamically load in all python files containing a class
        that has the properties name and _json_name. If the list of impute
        files is still empty at the end of this function, raises a
        FileNotFoundError.
        """
        self.log.info('Identifying imputing files.')
        directory = os.path.join(get_project_root_dir(), 'src', 'main',
                                 'python', 'resources', 'data_files',
                                 'imputing')
        usable_modules = load_modules(directory)
        if len(usable_modules) < 1:
            self._raise_no_module_found_error()
        loaded_modules = importer(usable_modules=usable_modules,
                                  path=directory)
        for module in loaded_modules:
            if "name" in dir(module) and "_json_name" in dir(module):
                self.modules.append(module)
        if len(self.modules) < 1:
            self._raise_no_module_found_error()
        self.log.info(
            'Identified {} files available for usage in imputing.'.format(
                len(self.modules)))

    def _raise_no_module_found_error(self):
        """
        Function to raise when no suitable impute files are found.
        Put into a function since 2 other functions within this module
        will use it.
        """
        error_message = 'No usable python files are found ' \
                        'within the imputing directory!'
        self.log.critical(error_message)
        raise FileNotFoundError(error_message)

    def _is_correct_datafile_present(self):
        """
        Function to check whether an impute file matches the VEP version and
        GRCh build (or the --overwrite_impute_file argument).
        """
        for module in self.modules:
            if self.overrule and module.name == self.overrule:
                self.log.info(
                    'Overrule successful for: {}, located at: {}'.format(
                        self.overrule, inspect.getfile(module.__class__)))
                self.module = module
                break
            else:
                module_vep_version = module.supported_vep_version
                module_grch_build = module.supported_grch_build
                if module_vep_version == self.vep_version and \
                        module_grch_build == self.grch_build:
                    self.log.info('Impute data file successfully found: {}, '
                                  'located at: {}'.format(
                                      module.name,
                                      inspect.getfile(module.__class__)))
                    self.module = module
                    break

    def _check_if_imputer_is_applied(self):
        # Checking if self.module is assigned
        if self.module is None:
            if self.overrule:
                error_message = 'No imputing data file found for overrule: ' \
                                '{}'.format(self.overrule)
            else:
                error_message = 'No imputing data file found for ' \
                                'VEP version: {} and ' \
                                'GRCh build: {}'.format(self.vep_version,
                                                        self.grch_build)
            self.log.critical(error_message)
            raise FileNotFoundError(error_message)

    def _load_values(self, dataset: pd.DataFrame):
        """
        Function to be called right when impute() is called: gets the input
        datafile features, loads the impute values from the impute file and
        saves the datafile features to the manager.
        """
        self.columns = self.module.annotation_features
        for col in self.columns:
            if col in dataset.columns:
                self.annotation_columns_present.append(col)
        self.manager.annotation_features = self.columns
        self.impute_values = self.module.impute_values

    def impute(self, datafile: pd.DataFrame):
        """
        Function to call the CapiceImputing to start imputing.
        :return: pandas DataFrame
        """
        self._load_values(datafile)
        datafile = self._check_chrom_pos(datafile)
        self._get_nan_ratio_per_column(dataset=datafile)
        self._get_full_nan_row(dataset=datafile)
        # Rows in which every annotation column is NaN were flagged by
        # _get_full_nan_row(); remove them here.
        datafile = datafile[~datafile['CAPICE_drop_out']]
        datafile.drop(columns=['CAPICE_drop_out'], inplace=True)
        self._correct_dtypes(datafile=datafile)
        datafile.fillna(self.impute_values, inplace=True)
        datafile = datafile.astype(dtype=self.pre_dtypes, copy=False)
        datafile = datafile.astype(dtype=self.dtypes, copy=False)
        datafile = self._add_missing_columns(datafile)
        self.log.info('Imputing successfully performed.')
        return datafile

    @deprecated
    def _add_missing_columns(self, datafile: pd.DataFrame):
        for key, value in self.impute_values.items():
            if key not in datafile.columns:
                datafile[key] = value
        return datafile

    def _correct_dtypes(self, datafile: pd.DataFrame):
        """
        Function to correct the dtypes that originate from the lookup
        annotator according to the dtypes specified within the data json.
        """
        for key, item in self.impute_values.items():
            if key in datafile.columns:
                self._save_dtypes(key=key, item=item)

    def _save_dtypes(self, key, item):
        # Integer impute values are first cast to float so that columns still
        # holding NaN can be filled before the final cast to int.
        if isinstance(item, int):
            self.pre_dtypes[key] = float
        else:
            self.pre_dtypes[key] = type(item)
        self.dtypes[key] = type(item)

    def _check_chrom_pos(self, dataset: pd.DataFrame):
        """
        Function to check if all values of the columns Chr and Pos are
        present.
        :param dataset: not imputed pandas DataFrame
        :return: pandas DataFrame containing no NaN or gaps for the Chr and
            Pos columns.
        """
        chrom_is_float = False
        if dataset['Chr'].isnull().values.any():
            if dataset.dtypes['Chr'] == np.float64:
                chrom_is_float = True
            n_delete = dataset['Chr'].isnull().values.sum()
            self.log.warning('Detected NaN in the Chromosome column! '
                             'Deleting {} row(s).'.format(n_delete))
            dataset = dataset[~dataset['Chr'].isnull()]
        if dataset['Pos'].isnull().values.any():
            n_delete = dataset['Pos'].isnull().values.sum()
            self.log.warning('Detected NaN in the Position column! '
                             'Deleting {} row(s).'.format(n_delete))
            dataset = dataset[~dataset['Pos'].isnull()]
        dataset.index = range(0, dataset.shape[0])
        if chrom_is_float:
            dataset['Chr'] = dataset['Chr'].astype(int)
            dataset['Chr'] = dataset['Chr'].astype(str)
        dataset['Pos'] = dataset['Pos'].astype(int)
        return dataset

    def _get_nan_ratio_per_column(self, dataset: pd.DataFrame):
        """
        Generic function to get the percentage of gaps per column.
        :param dataset: not imputed pandas DataFrame
        """
        for column in dataset.columns:
            series = dataset[column]
            self._calculate_percentage_nan(column=series)

    def _calculate_percentage_nan(self, column):
        n_nan = column.isnull().sum()
        if n_nan > 0:
            n_samples = column.size
            p_nan = round((n_nan / n_samples) * 100, ndigits=2)
            self.log.debug(
                'NaN detected in column {}, percentage: {}%.'.format(
                    column.name, p_nan))

    def _get_full_nan_row(self, dataset: pd.DataFrame):
        """
        Function to get the samples for which no prediction is possible at
        all, because all columns other than Chr, Pos, Ref and Alt are gaps.
        :param dataset: not imputed pandas DataFrame
        """
        n_samples = dataset.shape[0]
        dataset.index = range(1, n_samples + 1)
        dataset['CAPICE_drop_out'] = dataset[
            self.annotation_columns_present].isnull().values.all(axis=1)
        samples_dropped_out = dataset[dataset['CAPICE_drop_out']]
        if samples_dropped_out.shape[0] > 0:
            self.log.warning(
                'The following samples are filtered out due to missing '
                'values (the printed index starts at 1):'
                '\n {}'.format(samples_dropped_out[[
                    'Chr', 'Pos', 'Ref', 'Alt', 'FeatureID'
                ]]))
        else:
            self.log.info(
                'No samples are filtered out due to too many NaN values.')
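# Standalone sketch of the fill-then-cast step performed by impute(), using
# only pandas/numpy and hypothetical feature names and impute values. A column
# whose impute value is an integer is first kept as float (pre_dtypes),
# filled, and only then cast to int (dtypes), because a pandas column that
# still holds NaN cannot have an integer dtype.
if __name__ == '__main__':
    import numpy as np
    import pandas as pd

    frame = pd.DataFrame({
        'phyloP': [1.5, np.nan],
        'motifECount': [np.nan, 3.0]
    })
    impute_values = {'phyloP': 0.0, 'motifECount': 0}
    pre_dtypes = {'phyloP': float, 'motifECount': float}
    dtypes = {'phyloP': float, 'motifECount': int}

    frame.fillna(impute_values, inplace=True)
    frame = frame.astype(pre_dtypes).astype(dtypes)
    print(frame.dtypes)  # phyloP: float64, motifECount: int64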
class TemplateSetup(metaclass=ABCMeta):
    """
    Abstract class to act as template for new models that might be added
    in future patches of CAPICE.
    Contains the necessary steps for preprocessing as well.
    """

    def __init__(self, name, usable, vep_version, grch_build):
        self.log = Logger().logger
        self.property_checker = PropertyCheckerLogger()
        self.name = name
        self.usable = usable
        self.supported_vep_version = vep_version
        self.supported_grch_build = grch_build
        self.annotation_features = CapiceManager().annotation_features
        self.train = False
        self.model = None
        self.annotation_object = []
        self.model_features = None

    @property
    def name(self):
        """
        Property getter name, to get the init defined name of the model
        module.
        :return: str
        """
        return self._name

    @name.setter
    def name(self, value='Template'):
        """
        Property setter name, to set a name for a model module.
        Raises TypeError if not supplied with a string.
        :param value: str
        """
        self.property_checker.check_property(value=value, expected_type=str)
        self._name = value

    @property
    def usable(self):
        """
        Property getter usable, to get the boolean value of a model module
        telling whether it can be used for preprocessing and prediction.
        :return: bool
        """
        return self._usable

    @usable.setter
    def usable(self, value=False):
        """
        Property setter usable, to set the boolean value of a model module
        telling whether it should be used for preprocessing and prediction.
        Raises TypeError if not supplied with a boolean.
        :param value: bool
        """
        self.property_checker.check_property(value=value, expected_type=bool)
        self._usable = value

    @property
    def supported_vep_version(self):
        """
        Property getter supported_vep_version, to get the float VEP version
        of a model/prediction file that is supported within the module.
        :return: float or None
        """
        return self._vep_version

    @supported_vep_version.setter
    def supported_vep_version(self, value):
        """
        Property setter supported_vep_version, to set the float VEP version
        of a model/prediction file that is supported within the module.
        Raises TypeError if not supplied with a float or None.
        :param value: float or None
        """
        self.property_checker.check_property(
            value=value,
            expected_type=float,
            include_none=True
        )
        self._vep_version = value

    @property
    def supported_grch_build(self):
        """
        Property getter supported_grch_build, to get the integer grch_build
        value that defines what genome build is supported by the
        model/prediction module.
        :return: integer or None
        """
        return self._grch_build

    @supported_grch_build.setter
    def supported_grch_build(self, value):
        """
        Property setter supported_grch_build, to set the integer grch_build
        value that defines what genome build is supported by the
        model/prediction module.
        Raises TypeError if not supplied with an integer or None.
        :param value: integer or None
        """
        self.property_checker.check_property(
            value=value,
            expected_type=int,
            include_none=True
        )
        self._grch_build = value

    def preprocess(self, dataset: pd.DataFrame, is_train: bool):
        """
        Callable function to start the preprocessing of a dataset.
        :param dataset: imputed pandas DataFrame
        :param is_train: boolean
        :return: processed pandas DataFrame
        """
        self.train = is_train
        self._load_model()
        if not self.train:
            self._load_model_features()
        dataset = self._duplicate_chr_pos_ref_alt(dataset=dataset)
        self._get_categorical_columns(dataset=dataset)
        processed_dataset = self._process_objects(dataset=dataset)
        if not self.train:
            processed_dataset = self._check_all_model_features_present(
                processed_dataset
            )
        self.log.info('Successfully preprocessed data.')
        return processed_dataset

    @deprecated
    def _check_all_model_features_present(self, dataset: pd.DataFrame):
        for feature in self.model_features:
            if feature not in dataset.columns:
                dataset[feature] = 0
        return dataset

    def _get_categorical_columns(self, dataset: pd.DataFrame):
        """
        Function to get the categorical columns that are within the supplied
        annotation features of the imputing file.
        :param dataset: pandas DataFrame
        """
        for feature in dataset.select_dtypes(include=["O"]).columns:
            if feature in self.annotation_features:
                self.annotation_object.append(feature)
        self.log.debug(
            'Converting the categorical columns: {}.'.format(
                ", ".join(self.annotation_object)
            )
        )

    @staticmethod
    def _duplicate_chr_pos_ref_alt(dataset):
        """
        Function to create the chr_pos_ref_alt column so that it doesn't get
        lost in preprocessing.
        :param dataset: unprocessed pandas DataFrame
        :return: unprocessed pandas DataFrame containing column
            'chr_pos_ref_alt'
        """
        dataset['chr_pos_ref_alt'] = dataset[
            ['Chr', 'Pos', 'Ref', 'Alt']].astype(str).agg('_'.join, axis=1)
        return dataset

    @property
    def model_features(self):
        return self._model_features

    @model_features.setter
    def model_features(self, value):
        self._model_features = value

    def _process_objects(self, dataset: pd.DataFrame):
        """
        (If train) will create a dictionary telling the processor how many
        categories are within a certain column.
        If not train: will look up each annotation feature from the impute
        file within the columns of the datafile (either in full name or the
        column starts with the feature from the impute file).
        This dictionary is then passed to the actual processor.
        :param dataset: unprocessed pandas DataFrame
        :return: processed pandas DataFrame
        """
        annotation_feats_dict = {}
        if self.train:
            hardcoded_features = ['Ref', 'Alt', 'Domain']
            for feature in hardcoded_features:
                annotation_feats_dict[feature] = 5
            self.log.info(
                'Training protocol, '
                'creating new categorical conversion identifiers.'
            )
            for feat in self.annotation_object:
                if feat not in annotation_feats_dict.keys():
                    annotation_feats_dict[feat] = 5
        else:
            for feature in self.annotation_object:
                annotation_feats_dict = self._process_objects_no_train(
                    feature=feature,
                    annotation_features_dict=annotation_feats_dict
                )
        processed_data = self._process_categorical_vars(
            dataset=dataset,
            annotation_feats_dict=annotation_feats_dict
        )
        return processed_data

    def _process_objects_no_train(self, feature: str,
                                  annotation_features_dict: dict):
        for model_feature in self.model_features:
            if model_feature.startswith(feature):
                extension = model_feature.split(''.join([feature, '_']))[-1]
                if feature in annotation_features_dict.keys():
                    annotation_features_dict[feature].append(extension)
                else:
                    annotation_features_dict[feature] = [extension]
        return annotation_features_dict

    def _load_model_features(self):
        """
        Function to access the protected member of the XGBoost _Booster class
        to get the features that the model is trained on.
        :return: list
        """
        self.log.info('Using features saved within the model.')
        self.model_features = self.model._Booster.feature_names

    def _process_categorical_vars(self,
                                  dataset: pd.DataFrame,
                                  annotation_feats_dict: dict):
        """
        Processor of categorical columns. Will create new columns based on
        the quantity of a value within a column.
        :param dataset: unprocessed pandas DataFrame
        :param annotation_feats_dict: dictionary containing the levels for
            each categorical feature
        :return: processed pandas DataFrame
        """
        if self.train:
            for annotation_feature in annotation_feats_dict.keys():
                feature_names = self._get_top10_or_less_cats(
                    column=dataset[annotation_feature],
                    return_num=annotation_feats_dict[annotation_feature]
                )
                dataset[annotation_feature] = np.where(
                    dataset[annotation_feature].isin(feature_names),
                    dataset[annotation_feature],
                    'other')
        else:
            for annotation_feature in annotation_feats_dict.keys():
                feature_names = annotation_feats_dict[annotation_feature]
                self.log.debug('For feature: {} loaded {} levels: {}'.format(
                    annotation_feature,
                    len(feature_names),
                    feature_names
                ))
                dataset[annotation_feature] = np.where(
                    dataset[annotation_feature].isin(feature_names),
                    dataset[annotation_feature],
                    'other'
                )
        dataset = pd.get_dummies(
            dataset,
            columns=list(annotation_feats_dict.keys())
        )
        # Checking if all annotation features are processed.
        # If not, add a column containing all "false" (0).
        for annotation_feature in annotation_feats_dict.keys():
            dataset = self._check_all_annotation_features_processed(
                current_annotation_feature=annotation_feature,
                dataset=dataset,
                annotation_features_dict=annotation_feats_dict
            )
        return dataset

    def _check_all_annotation_features_processed(self,
                                                 current_annotation_feature,
                                                 dataset: pd.DataFrame,
                                                 annotation_features_dict):
        if not self.train:
            afd = annotation_features_dict
            for processed_feature in afd[current_annotation_feature]:
                col_be_present = "_".join(
                    [current_annotation_feature, processed_feature])
                if col_be_present not in dataset.columns:
                    self.log.warning(
                        'Of annotation feature {}, '
                        'detected {} not present in columns.'.format(
                            current_annotation_feature, processed_feature))
                    dataset[col_be_present] = 0
        return dataset

    def _get_top10_or_less_cats(self, column: pd.Series, return_num: int):
        """
        Function for when a training file is preprocessed to get the top
        return_num quantity values within a categorical column.
        Some converting is done for the logger to be able to print them.
        :param column: pandas Series
        :param return_num: integer
        :return: pandas Series
        """
        value_counts = column.value_counts().index[:return_num].values
        printable_value_counts = []
        for value in value_counts:
            if not isinstance(value, str):
                value = str(value)
            printable_value_counts.append(value)
        self.log.info('For feature: {} saved the following values: {}'.format(
            column.name, ', '.join(printable_value_counts)
        ))
        return value_counts

    # Model stuff
    def predict(self, data: pd.DataFrame):
        """
        Function to load the model and predict the CAPICE scores.
        Can be overwritten in case of legacy support.
        :return: pandas DataFrame
        """
        self.log.info('Predicting for {} samples.'.format(data.shape[0]))
        self._load_model()
        self._load_model_features()
        data['probabilities'] = self._predict(
            self._create_input_matrix(dataset=data))
        self.log.info('Predicting successful.')
        return data

    def _predict(self, predict_data):
        """
        Prediction function that can be overwritten further down, since it
        differs between XGBoost 0.72.1 and the current XGBoost version.
        :param predict_data: preprocessed pandas DataFrame
        :return: numpy array
        """
        return self.model.predict_proba(predict_data)[:, 1]

    def _create_input_matrix(self, dataset: pd.DataFrame):
        """
        Also a template function, which can be overwritten to be compatible
        with first generation CAPICE.
        :param dataset: pandas DataFrame
        :return: XGBoost workable data
        """
        return dataset[self.model_features]

    def _load_model(self):
        """
        Template method to load in the model once the supported values are
        correct.
        :return: pickled model instance
        """
        model = None
        if not self.train:
            with open(self._get_model_loc(), 'rb') as model_file:
                model = pickle.load(model_file)
            self.log.info('Successfully loaded model at: {}'.format(
                self._get_model_loc()))
        self.model = model

    @staticmethod
    @abstractmethod
    def _get_model_loc():
        """
        Template to mark the directory where the model is located.
        Use of os.path.join is required.
        You may use the get_project_root_dir() from utilities if the model
        is within this project directory.
        :return: path-like or None if no model has been created yet.
        """
        pass
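# Sketch of how a concrete model module might subclass TemplateSetup. The
# class name, supported versions and model location below are hypothetical;
# get_project_root_dir() is the project utility referenced in the
# _get_model_loc() docstring above.
class ExampleModelSetup(TemplateSetup):
    def __init__(self):
        super().__init__(
            name='example_model',
            usable=True,
            vep_version=104.0,
            grch_build=37
        )

    @staticmethod
    def _get_model_loc():
        # os.path.join is required by the template; the path itself is only
        # an illustration.
        return os.path.join(get_project_root_dir(), 'src', 'main', 'python',
                            'resources', 'models', 'example_model.pickle.dat')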