def get_reduced_dataframe_from_saved_classifier() -> pd.DataFrame:
    """
    Return the reduced dataframe, given that a classifier is stored as classifier.sav.

    Loads the saved classifier, wraps it in a feature selector, aligns the
    control and case VCF files from the download cache, and keeps only the
    variants the selector retains.
    """
    clf = FeatureExtractor.get_saved_classifier()
    selector = SelectFromModel(clf, prefit=True)

    cache = (Path(__file__).parent.parent.parent.parent / "input/download_cache").resolve()
    assert cache.is_dir()

    from idiva.fextr import align

    # Align control and case VCFs while both files are still open.
    with open(cache / "control_v2.vcf") as ctrl_vcf:
        ctrl_reader = ReadVCF(ctrl_vcf)
        with open(cache / "case_processed_v2.vcf") as case_vcf:
            case_reader = ReadVCF(case_vcf)
            dataframe = align(ctrl=ctrl_reader, case=case_reader)

    # Keep the aligned index before reshaping (renamed from `id`,
    # which shadowed the builtin).
    index = dataframe.index

    # Prefer the case ID, falling back to the control ID.
    dataframe['ID'] = dataframe.ID_case.combine_first(dataframe.ID_ctrl)
    dataframe = dataframe[['CHROM', 'POS', 'ID', 'REF', 'ALT']]

    # Restrict to the variants selected by the classifier-based selector.
    extracted = index[selector.get_support()].values
    return dataframe.loc[extracted]
def feature_extraction_chunks(self, ctrl_vcf_file: str, case_vcf_file: str):
    """
    Return a fitted Perceptron classifier for the given VCF files.

    The classifier is trained in chunks, where each chunk covers a range of
    patients — i.e. training iterates column-wise over the VCF files. Both
    files are divided into the same number of batches, so the per-file
    batch sizes may differ.

    Args:
        ctrl_vcf_file: control VCF filename (relative to the download cache).
        case_vcf_file: case VCF filename (relative to the download cache).

    Returns:
        (clf, index): the fitted classifier and the aligned variant index.
    """
    log.info("Fit linear classifier and reduce number of variants")
    clf = Perceptron()

    cache = (Path(__file__).parent.parent.parent.parent / "input/download_cache").resolve()
    assert cache.is_dir()

    # First pass: build the unique variant index by aligning the two files.
    # (Renamed from `id`, which shadowed the builtin.)
    with open(cache / ctrl_vcf_file) as ctrl_vcf:
        ctrl_reader = ReadVCF(ctrl_vcf)
        with open(cache / case_vcf_file) as case_vcf:
            case_reader = ReadVCF(case_vcf)
            dataframe = align(ctrl=ctrl_reader, case=case_reader)
            index = dataframe.index

    # Second pass: re-open the files for the batched training sweep.
    with open(cache / ctrl_vcf_file) as ctrl_vcf:
        with open(cache / case_vcf_file) as case_vcf:
            reader_ctrl = ReadVCF(ctrl_vcf)
            reader_case = ReadVCF(case_vcf)
            header_ctrl = reader_ctrl.header
            header_case = reader_case.header

            # Drop non-sample header columns at these positions
            # (presumably ID/REF/QUAL/FILTER/INFO/FORMAT — confirm against ReadVCF.header).
            exclude = [2, 3, 5, 6, 7, 8]
            names_ctrl = [name for idx, name in enumerate(header_ctrl) if idx not in exclude]
            names_case = [name for idx, name in enumerate(header_case) if idx not in exclude]

            # Number of patient columns: the first 9 columns are the fixed VCF fields.
            len_ctrl = len(header_ctrl) - 9
            len_case = len(header_case) - 9

            # Same number of batches for both files; batch size capped at 50 patients.
            min_batch_size = min([len_ctrl, len_case, 50])
            number_of_batches = int(max([np.ceil(len_ctrl / min_batch_size),
                                         np.ceil(len_case / min_batch_size)]))
            batch_size_ctrl = int(np.ceil(len_ctrl / number_of_batches))
            batch_size_case = int(np.ceil(len_case / number_of_batches))

            # Batch start offsets per file, with the final boundary appended.
            batches_ctrl = [i * batch_size_ctrl for i in range(number_of_batches)]
            batches_case = [i * batch_size_case for i in range(number_of_batches)]
            batches_ctrl.append(len_ctrl)
            batches_case.append(len_case)

            # Incrementally fit the classifier one patient-batch at a time.
            for idx in tqdm(range(number_of_batches), total=number_of_batches,
                            postfix='feature selection'):
                clf = self.feature_extraction_batch(reader_ctrl, reader_case,
                                                    names_ctrl, names_case,
                                                    batches_ctrl, batches_case,
                                                    idx, clf, index)

    return clf, index
def get_reduced_dataframe(self) -> pd.DataFrame:
    """
    Return the reduced dataframe.

    Aligns the control and case VCF files from the download cache and keeps
    only the variants previously extracted by the feature-selection step.
    """
    cache = (Path(__file__).parent.parent.parent.parent / "input/download_cache").resolve()
    assert cache.is_dir()

    from idiva.fextr import align

    # Align control and case VCFs while both files are still open.
    with open(cache / "control_v2.vcf") as ctrl_vcf:
        ctrl_reader = ReadVCF(ctrl_vcf)
        with open(cache / "case_processed_v2.vcf") as case_vcf:
            case_reader = ReadVCF(case_vcf)
            dataframe = align(ctrl=ctrl_reader, case=case_reader)

    # Prefer the case ID, falling back to the control ID.
    dataframe['ID'] = dataframe.ID_case.combine_first(dataframe.ID_ctrl)
    dataframe = dataframe[['CHROM', 'POS', 'ID', 'REF', 'ALT']]

    # Restrict to the previously extracted variants.
    extracted = self.get_extracted_variants().values
    return dataframe.loc[extracted]
def get_reduced_dataframe(self, *, case_vcf, ctrl_vcf) -> pd.DataFrame:
    """
    Return the reduced dataframe for already-open VCF readers.

    Aligns the given control and case readers and keeps only the variants
    previously extracted by the feature-selection step.

    Args:
        case_vcf: case VCF reader, exposing a `rewind_when_done` context manager.
        ctrl_vcf: control VCF reader, exposing a `rewind_when_done` context manager.
    """
    cache = (Path(__file__).parent.parent.parent.parent / "input/download_cache").resolve()
    assert cache.is_dir()

    from idiva.fextr import align

    # `rewind_when_done` presumably rewinds each reader on exit so callers
    # can reuse them — confirm against the reader implementation.
    # (Dropped the unused local `id`, which also shadowed the builtin.)
    with ctrl_vcf.rewind_when_done:
        with case_vcf.rewind_when_done:
            dataframe = align(ctrl=ctrl_vcf, case=case_vcf)

    # Prefer the case ID, falling back to the control ID.
    dataframe['ID'] = dataframe.ID_case.combine_first(dataframe.ID_ctrl)
    dataframe = dataframe[['CHROM', 'POS', 'ID', 'REF', 'ALT']]

    # Restrict to the previously extracted variants.
    extracted = self.get_extracted_variants().values
    return dataframe.loc[extracted]