def predict(self, vcf_test1, vcf_test2=None) -> pd.DataFrame:
    """
    Returns predictions for the given vcf file
    """
    if vcf_test2 is not None:
        # fuse two files into one dataframe
        x_test = self.dataHandler.create_test_set_v2(case_vcf=vcf_test1, ctrl_vcf=vcf_test2)
    else:
        # create test features
        x_test = self.dataHandler.create_test_set(vcf_test1)

    # make predictions
    x_test = x_test[['CHROM', 'POS', 'CADD_SUCC', 'CADD_PHRED', 'SIFT_SCORE', 'SIFT_SUCC', 'VAR']]

    log.info('Predicting class labels.')
    y_pred = self.model.predict(x_test.reindex(sorted(x_test.columns), axis=1))

    # create dataframe
    df = pd.DataFrame(y_pred)
    return df
def train_phenomenet(
        self, args: TrainPhenomenetArgs
) -> typing.Tuple[tf.keras.callbacks.History, tf.keras.Sequential]:
    """
    Trains the phenomenet with the given parameters.

    HK, 2020-12-07
    """
    log.info(f'Training phenomenet on {args.database} database')

    train_data, train_labels, eval_data, eval_labels, weights = self.get_phenomenet_data(args)

    phenomenet = Phenomenet(train_data.shape[1])
    phenomenet = phenomenet.get_phenomenet()

    log.info(f'Training phenomenet for up to {args.epochs} epochs.')

    cb = tf.keras.callbacks.EarlyStopping(
        monitor='val_precision',
        min_delta=0,
        patience=args.early_stopping_patience,
        verbose=1,
        mode='max',
        baseline=None,
        restore_best_weights=True)

    history = phenomenet.fit(train_data, train_labels,
                             validation_data=(eval_data, eval_labels),
                             batch_size=args.batch_size,
                             verbose=2,
                             epochs=args.epochs,
                             class_weight=weights,
                             callbacks=[cb])

    return history, phenomenet
def main():
    try:
        process(**parse_args())
    except KeyboardInterrupt:
        import logging
        log.info("Aborted. Bye-bye.")
        logging.shutdown()
def feature_extraction_chunks(self, ctrl_vcf_file: str, case_vcf_file: str):
    """
    Returns a fitted Perceptron classifier for the given vcf files.

    The classifier is trained in chunks, where each chunk consists of a range of patients;
    the classifier therefore iterates column-wise over the vcf files.
    Both files are divided into the same number of chunks, so the individual chunk size can differ.
    """
    log.info("Fit linear classifier and reduce number of variants")

    clf = Perceptron()

    cache = (Path(__file__).parent.parent.parent.parent / "input/download_cache").resolve()
    assert cache.is_dir()

    # create unique index
    id = None
    with open(str(cache) + "/" + ctrl_vcf_file) as ctrl_vcf:
        ctrl_reader = ReadVCF(ctrl_vcf)
        with open(str(cache) + "/" + case_vcf_file) as case_vcf:
            case_reader = ReadVCF(case_vcf)
            dataframe = align(ctrl=ctrl_reader, case=case_reader)
            id = dataframe.index

    with open(str(cache) + "/" + ctrl_vcf_file) as ctrl_vcf:
        with open(str(cache) + "/" + case_vcf_file) as case_vcf:
            reader_ctrl = ReadVCF(ctrl_vcf)
            reader_case = ReadVCF(case_vcf)

            header_ctrl = reader_ctrl.header
            header_case = reader_case.header

            exclude = [2, 3, 5, 6, 7, 8]
            names_ctrl = [i for idx, i in enumerate(header_ctrl) if idx not in exclude]
            names_case = [i for idx, i in enumerate(header_case) if idx not in exclude]

            # number of samples (the first 9 header fields are VCF metadata columns)
            len_ctrl = len(header_ctrl) - 9
            len_case = len(header_case) - 9

            min_batch_size = min([len_ctrl, len_case, 50])
            number_of_batches = int(max([np.ceil(len_ctrl / min_batch_size),
                                         np.ceil(len_case / min_batch_size)]))

            batch_size_ctrl = int(np.ceil(len_ctrl / number_of_batches))
            batch_size_case = int(np.ceil(len_case / number_of_batches))

            batches_ctrl = [i * batch_size_ctrl for i in range(number_of_batches)]
            batches_case = [i * batch_size_case for i in range(number_of_batches)]

            batches_ctrl.append(len_ctrl)
            batches_case.append(len_case)

            for idx in tqdm(range(number_of_batches), total=number_of_batches,
                            postfix='feature selection'):
                clf = self.feature_extraction_batch(reader_ctrl, reader_case,
                                                    names_ctrl, names_case,
                                                    batches_ctrl, batches_case,
                                                    idx, clf, id)

    return clf, id
def maker_clinvar() -> pd.DataFrame:
    """
    creates the clinvar dataframe
    """
    from idiva.db import clinvar_open
    from idiva.io import ReadVCF
    from idiva.db.clinvar import clinvar_to_df

    log.info('Making clinvar df.')

    with clinvar_open(which=clinvar_file) as fd:
        return clinvar_to_df(ReadVCF(fd))
def cache_df(
        name: str,
        key: typing.Union[str, typing.List[str]],
        df_maker: typing.Callable[[], pandas.DataFrame],
) -> pandas.DataFrame:
    """
    Usage:
        df = cache_df(name="my_dataframe", key="v0.1", df_maker=fn_that_makes_df)

    `name` should be a human-readable mnemonic that will appear as part of the file name.
    `key` is a string or list-of-strings that will be hashed and appended to file name.
    `df_maker` is a function that takes no arguments and returns a pandas DataFrame.

    RA, 2020-11-22
    """

    assert name
    assert key is not None, "Provide an empty list or an empty string as `key`."
    assert df_maker is not None

    if isinstance(key, list):
        key = "+".join(key)

    assert isinstance(key, str)

    key = base64.urlsafe_b64encode(hashlib.sha256(key.encode()).digest()).decode()

    name = name + (("__" + key[0:12]) if key else "")
    file = (BASE / name).with_suffix(".gz")
    assert not file.is_dir()

    if True:
        # Fix legacy filename
        old_name = name + (("_" + key[0:12]) if key else "")
        old_name = (BASE / old_name).with_suffix(".gz")
        if old_name.is_file():
            import os
            os.rename(old_name, file)

    if file.is_file():
        log.debug(F"Loading DataFrame from {file.name}.")
        df = pandas.read_csv(file, sep=SEP, compression="infer")
        df.set_index(df.columns[0], inplace=True)  # https://stackoverflow.com/a/61390345
    else:
        log.info(F"Creating a potentially large DataFrame for the first time ({file.name}).")
        df = df_maker()
        log.debug(F"Saving DataFrame to {file.name}.")
        df.to_csv(file, sep=SEP, compression="gzip", index=True)

    return df
def checksum_md5(fd: io.TextIOBase) -> str:
    log.info("Computing md5 hash of stream.")

    import hashlib

    with seek_then_rewind(fd):
        md5 = hashlib.md5()
        try:
            for chunk in iter(lambda: fd.read(128 * md5.block_size).encode(), b''):
                md5.update(chunk)
        except KeyboardInterrupt:
            pass
        md5 = md5.hexdigest()

    log.debug(F"Computed md5 = {md5}.")

    return md5
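
# Usage sketch for checksum_md5 (illustrative; the file path is a placeholder).
# The digest is used elsewhere as a cache key, e.g. `key=[case.md5, ctrl.md5]` in vcf_to_fisher.
def _example_checksum():
    with open("input/case_processed.vcf", mode="r") as fd:
        digest = checksum_md5(fd)
        log.info(F"md5 = {digest}")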
def get_trained_phenomenet(which: str = 'exp_2020_12_07_14_53_54_611564'):
    """
    Downloads trained phenomenet from polybox if not found in download folder.

    which: the uid of the trained phenomenet.
    """
    base = Path(__file__).parent.parent / 'download'

    URLS = {
        'exp_2020_12_08_11_05_12_615223': 'https://polybox.ethz.ch/index.php/s/YxWDBaxle44f1da/download',
        'exp_2020_12_07_14_53_54_611564': 'https://polybox.ethz.ch/index.php/s/bSwajA85feMAFiq/download',
    }

    exp_str = which
    model_path = base / exp_str

    if not model_path.exists():
        command = f'wget -O {base / exp_str}.tar.gz {URLS[which]}'
        log.info(f'Downloading phenomenet with command {command}')
        os.system(command)

        command = f'tar -zxvf {base / exp_str}.tar.gz -C {base}'
        log.info(f'Extracting model with command {command}')
        os.system(command)

        log.info('removing compressed folder')
        os.system(f'rm {base / exp_str}.tar.gz')

    log.info('Loading trained phenomenet.')
    return tf.keras.models.load_model(model_path)
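
# Usage sketch (illustrative): load one of the published checkpoints and inspect it.
# The phenomenet classifier later in this section loads the same uid before predicting.
def _example_load_phenomenet():
    model = get_trained_phenomenet(which='exp_2020_12_08_11_05_12_615223')
    model.summary()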
def get_train_test(data, pipeline: Pipeline = None):
    """
    HK, 2020-11-12
    """
    log.info('Splitting and preprocessing data.')

    df_train, df_eval = train_test_split(data, test_size=0.2, shuffle=True)

    train_data = df_train.loc[:, df_train.columns != 'label'].to_numpy()
    train_labels = df_train['label'].to_numpy()
    eval_data = df_eval.loc[:, df_eval.columns != 'label'].to_numpy()
    eval_labels = df_eval['label'].to_numpy()

    # apply the preprocessing pipeline if one is given
    if pipeline is not None:
        pipeline.fit(train_data, train_labels)
        train_data = pipeline.transform(train_data)
        eval_data = pipeline.transform(eval_data)

    return train_data, train_labels, eval_data, eval_labels
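
# Usage sketch (illustrative): a toy frame with a 'label' column and a simple sklearn
# preprocessing pipeline. The StandardScaler is an assumption, not necessarily what the
# project pipeline contains.
def _example_get_train_test():
    import pandas as pd
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    data = pd.DataFrame({'CADD_PHRED': [1.0, 2.0, 3.0, 4.0],
                         'SIFT_SCORE': [0.1, 0.2, 0.3, 0.4],
                         'label': [0, 1, 0, 1]})

    pipeline = Pipeline([('scaler', StandardScaler())])
    return get_train_test(data, pipeline=pipeline)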
def post(vcf_file: Path):
    log.info("=> Entering the postprocessing stage.")

    from idiva.stat.vcf_to_fisher import figure_pvalues
    from idiva.io.vcf import SEP

    with ReadVCF.open(vcf_file) as vcf:
        for px in figure_pvalues(vcf):
            file = vcf_file.parent / px.info['name proposal']

            log.info(F"Saving figure and data to {file}.* .")

            px.f.savefig(file.with_suffix(".png"))

            df: pandas.DataFrame = px.info['df']
            df.to_csv(file.with_suffix(".csv"), sep=SEP)
def allgwas_reference() -> pandas.DataFrame:
    from idiva.download import download

    log.info(F"Opening (cached) {URL}")

    with download(URL).now.open() as fd:
        log.info(F"Processing.")

        s: pandas.Series

        df = pandas.read_csv(fd, sep='\t',
                             names=["gene", "rs", "p", "chrom", "pos", "disease"])

        df = pandas.DataFrame(columns=["ID", "SC2D"],
                              data=[(rs, disease)
                                    for (rs, disease) in zip(df.rs, df.disease)
                                    for rs in re.findall(r"rs[0-9]+", rs)])

        df = df.groupby('ID', as_index=False).agg(
            {'SC2D': lambda s: '"' + ", ".join(sorted(set(s))) + '"'})

        return df
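
# Usage sketch (illustrative): the data() generator further below builds an
# rsID -> disease lookup from this frame in exactly this way.
def _example_allgwas_lookup():
    sc2d = dict(allgwas_reference()[["ID", "SC2D"]].set_index("ID").SC2D)
    return sc2d.get("rs123", "N/A")  # "rs123" is a placeholder rsID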
def train_phenomenet(args: TrainPhenomenetArgs):
    cv = Classifier()
    exp = Experiment()

    checkpoint_dir = Path(__file__).parent.parent / 'clf/checkpoints'
    checkpoint_dir.mkdir(exist_ok=True, parents=True)
    checkpoint_path = checkpoint_dir / exp.experiment_uid

    # train phenomenet
    history, model = cv.train_phenomenet(args=args)

    end_epoch = len(history.history['loss'])

    # record the final value of every training metric, plus the training arguments
    values = {k: v[-1] for k, v in history.history.items()}
    values['model'] = 'phenomenet'
    values['epochs'] = end_epoch
    values = {**values, **args.__dict__}

    exp.save_experiment(values)

    log.info(f'saving model to {checkpoint_path} after {end_epoch} epochs')
    model.save(checkpoint_path)
def get_dbSNP_df(which_dbSNP: int = 17) -> pd.DataFrame:
    """
    which_dbSNP: integer indicating which chromosome is looked up in the dbSNP

    Returns the dbSNP for the requested chrom as a dataframe.
    Downloads it if not found, loads it otherwise.
    """
    from idiva.download import download
    import gzip
    from idiva.db.dbSNP_urls import dbSNP_URLs

    dbSNP_URL = dbSNP_URLs[which_dbSNP]

    log.info(f"Downloading dbSNP excerpt {which_dbSNP}.")
    with download(dbSNP_URL).now.open(mode='rb') as fd:
        log.info(f"Reading dbSNP excerpt {which_dbSNP}.")
        with gzip.open(fd, mode='r') as fd:
            df = pd.read_csv(fd)

    return df
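
# Usage sketch (illustrative): load the chromosome-17 excerpt and keep only the
# records with CLNSIG codes 2 and 5, mirroring maker_merge_on_pos_ref_alt below
# (reading those codes as benign/pathogenic is an assumption).
def _example_dbsnp_excerpt():
    dbSNP_df = get_dbSNP_df(which_dbSNP=17)
    return dbSNP_df.loc[(dbSNP_df.CLNSIG == 2) | (dbSNP_df.CLNSIG == 5)]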
def keras_tuner_rs(self, which_tuner: str,
                   feature_list: typing.Optional[typing.Iterable[str]],
                   database, exp_name: str):
    """
    Launches keras tuner random search or hyperband optimization.

    HK, 2020-12-07
    """
    from idiva.clf.model_tuning.tuner import get_tuner

    args = TrainPhenomenetArgs(weighted_loss=True,
                               database=database,
                               feature_list=feature_list,
                               early_stopping_patience=20)

    train_data, train_labels, eval_data, eval_labels, weights = self.get_phenomenet_data(args)

    cb = tf.keras.callbacks.EarlyStopping(
        monitor='val_precision',
        min_delta=0,
        patience=args.early_stopping_patience,
        verbose=1,
        mode='max',
        baseline=None,
        restore_best_weights=True)

    tuner = get_tuner(which_tuner, train_data.shape[1], exp_name)
    tuner.search_space_summary()

    tuner.search(x=train_data,
                 y=train_labels,
                 epochs=100,
                 validation_data=(eval_data, eval_labels),
                 class_weight=weights,
                 batch_size=args.batch_size,
                 callbacks=[cb],
                 verbose=2)

    tuner.results_summary()

    best_model = tuner.get_best_models()[0]
    log.info('saving best tuner model.')
    best_model.save(f'best_tuner_model_{exp_name}')
def create_dbSNP_chrom_vcf(dl_path: Path,
                           which_chrom: str = 'NC_000017.10',
                           dbSNP_path: typing.Optional[Path] = None) -> Path:
    """
    Extracts all variants for the corresponding chromosome from the dbSNP.
    Downloads dbSNP if not found under dl_path or dbSNP_path.
    The extracted variants will be saved under dl_path.
    """
    import hashlib
    import base64

    key = base64.urlsafe_b64encode(hashlib.sha256(which_chrom.encode()).digest()).decode()

    file_names = {
        'all_vcf': 'GRCh37_latest_dbSNP_all.vcf',
        'all_gzip': 'GRCh37_latest_dbSNP_all.vcf.gz',
    }

    dbSNP_path = dbSNP_path or dl_path / file_names['all_vcf']

    if not os.path.exists(dbSNP_path):
        if not os.path.exists(dl_path / file_names['all_gzip']):
            log.info('Downloading dbSNP database.')
            wget_command = f'wget -P {dl_path} ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/' \
                           f'GRCh37_latest/refseq_identifiers/GRCh37_latest_dbSNP_all.vcf.gz'
            os.system(wget_command)

        log.info('Unpacking dbSNP database.')
        gunzip_command = f'gunzip {dl_path / file_names["all_gzip"]}'
        os.system(gunzip_command)
    else:
        log.info('Unpacked dbSNP database found.')

    log.info(f'Extracting {which_chrom} from the dbSNP database.')
    grep_command = f"grep '^{which_chrom}' {dbSNP_path} > {dl_path}/temp.vcf"
    os.system(grep_command)

    out_name = f"dbSNP_{key[0:12]}.vcf"
    log.info(f'Extracting vcf header, creating file {dl_path}/{out_name}.')
    os.system(f"head -n38 {dbSNP_path} > {dl_path}/{out_name}")
    os.system(f'cat {dl_path}/temp.vcf >> {dl_path}/{out_name}; rm {dl_path}/temp.vcf')

    return dl_path / out_name
def create_dbSNP_df(dbSNP_file_path: Path, out_base: Path,
                    which_chrom: typing.Union[int, str] = 17) -> None:
    """
    Converts the dbSNP vcf file to a dataframe
    """
    out_path = out_base / f'GRCh37_latest_dbSNP_all_chrom{which_chrom}.csv.gz'
    log.info(f"Converting {dbSNP_file_path} to {out_path}")

    assert out_base.exists()

    with open(dbSNP_file_path, mode='r') as fd:
        df = dbSNP_to_df(ReadVCF(fd),
                         which_chrom='NC' if which_chrom == '_all'
                         else f'NC_{str(which_chrom).zfill(6)}')

        df.to_csv(out_path, index=False, compression="gzip")

        if not len(df):
            log.warning(f'created dataframe is empty for chrom {which_chrom}')
def data():
    log.info("Obtaining SC2Disease.")
    from idiva.db.sc2disease import allgwas_reference
    sc2d = dict(allgwas_reference()[["ID", "SC2D"]].set_index("ID").SC2D)

    log.info("Obtaining ClinVar.")
    from idiva.db.clinvar import clinvar_rsid_clnsig
    clnsig = dict(clinvar_rsid_clnsig()[["ID", "ClnSig"]].set_index("ID").ClnSig)

    log.info("Reading p-values from VCF.")
    for dataline in tqdm(vcf):
        try:
            p = [
                float(unlist1(re.findall(rF"{FX}=([^;]+);", dataline.info.strip() + ";")))
                for FX in ["F0", "F1", "F2"]
            ]

            yield (
                dataline.chrom, dataline.pos, dataline.id,
                *p,
                sc2d.get(dataline.id, "N/A"),
                clnsig.get(dataline.id, "N/A"),
            )
        except ValueError:
            # Maybe encountered a '.'
            pass
def maker_merge_on_pos_ref_alt() -> pd.DataFrame:
    log.info("Creating database labels dataframe.")

    dbSNP_df = get_dbSNP_df(which_dbSNP)
    reduced_dbSNP = dbSNP_df.loc[(dbSNP_df.CLNSIG == 2) | (dbSNP_df.CLNSIG == 5)]

    def maker_clinvar() -> pd.DataFrame:
        """
        creates the clinvar dataframe
        """
        from idiva.db import clinvar_open
        from idiva.io import ReadVCF
        from idiva.db.clinvar import clinvar_to_df

        log.info('Making clinvar df.')

        with clinvar_open(which=clinvar_file) as fd:
            return clinvar_to_df(ReadVCF(fd))

    df_clinvar = cache_df(name=("clinvar_" + clinvar_file),
                          key=[clinvar_file],
                          df_maker=maker_clinvar)

    df_clinvar_reduced = df_clinvar[df_clinvar['CLNSIG'].isin({'Pathogenic', 'Benign'})]

    merge_on_pos_ref_alt = join_clinvar_dbSNP(df_clinvar=df_clinvar_reduced,
                                              df_dbSNP=reduced_dbSNP,
                                              with_chrom_col=with_chrom_col)

    types = {'pos': int, 'ref': 'category', 'alt': 'category', 'class': int}
    if with_chrom_col:
        types['chrom'] = int

    return merge_on_pos_ref_alt.astype(types)
def setup_sift(self):
    import requests

    cache = (Path(__file__).parent.parent.parent.parent / "input/download_cache").resolve()
    assert cache.is_dir()

    file_path = os.path.join(cache, 'SIFT4G_Annotator.jar')
    file_name = str(cache) + '/SIFT4G_Annotator.jar'

    if not os.path.isfile(file_path):
        log.info("Downloading sift annotator (1.3MB)")
        results = requests.get(
            'https://github.com/pauline-ng/SIFT4G_Annotator/raw/master/SIFT4G_Annotator.jar')
        with open(file_name, 'wb') as f:
            f.write(results.content)
    else:
        log.info("sift annotator found")

    dir_path = os.path.join(cache, 'GRCh37.74')
    dir_name_gz = str(cache) + '/GRCh37.74.zip'

    if not os.path.isdir(dir_path):
        if not os.path.exists(dir_name_gz):
            log.info('Downloading GRCh37.74 database.')
            wget_command = 'wget -O ' + dir_name_gz + \
                           ' https://sift.bii.a-star.edu.sg/sift4g/public/Homo_sapiens/GRCh37.74.zip'
            os.system(wget_command)

        log.info('Unpacking GRCh37.74 database.')
        gunzip_command = 'unzip ' + dir_name_gz + ' -d ' + str(cache)
        os.system(gunzip_command)
    else:
        log.info('Unpacked GRCh37.74 database found.')
def get_trained_phenomenet():
    """
    Downloads trained phenomenet from polybox if not found in download folder
    """
    base = Path(__file__).parent.parent / 'download'

    exp_str = 'exp_2020_12_07_14_53_54_611564'
    model_path = base / exp_str

    if not model_path.exists():
        command = f'wget -O {base / exp_str}.tar.gz https://polybox.ethz.ch/index.php/s/bSwajA85feMAFiq/download'
        log.info(f'Downloading phenomenet with command {command}')
        os.system(command)

        command = f'tar -zxvf {base / exp_str}.tar.gz -C {base}'
        log.info(f'Extracting model with command {command}')
        os.system(command)

        log.info('removing compressed folder')
        os.system(f'rm {base / exp_str}.tar.gz')

    log.info('Loading trained phenomenet.')
    return tf.keras.models.load_model(model_path)
# HK, 2020-12-05

import idiva.io
from idiva import log

import typing
import pandas as pd


def db_classifier(*, case: idiva.io.ReadVCF, ctrl: idiva.io.ReadVCF) -> object:
    """
    Classifies the case-control df by querying the clinvar and dbSNP data.
    """
    from idiva.clf.df import c5_df
    from idiva.db import db

    log.info("Running the database classifier.")

    log.info('Retrieving case df.')
    case_control = c5_df(case)

    from idiva.clf.df import apply_dtype

    db_PosRefAlt = db.get_db_label_df(which_dbSNP=int(case_control.iloc[0]['CHROM']))

    merge_on_PosRefAlt = case_control.merge(db_PosRefAlt,
                                            left_on=['POS', 'REF', 'ALT'],
                                            right_on=['pos', 'ref', 'alt'],
                                            how='left')

    # filling all missing values with 2, for "unknown"
    merge_on_PosRefAlt['class'] = merge_on_PosRefAlt['class'].fillna(2)

    log.info(
def create_test_set_v2(self, *, case_vcf, ctrl_vcf) -> pd.DataFrame:
    """
    creates test set by first reducing the number of samples and then adding sift and cadd scores
    """
    cache = (Path(__file__).parent.parent.parent.parent / "input/download_cache").resolve()
    assert cache.is_dir()

    file_name = 'test.csv'
    file_path = os.path.join(cache, file_name)
    file_name = str(cache) + '/' + file_name

    dataframe_base = None

    # if file does not exist
    if not os.path.isfile(file_path):
        log.info("Annotate test set (~1h)")

        fextr = FeatureExtractor(case_vcf=case_vcf, ctrl_vcf=ctrl_vcf)
        dataframe_base = fextr.get_reduced_dataframe(case_vcf=case_vcf, ctrl_vcf=ctrl_vcf)

        dataframe_sift = self.add_sift_score(dataframe_base, 'our')
        dataframe_sift['CHROM'] = pd.to_numeric(
            dataframe_sift[['CHROM']].apply(self.translate_chrom, axis=1))

        dataframe_cadd = dataframe_base
        dataframe_cadd = dataframe_cadd[['CHROM', 'POS', 'ID', 'REF', 'ALT']]
        dataframe_cadd = self.add_cadd_score(dataframe_cadd)

        dataframe = dataframe_cadd
        dataframe[['SIFT_SCORE', 'SIFT_SUCC']] = dataframe_sift[['SIFT_SCORE', 'SIFT_SUCC']]

        dataframe['SIFT_SUCC'] = dataframe['SIFT_SUCC'].fillna(value=0)
        dataframe['SIFT_SCORE'] = dataframe['SIFT_SCORE'].fillna(value=0.05)
        dataframe['CADD_SUCC'] = dataframe['CADD_SUCC'].fillna(value=0)
        dataframe['CADD_PHRED'] = dataframe['CADD_PHRED'].fillna(value=30)

        dataframe = self.encode_ref_alt(dataframe)

        dataframe = dataframe[['CHROM', 'POS', 'VAR', 'CADD_PHRED', 'CADD_SUCC',
                               'SIFT_SCORE', 'SIFT_SUCC']]

        cols = ['CHROM', 'POS', 'VAR', 'CADD_PHRED', 'CADD_SUCC', 'SIFT_SCORE', 'SIFT_SUCC']
        dataframe[cols] = dataframe[cols].apply(pd.to_numeric, errors='coerce', axis=1)

        log.info(
            "Test set is stored as /input/download_cache/test.csv. "
            "If you want to create a new test set for another vcf dataset, you have to delete the previous one.")

        dataframe.to_csv(file_name, sep='\t')

    # load stored test set
    else:
        log.info("load stored test set")
        dataframe = pd.read_csv(file_name, sep='\t', index_col='CPA_ID')

    return dataframe
def process(*, case_vcf: Path, ctrl_vcf: Path, out_dir: Path):
    from idiva.io import open_maybe_gz
    from idiva.io import head

    with open_maybe_gz(case_vcf) as case_full, open_maybe_gz(ctrl_vcf) as ctrl_full:
        assert isinstance(case_full, io.TextIOBase)
        assert isinstance(ctrl_full, io.TextIOBase)

        with head(case_full) as case_head, head(ctrl_full) as ctrl_head:
            log.info("======================")
            log.info("Processing VCF (HEAD).")
            log.info("======================")

            post(process_vcf(case=ReadVCF(case_head), ctrl=ReadVCF(ctrl_head),
                             out=(out_dir / "head")))

        log.info("======================")
        log.info("Processing VCF (FULL).")
        log.info("======================")

        post(process_vcf(case=ReadVCF(case_full), ctrl=ReadVCF(ctrl_full),
                         out=(out_dir / "full")))
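
# Usage sketch (illustrative): main() above invokes this as process(**parse_args());
# a direct call looks roughly like this. The paths are placeholders; plain or gzipped
# VCFs both work via open_maybe_gz.
def _example_process():
    process(case_vcf=Path("input/case_processed.vcf.gz"),
            ctrl_vcf=Path("input/control_processed.vcf.gz"),
            out_dir=Path("output"))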
info_supp = c5_df(case)
info_meta = []

# # #

from idiva.clf.placeholder import placeholder, failure
from idiva.stat.vcf_to_fisher import vcf_to_fisher
from idiva.db.sc2disease import allgwas
from idiva.db.db_clf import db_classifier

classifiers = [vcf_to_fisher, allgwas, db_classifier]

for classifier in classifiers:
    with case.rewind_when_done:
        log.info(F"=> Invoking the annotation `{classifier.__name__}`.")
        try:
            response = classifier(case=case, ctrl=ctrl)

            info_meta.append(response.info)

            assert set(response.id_cols).issubset(set(info_supp.columns))

            df = response.df[set(response.id_cols) | set(response.info.keys())]

            info_supp = join(case=info_supp, ctrl=df, how="left", on=list(response.id_cols))

            del response
        except KeyboardInterrupt:
            raise
        except Exception as ex:
            log.exception(F"=> Annotation `{classifier.__name__}` failed ({ex}).")
        else:
def add_sift_score(self, dataframe: pd.DataFrame, type: str) -> pd.DataFrame:
    """
    Appends sift scores and success to dataframe.
    The dataframe needs at least the following columns: CHROM, POS, REF, ALT
    """
    self.setup_sift()
    log.info("creating sift scores for " + type)
    # https://github.com/pauline-ng/SIFT4G_Annotator/raw/master/SIFT4G_Annotator.jar

    # make dataframe compatible for sift annotator
    if 'ID' not in dataframe:
        dataframe['ID'] = '.'
    if 'QUAL' not in dataframe:
        dataframe['QUAL'] = '.'
    if 'FILTER' not in dataframe:
        dataframe['FILTER'] = '.'
    if 'INFO' not in dataframe:
        dataframe['INFO'] = '.'

    dataframe = dataframe.fillna(value=".")

    cache = (Path(__file__).parent.parent.parent.parent / "input/download_cache").resolve()
    assert cache.is_dir()

    file_path = str(cache) + "/" + type + "_sift.vcf"

    # create vcf file
    dataframe.rename(columns={'CHROM': '#CHROM'}).to_csv(file_path, sep='\t', index=False)

    file_name = type
    sift_folder = str(cache) + "/" + file_name + "_sift"

    # run the SIFT4G annotator on the vcf file
    cmd = 'java -jar ' + str(cache) + '/SIFT4G_Annotator.jar -c -i ' + file_path + \
          ' -d ' + str(cache) + '/GRCh37.74 -r ' + sift_folder

    args = shlex.split(cmd)
    process = Popen(args)
    process.wait()

    sift_file = sift_folder + "/" + file_name + "_sift_SIFTannotations.xls"
    sift_dataframe = pd.read_table(sift_file)

    dataframe['SIFT_SCORE'] = np.nan
    dataframe['SIFT_SUCC'] = np.nan

    sift_iter = sift_dataframe.iterrows()
    next_sift = next(sift_iter, None)

    for idx, row in tqdm(dataframe.iterrows(), total=len(dataframe),
                         postfix='inserting sift scores'):
        if next_sift is not None:
            if row['POS'] == next_sift[1]['POS']:
                if not pd.isna(next_sift[1]['SIFT_SCORE']):
                    sift_score = next_sift[1]['SIFT_SCORE']
                    sift_succ = 1
                else:
                    sift_score = np.nan
                    sift_succ = 0
                next_sift = next(sift_iter, None)
            else:
                sift_score = np.nan
                sift_succ = 0
        else:
            sift_score = np.nan
            sift_succ = 0

        dataframe.loc[idx, 'SIFT_SCORE'] = sift_score
        dataframe.loc[idx, 'SIFT_SUCC'] = sift_succ

    return dataframe
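
# Usage sketch (illustrative): annotate a small variant table with SIFT scores.
# `DataHandler` is an assumed name for the class that owns add_sift_score/setup_sift
# (referenced elsewhere as self.dataHandler); the variants below are placeholders.
def _example_add_sift_score():
    import pandas as pd
    dh = DataHandler()
    variants = pd.DataFrame({'CHROM': ['17', '17'],
                             'POS': [41197701, 41199659],
                             'REF': ['G', 'A'],
                             'ALT': ['A', 'C']})
    return dh.add_sift_score(variants, 'our')  # adds SIFT_SCORE and SIFT_SUCC columns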
def preprocess_clinvar(self, which='vcf_37'):
    log.info("preprocessing clinvar file")

    dataframe_base = self.get_clinvar_clf_extended()
    dataframe_base = dataframe_base.reset_index(drop=True)

    labels = dataframe_base['labels']
    dataframe_base = dataframe_base.drop('labels', axis=1)

    dataframe_sift = self.add_sift_score(dataframe_base, 'clinvar')
    dataframe_sift['CHROM'] = pd.to_numeric(
        dataframe_sift[['CHROM']].apply(self.translate_chrom, axis=1))

    dataframe_cadd = dataframe_base
    dataframe_cadd['CHROM'] = pd.to_numeric(
        dataframe_cadd[['CHROM']].apply(self.translate_chrom, axis=1))
    dataframe_cadd = dataframe_cadd[['CHROM', 'POS', 'ID', 'REF', 'ALT']]
    dataframe_cadd = dataframe_cadd.reset_index(drop=True)

    cache = (Path(__file__).parent.parent.parent.parent / "input/download_cache").resolve()
    assert cache.is_dir()

    # get cadd annotations
    from idiva.download import download
    data = download('https://polybox.ethz.ch/index.php/s/GRgDYHOAaw75D60/download').now
    with data.open() as fd:
        cadd_scores = pd.read_csv(fd, sep='\t',
                                  usecols=range(1, 6),
                                  comment='#',
                                  names=['CHROM', 'POS', 'REF', 'ALT',
                                         'CADD_SCORE', 'CADD_PHRED'])

    cadd_scores['CADD_SUCC'] = 1

    dataframe_cadd[['CADD_PHRED', 'CADD_SUCC']] = cadd_scores[['CADD_PHRED', 'CADD_SUCC']]

    dataframe = dataframe_cadd
    dataframe[['SIFT_SCORE', 'SIFT_SUCC']] = dataframe_sift[['SIFT_SCORE', 'SIFT_SUCC']]

    dataframe['SIFT_SCORE'] = dataframe['SIFT_SCORE'].fillna(value=0.05)
    dataframe['CADD_SUCC'] = dataframe['CADD_SUCC'].fillna(value=0)
    dataframe['CADD_PHRED'] = dataframe['CADD_PHRED'].fillna(value=30)

    dataframe = self.encode_ref_alt(dataframe)

    dataframe = dataframe[['CHROM', 'POS', 'VAR', 'CADD_PHRED', 'CADD_SUCC',
                           'SIFT_SCORE', 'SIFT_SUCC']]
    dataframe.insert(7, 'label', labels)

    cols = ['CHROM', 'POS', 'VAR', 'CADD_PHRED', 'CADD_SUCC',
            'SIFT_SCORE', 'SIFT_SUCC', 'label']
    dataframe[cols] = dataframe[cols].apply(pd.to_numeric, errors='coerce', axis=1)

    file_path = str(cache) + "/training.csv"
    dataframe.to_csv(file_path, sep='\t', index=False)

    return dataframe
# RA, 2020-12-04

# Template for a classifier interface to main.
# Make a copy for your implementation.

import idiva.io
from idiva import log


def placeholder(*, case: idiva.io.ReadVCF, ctrl: idiva.io.ReadVCF):
    from idiva.clf.df import c5_df

    log.info("Running the `placeholder`.")

    # DUMMY result dataframe
    result = c5_df(case).assign(Dummy1=1, Dummy2=2.0)

    # The result should contain the columns
    #     CHROM, POS, ID
    # or
    #     CHROM, POS, ALT
    # or
    #     CHROM, POS, REF, ALT
    # but can have fewer rows than `case`.
    #
    # Specify these columns in `id_cols`.

    class response:
        id_cols = ["CHROM", "POS", "ID"]
        info = {
def save_experiment(self, values: dict):
    log.info(f'Writing to experiment dataframe: {values}')

    values['experiment_uid'] = self.experiment_uid

    experiments_dataframe = self.experiments_dataframe.append(values, ignore_index=True)
    experiments_dataframe.to_csv(self.experiment_df, index=False)
import typing
import re

import pandas

import idiva.io
import idiva.utils
from idiva import log

from tqdm import tqdm


def vcf_to_fisher(*, case: idiva.io.ReadVCF, ctrl: idiva.io.ReadVCF):
    from idiva.io import cache_df
    from idiva.clf.df import v0_df, join
    from idiva.stat import v0_fisher

    log.info("Creating a slim case/ctrl dataframe.")

    def df_maker1():
        # Note: v0_df is parallelized
        return join(case=v0_df(case), ctrl=v0_df(ctrl))

    cached_file_prefix = "case_ctrl__v0df"
    df = cache_df(cached_file_prefix, key=[case.md5, ctrl.md5], df_maker=df_maker1)
    log.debug(df)

    log.info("Computing the p-values case vs control.")

    def df_maker2():
# HK, 2020-12-08

import idiva.io
from idiva import log
from idiva.clf.utils import get_trained_phenomenet
from idiva.dh import datahandler


def phenomenet_classifier_basic(*, case: idiva.io.ReadVCF, ctrl: idiva.io.ReadVCF) -> object:
    """
    Classifies the case-control df with a pretrained classifier.
    """
    from idiva.clf.df import c5_df

    log.info("Running the phenomenet classifier.")

    model = get_trained_phenomenet(which='exp_2020_12_08_11_05_12_615223')

    case_control = c5_df(case)
    case_control['var'] = case_control[['REF', 'ALT']].apply(
        lambda x: datahandler.MAPPING[x[0]][x[1]], axis=1)

    clf_data = case_control[['CHROM', 'POS', 'var']]
    # columns need to be in the same order as they were for the training of the classifier:
    clf_data = clf_data.reindex(sorted(clf_data.columns), axis=1)

    predictions = model.predict(clf_data.to_numpy().astype('float32'))

    case_control['phenom_class'] = predictions

    class response: