Example No. 1
    def predict(self, vcf_test1, vcf_test2=None) -> pd.DataFrame:
        """
        Returns predictions for the given vcf file(s).
        """

        if vcf_test2 is not None:
            # fuse two files into one dataframe
            x_test = self.dataHandler.create_test_set_v2(case_vcf=vcf_test1,
                                                         ctrl_vcf=vcf_test2)

        else:
            # create test features
            x_test = self.dataHandler.create_test_set(vcf_test1)

        # make predictions
        x_test = x_test[[
            'CHROM', 'POS', 'CADD_SUCC', 'CADD_PHRED', 'SIFT_SCORE',
            'SIFT_SUCC', 'VAR'
        ]]
        log.info('Predicting class labels.')
        y_pred = self.model.predict(
            x_test.reindex(sorted(x_test.columns), axis=1))

        # create dataframe
        df = pd.DataFrame(y_pred)

        return df
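A hedged usage sketch for the method above; `clf` stands for a fitted instance of the (unnamed) class that owns predict(), and the file paths are placeholders:

# `clf` is assumed to be a fitted instance of the surrounding classifier class;
# the vcf paths are hypothetical.
predictions = clf.predict('case.vcf', 'ctrl.vcf')   # two-file variant
# predictions = clf.predict('case.vcf')             # single-file variant
print(predictions.head())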
Example No. 2
    def train_phenomenet(
        self, args: TrainPhenomenetArgs
    ) -> typing.Tuple[tf.keras.callbacks.History, tf.keras.Sequential]:
        """
        Trains the phenomenet with the given parameters.
        HK, 2020-12-07
        """
        log.info(f'Training phenomenet on {args.database} database')
        train_data, train_labels, eval_data, eval_labels, weights = self.get_phenomenet_data(
            args)
        phenomenet = Phenomenet(train_data.shape[1])
        phenomenet = phenomenet.get_phenomenet()
        log.info(f'Training phenomenet for up to {args.epochs} epochs.')

        cb = tf.keras.callbacks.EarlyStopping(
            monitor='val_precision',
            min_delta=0,
            patience=args.early_stopping_patience,
            verbose=1,
            mode='max',
            baseline=None,
            restore_best_weights=True)

        history = phenomenet.fit(train_data,
                                 train_labels,
                                 validation_data=(eval_data, eval_labels),
                                 batch_size=args.batch_size,
                                 verbose=2,
                                 epochs=args.epochs,
                                 class_weight=weights,
                                 callbacks=[cb])

        return history, phenomenet
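A usage sketch mirroring Example No. 12 further below; the argument values are illustrative assumptions (only the keyword names seen in Examples No. 12 and 14 are reused), and fields such as epochs and batch_size are assumed to have defaults:

# TrainPhenomenetArgs and Classifier come from the project; import paths are omitted here.
args = TrainPhenomenetArgs(weighted_loss=True,
                           database='clinvar',            # hypothetical database name
                           feature_list=None,
                           early_stopping_patience=20)
cv = Classifier()
history, model = cv.train_phenomenet(args=args)
print(history.history.keys())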
Example No. 3
def main():
    try:
        process(**parse_args())
    except KeyboardInterrupt:
        import logging
        log.info("Aborted. Bye-bye.")
        logging.shutdown()
Example No. 4
    def feature_extraction_chunks(self, ctrl_vcf_file: str, case_vcf_file: str):
        """
        Returns a fitted Perceptron classifier for the given vcf files.
        The classifier is trained in chunks, where each chunk covers a range of patients,
        so the classifier iterates column-wise over the vcf files.
        Both files are divided into the same number of chunks, so the individual chunk sizes can differ.
        """
        log.info("Fit linear classifier and reduce number of variants")

        clf = Perceptron()

        cache = (Path(__file__).parent.parent.parent.parent / "input/download_cache").resolve()
        assert cache.is_dir()

        # create unique index
        id = None

        with open(str(cache) + "/" + ctrl_vcf_file) as ctrl_vcf:
            ctrl_reader = ReadVCF(ctrl_vcf)
            with open(str(cache) + "/" + case_vcf_file) as case_vcf:
                case_reader = ReadVCF(case_vcf)
                dataframe = align(ctrl=ctrl_reader, case=case_reader)
                id = dataframe.index

        with open(str(cache) + "/" + ctrl_vcf_file) as ctrl_vcf:
            with open(str(cache) + "/" + case_vcf_file) as case_vcf:
                reader_ctrl = ReadVCF(ctrl_vcf)
                reader_case = ReadVCF(case_vcf)

                header_ctrl = reader_ctrl.header

                header_case = reader_case.header

                exclude = [2, 3, 5, 6, 7, 8]

                names_ctrl = [i for idx, i in enumerate(header_ctrl) if idx not in exclude]
                names_case = [i for idx, i in enumerate(header_case) if idx not in exclude]

                len_ctrl = len(header_ctrl) - 9
                len_case = len(header_case) - 9

                min_batch_size = min([len_ctrl, len_case, 50])

                number_of_batches = int(max([np.ceil(len_ctrl / min_batch_size), np.ceil(len_case / min_batch_size)]))

                batch_size_ctrl = int(np.ceil(len_ctrl / number_of_batches))
                batch_size_case = int(np.ceil(len_case / number_of_batches))

                batches_ctrl = [i * batch_size_ctrl for i in range(number_of_batches)]
                batches_case = [i * batch_size_case for i in range(number_of_batches)]

                batches_ctrl.append(len_ctrl)
                batches_case.append(len_case)

                for idx in tqdm(range(number_of_batches), total=number_of_batches, postfix='feature selection'):
                    clf = self.feature_extraction_batch(reader_ctrl, reader_case, names_ctrl, names_case,
                                                        batches_ctrl, batches_case, idx, clf, id)

        return clf, id
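To illustrate the batching arithmetic above, a standalone sketch that reproduces the chunk-size computation for hypothetical sample counts (120 control and 80 case patients):

import numpy as np

len_ctrl, len_case = 120, 80                                          # hypothetical counts
min_batch_size = min([len_ctrl, len_case, 50])                        # 50
number_of_batches = int(max([np.ceil(len_ctrl / min_batch_size),
                             np.ceil(len_case / min_batch_size)]))    # max(3, 2) = 3
batch_size_ctrl = int(np.ceil(len_ctrl / number_of_batches))          # 40
batch_size_case = int(np.ceil(len_case / number_of_batches))          # 27
batches_ctrl = [i * batch_size_ctrl for i in range(number_of_batches)] + [len_ctrl]  # [0, 40, 80, 120]
batches_case = [i * batch_size_case for i in range(number_of_batches)] + [len_case]  # [0, 27, 54, 80]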
Example No. 5
def maker_clinvar() -> pd.DataFrame:
    """
    creates the clinvar dataframe
    """
    from idiva.db import clinvar_open
    from idiva.io import ReadVCF
    from idiva.db.clinvar import clinvar_to_df
    log.info('Making clinvar df.')
    with clinvar_open(which=clinvar_file) as fd:
        return clinvar_to_df(ReadVCF(fd))
Example No. 6
def cache_df(
    name: str,
    key: typing.Union[str, typing.List[str]],
    df_maker: typing.Callable[[], pandas.DataFrame],
) -> pandas.DataFrame:
    """
    Usage:
        df = cache_df(name="my_dataframe", key="v0.1", df_maker=fn_that_makes_df)

    `name` should be a human-readable mnemonic that will appear as part of the file name.
    `key` is a string or list-of-strings that will be hashed and appended to file name.
    `df_maker` is a function that takes no arguments and returns a pandas DataFrame.

    RA, 2020-11-22
    """

    assert name
    assert key is not None, "Provide an empty list or an empty string as `key`."
    assert df_maker is not None

    if isinstance(key, list):
        key = "+".join(key)

    assert isinstance(key, str)

    key = base64.urlsafe_b64encode(hashlib.sha256(
        key.encode()).digest()).decode()

    name = name + (("__" + key[0:12]) if key else "")
    file = (BASE / name).with_suffix(".gz")
    assert not file.is_dir()

    if True:
        # Fix legacy filename
        old_name = name + (("_" + key[0:12]) if key else "")
        old_name = (BASE / old_name).with_suffix(".gz")
        if old_name.is_file():
            import os
            os.rename(old_name, file)

    if file.is_file():
        log.debug(F"Loading DataFrame from {file.name}.")
        df = pandas.read_csv(file, sep=SEP, compression="infer")
        df.set_index(df.columns[0], inplace=True)
        # https://stackoverflow.com/a/61390345
    else:
        log.info(
            F"Creating a potentially large DataFrame for the first time ({file.name})."
        )
        df = df_maker()
        log.debug(F"Saving DataFrame to {file.name}.")
        df.to_csv(file, sep=SEP, compression="gzip", index=True)

    return df
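A small usage sketch expanding on the docstring; the maker function and key are made up, and the import path follows the `from idiva.io import cache_df` line in the vcf_to_fisher example further below:

import pandas
from idiva.io import cache_df

def make_numbers() -> pandas.DataFrame:
    # Hypothetical maker: only runs on a cache miss.
    return pandas.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})

# The first call creates and saves the DataFrame; later calls with the same
# name/key load it from the cached .gz file.
df = cache_df(name="numbers", key="v0.1", df_maker=make_numbers)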
Example No. 7
def checksum_md5(fd: io.TextIOBase) -> str:
    log.info("Computing md5 hash of stream.")
    import hashlib
    with seek_then_rewind(fd):
        md5 = hashlib.md5()
        try:
            for chunk in iter(lambda: fd.read(128 * md5.block_size).encode(),
                              b''):
                md5.update(chunk)
        except KeyboardInterrupt:
            pass
    md5 = md5.hexdigest()
    log.debug(F"Computed md5 = {md5}.")
    return md5
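A usage sketch, assuming checksum_md5 is called from within the project (it relies on seek_then_rewind and the module-level log); the file name is a placeholder:

# Compute the md5 of a local VCF file opened in text mode.
with open("example.vcf", mode="r") as fd:
    digest = checksum_md5(fd)
print(digest)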
Example No. 8
def get_trained_phenomenet(which: str = 'exp_2020_12_07_14_53_54_611564'):
    """
    Downloads the trained phenomenet from polybox if it is not found in the download folder.
    which: the uid of the trained phenomenet to load.
    """
    base = Path(__file__).parent.parent / 'download'

    URLS = {
        'exp_2020_12_08_11_05_12_615223':
        'https://polybox.ethz.ch/index.php/s/YxWDBaxle44f1da/download',
        'exp_2020_12_07_14_53_54_611564':
        'https://polybox.ethz.ch/index.php/s/bSwajA85feMAFiq/download'
    }

    exp_str = which
    model_path = base / exp_str
    if not model_path.exists():
        command = f'wget -O {base / exp_str}.tar.gz {URLS[which]}'
        log.info(f'Downloading phenomenet with command {command}')
        os.system(command)
        command = f'tar -zxvf {base / exp_str}.tar.gz -C {base}'
        log.info(f'Extracting model with command {command}')
        os.system(command)
        log.info('removing compressed folder')
        os.system(f'rm {base / exp_str}.tar.gz')
    log.info('Loading trained phenomenet.')
    return tf.keras.models.load_model(model_path)
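A usage sketch; the loaded object is a regular tf.keras model, so standard Keras calls apply (the import path matches the one used in the phenomenet classifier example further below):

from idiva.clf.utils import get_trained_phenomenet

model = get_trained_phenomenet(which='exp_2020_12_08_11_05_12_615223')
model.summary()
# predictions = model.predict(x)  # x: a float32 feature array shaped like the training data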
Example No. 9
def get_train_test(data, pipeline: Pipeline = None):
    """
    HK, 2020-11-12
    """
    log.info('Splitting and preprocessing data.')
    df_train, df_eval = train_test_split(data, test_size=0.2, shuffle=True)

    train_data = df_train.loc[:, df_train.columns != 'label'].to_numpy()
    train_labels = df_train['label'].to_numpy()
    eval_data = df_eval.loc[:, df_eval.columns != 'label'].to_numpy()
    eval_labels = df_eval['label'].to_numpy()
    if pipeline is not None:
        pipeline.fit(train_data, train_labels)
        train_data = pipeline.transform(train_data)
        eval_data = pipeline.transform(eval_data)
    return train_data, train_labels, eval_data, eval_labels
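A minimal sketch of calling get_train_test with a scikit-learn preprocessing pipeline; the toy DataFrame and its columns are made up for illustration, and the call assumes the function above is in scope:

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Toy data: two numeric features plus the binary 'label' column expected by get_train_test.
rng = np.random.default_rng(0)
data = pd.DataFrame({'f1': rng.normal(size=100),
                     'f2': rng.normal(size=100),
                     'label': rng.integers(0, 2, size=100)})

pipeline = Pipeline([('scale', StandardScaler())])
train_data, train_labels, eval_data, eval_labels = get_train_test(data, pipeline=pipeline)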
Example No. 10
def post(vcf_file: Path):
    log.info("=> Entering the postprocessing stage.")

    from idiva.stat.vcf_to_fisher import figure_pvalues
    from idiva.io.vcf import SEP

    with ReadVCF.open(vcf_file) as vcf:
        for px in figure_pvalues(vcf):
            file = vcf_file.parent / px.info['name proposal']
            log.info(F"Saving figure and data to {file}.* .")

            px.f.savefig(file.with_suffix(".png"))

            df: pandas.DataFrame = px.info['df']
            df.to_csv(file.with_suffix(".csv"), sep=SEP)
Example No. 11
def allgwas_reference() -> pandas.DataFrame:
    from idiva.download import download
    log.info(F"Opening (cached) {URL}")
    with download(URL).now.open() as fd:
        log.info(F"Processing.")
        s: pandas.Series
        df = pandas.read_csv(
            fd, sep='\t', names=["gene", "rs", "p", "chrom", "pos", "disease"])
        df = pandas.DataFrame(columns=["ID", "SC2D"],
                              data=[(rs, disease)
                                    for (rs,
                                         disease) in zip(df.rs, df.disease)
                                    for rs in re.findall(r"rs[0-9]+", rs)])
        df = df.groupby('ID', as_index=False).agg(
            {'SC2D': lambda s: '"' + ", ".join(sorted(set(s))) + '"'})
        return df
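The re.findall call above splits compound rs identifiers into one row per ID; a tiny standalone illustration of that step (the input string is made up):

import re

compound = "rs123;rs456 and rs789"
print(re.findall(r"rs[0-9]+", compound))   # ['rs123', 'rs456', 'rs789']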
Example No. 12
def train_phenomenet(args: TrainPhenomenetArgs):
    cv = Classifier()

    exp = Experiment()
    checkpoint_dir = Path(__file__).parent.parent / 'clf/checkpoints'
    checkpoint_dir.mkdir(exist_ok=True, parents=True)
    checkpoint_path = checkpoint_dir / exp.experiment_uid

    # train phenomenet
    history, model = cv.train_phenomenet(args=args)
    end_epoch = len(history.history['loss'])
    values = {k: v[-1] for k, v in history.history.items()}
    values['model'] = 'phenomenet'
    values['epochs'] = end_epoch
    values = {**values, **args.__dict__}
    exp.save_experiment(values)
    log.info(f'saving model to {checkpoint_path}')
    model.save(checkpoint_path)
Example No. 13
def get_dbSNP_df(which_dbSNP: int = 17) -> pd.DataFrame:
    """
    which_dbSNP: integer indicating which chromosome is looked up in the dbSNP

    Returns the dbSNP for the requested chromosome as a dataframe.
    Downloads it if not found, otherwise loads it.
    """
    from idiva.download import download
    import gzip
    from idiva.db.dbSNP_urls import dbSNP_URLs
    dbSNP_URL = dbSNP_URLs[which_dbSNP]

    log.info(f"Downloading dbSNP excerpt {which_dbSNP}.")
    with download(dbSNP_URL).now.open(mode='rb') as fd:
        log.info(f"Reading dbSNP excerpt {which_dbSNP}.")
        with gzip.open(fd, mode='r') as fd:
            df = pd.read_csv(fd)

    return df
Example No. 14
    def keras_tuner_rs(self, which_tuner: str,
                       feature_list: typing.Optional[typing.Iterable[str]],
                       database, exp_name: str):
        """
        Launches keras tuner random search or hyperband optimization.
        HK, 2020-12-07
        """
        from idiva.clf.model_tuning.tuner import get_tuner

        args = TrainPhenomenetArgs(weighted_loss=True,
                                   database=database,
                                   feature_list=feature_list,
                                   early_stopping_patience=20)
        train_data, train_labels, eval_data, eval_labels, weights = self.get_phenomenet_data(
            args)

        cb = tf.keras.callbacks.EarlyStopping(
            monitor='val_precision',
            min_delta=0,
            patience=args.early_stopping_patience,
            verbose=1,
            mode='max',
            baseline=None,
            restore_best_weights=True)

        tuner = get_tuner(which_tuner, train_data.shape[1], exp_name)

        tuner.search_space_summary()

        tuner.search(x=train_data,
                     y=train_labels,
                     epochs=100,
                     validation_data=(eval_data, eval_labels),
                     class_weight=weights,
                     batch_size=args.batch_size,
                     callbacks=[cb],
                     verbose=2)

        tuner.results_summary()
        best_model = tuner.get_best_models()[0]
        log.info('saving best tuner model.')
        best_model.save(f'best_tuner_model_{exp_name}')
Example No. 15
def create_dbSNP_chrom_vcf(dl_path: Path,
                           which_chrom: str = 'NC_000017.10',
                           dbSNP_path: typing.Optional[Path] = None) -> Path:
    """
    Extracts all variants for the corresponding chromosome from dbSNP.
    Downloads dbSNP if it is not found under dl_path or dbSNP_path. The extracted variants are saved under dl_path.
    """
    import hashlib
    import base64
    key = base64.urlsafe_b64encode(
        hashlib.sha256(which_chrom.encode()).digest()).decode()
    file_names = {
        'all_vcf': 'GRCh37_latest_dbSNP_all.vcf',
        'all_gzip': 'GRCh37_latest_dbSNP_all.vcf.gz'
    }
    dbSNP_path = dbSNP_path or dl_path / file_names['all_vcf']

    if not os.path.exists(dbSNP_path):
        if not os.path.exists(dl_path / file_names['all_gzip']):
            log.info('Downloading dbSNP database.')
            wget_command = f'wget -P {dl_path} ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/' \
                           f'GRCh37_latest/refseq_identifiers/GRCh37_latest_dbSNP_all.vcf.gz'
            os.system(wget_command)
        else:
            log.info('Unpacking dbSNP database.')
            gunzip_command = f"gunzip {dl_path / file_names['all_gzip']}"
            os.system(gunzip_command)
    else:
        log.info('Unpacked dbSNP database found.')

    log.info(f'Extracting {which_chrom} from the dbSNP database.')
    grep_command = f"grep '^{which_chrom}' {dbSNP_path} > {dl_path}/temp.vcf"
    os.system(grep_command)
    out_name = f"dbSNP_{key[0:12]}.vcf"
    log.info(f'Extracting vcf header, creating file {dl_path}/{out_name}.')
    os.system(f"head -n38 {dbSNP_path} > {dl_path}/{out_name}")
    os.system(
        f'cat {dl_path}/temp.vcf >> {dl_path}/{out_name}; rm {dl_path}/temp.vcf'
    )

    return dl_path / out_name
Example No. 16
def create_dbSNP_df(dbSNP_file_path: Path,
                    out_base: Path,
                    which_chrom: typing.Union[int, str] = 17) -> None:
    """
    Converts the dbSNP vcf file to a dataframe
    """
    out_path = out_base / f'GRCh37_latest_dbSNP_all_chrom{which_chrom}.csv.gz'
    log.info(f"Converting {dbSNP_file_path} to {out_path}")
    assert out_base.exists()

    with open(dbSNP_file_path, mode='r') as fd:
        df = dbSNP_to_df(ReadVCF(fd),
                         which_chrom='NC' if which_chrom == '_all' else
                         f'NC_{str(which_chrom).zfill(6)}')

    df.to_csv(out_path, index=False, compression="gzip")
    if not len(df):
        log.warning(f'created dataframe is empty for chrom {which_chrom}')
Example No. 17
        def data():
            log.info("Obtaining SC2Disease.")
            from idiva.db.sc2disease import allgwas_reference
            sc2d = dict(allgwas_reference()[["ID",
                                             "SC2D"]].set_index("ID").SC2D)

            log.info("Obtaining ClinVar.")
            from idiva.db.clinvar import clinvar_rsid_clnsig
            clnsig = dict(clinvar_rsid_clnsig()[["ID", "ClnSig"
                                                 ]].set_index("ID").ClnSig)

            log.info("Reading p-values from VCF.")
            for dataline in tqdm(vcf):
                try:
                    p = [
                        float(
                            unlist1(
                                re.findall(rF"{FX}=([^;]+);",
                                           dataline.info.strip() + ";")))
                        for FX in ["F0", "F1", "F2"]
                    ]
                    yield (
                        dataline.chrom,
                        dataline.pos,
                        dataline.id,
                        *p,
                        sc2d.get(dataline.id, "N/A"),
                        clnsig.get(dataline.id, "N/A"),
                    )
                except ValueError:
                    # Maybe encountered a '.'
                    pass
Example No. 18
    def maker_merge_on_pos_ref_alt() -> pd.DataFrame:
        log.info("Creating database labels dataframe.")
        dbSNP_df = get_dbSNP_df(which_dbSNP)
        reduced_dbSNP = dbSNP_df.loc[(dbSNP_df.CLNSIG == 2) |
                                     (dbSNP_df.CLNSIG == 5)]

        def maker_clinvar() -> pd.DataFrame:
            """
            creates the clinvar dataframe
            """
            from idiva.db import clinvar_open
            from idiva.io import ReadVCF
            from idiva.db.clinvar import clinvar_to_df
            log.info('Making clinvar df.')
            with clinvar_open(which=clinvar_file) as fd:
                return clinvar_to_df(ReadVCF(fd))

        df_clinvar = cache_df(name=("clinvar_" + clinvar_file),
                              key=[clinvar_file],
                              df_maker=maker_clinvar)
        df_clinvar_reduced = df_clinvar[df_clinvar['CLNSIG'].isin(
            {'Pathogenic', 'Benign'})]

        merge_on_pos_ref_alt = join_clinvar_dbSNP(
            df_clinvar=df_clinvar_reduced,
            df_dbSNP=reduced_dbSNP,
            with_chrom_col=with_chrom_col)

        types = {
            'pos': int,
            'ref': 'category',
            'alt': 'category',
            'class': int
        }
        if with_chrom_col:
            types['chrom'] = int
        return merge_on_pos_ref_alt.astype(types)
Example No. 19
    def setup_sift(self):

        import requests

        cache = (Path(__file__).parent.parent.parent.parent /
                 "input/download_cache").resolve()
        assert cache.is_dir()

        file_path = os.path.join(cache, 'SIFT4G_Annotator.jar')

        file_name = str(cache) + '/SIFT4G_Annotator.jar'

        if not os.path.isfile(file_path):
            log.info("Downloading sift annotator (1.3MB)")
            results = requests.get(
                'https://github.com/pauline-ng/SIFT4G_Annotator/raw/master/SIFT4G_Annotator.jar'
            )
            with open(file_name, 'wb') as f:
                f.write(results.content)
        else:
            log.info("sift annotator found")

        dir_path = os.path.join(cache, 'GRCh37.74')

        dir_name_gz = str(cache) + '/GRCh37.74.zip'

        if not os.path.isdir(dir_path):
            if not os.path.exists(dir_name_gz):
                log.info('Downloading GRCh37.74 database.')
                wget_command = 'wget -O ' + dir_name_gz + ' https://sift.bii.a-star.edu.sg/sift4g/public/Homo_sapiens/GRCh37.74.zip'
                os.system(wget_command)
            else:
                log.info('Unpacking GRCh37 database.')
                gunzip_command = 'unzip ' + dir_name_gz + ' -d ' + str(cache)
                os.system(gunzip_command)
        else:
            log.info('Unpacked GRCh37 database found.')
Example No. 20
def get_trained_phenomenet():
    """
    Downloads the trained phenomenet from polybox if it is not found in the download folder.
    """
    base = Path(__file__).parent.parent / 'download'

    exp_str = 'exp_2020_12_07_14_53_54_611564'
    model_path = base / exp_str
    if not model_path.exists():
        command = f'wget -O {base / exp_str}.tar.gz https://polybox.ethz.ch/index.php/s/bSwajA85feMAFiq/download'
        log.info(f'Downloading phenomenet with command {command}')
        os.system(command)
        command = f'tar -zxvf {base / exp_str}.tar.gz -C {base}'
        log.info(f'Extracting model with command {command}')
        os.system(command)
        log.info('removing compressed folder')
        os.system(f'rm {base / exp_str}.tar.gz')
    log.info('Loading trained phenomenet.')
    return tf.keras.models.load_model(model_path)
Example No. 21
# HK, 2020-12-05

import idiva.io
from idiva import log
import typing
import pandas as pd


def db_classifier(*, case: idiva.io.ReadVCF, ctrl: idiva.io.ReadVCF) -> object:
    """
    Classifies the case-control df by querying the clinvar and dbSNP data.
    """
    from idiva.clf.df import c5_df
    from idiva.db import db

    log.info("Running the database classifier.")

    log.info('Retrieving case df.')
    case_control = c5_df(case)

    from idiva.clf.df import apply_dtype
    db_PosRefAlt = db.get_db_label_df(
        which_dbSNP=int(case_control.iloc[0]['CHROM']))

    merge_on_PosRefAlt = case_control.merge(db_PosRefAlt,
                                            left_on=['POS', 'REF', 'ALT'],
                                            right_on=['pos', 'ref', 'alt'],
                                            how='left')
    # filling all missing values with 2, for "unknown"
    merge_on_PosRefAlt['class'] = merge_on_PosRefAlt['class'].fillna(2)
    log.info(
Example No. 22
    def create_test_set_v2(self, *, case_vcf, ctrl_vcf) -> pd.DataFrame:
        """
        Creates the test set by first reducing the number of samples and then adding SIFT and CADD scores.
        """
        cache = (Path(__file__).parent.parent.parent.parent /
                 "input/download_cache").resolve()
        assert cache.is_dir()

        file_name = 'test.csv'

        file_path = os.path.join(cache, file_name)

        file_name = str(cache) + '/' + file_name

        dataframe_base = None

        # if the file does not exist
        if not os.path.isfile(file_path):
            log.info("Annotate test set (~1h)")

            fextr = FeatureExtractor(case_vcf=case_vcf, ctrl_vcf=ctrl_vcf)
            dataframe_base = fextr.get_reduced_dataframe(case_vcf=case_vcf,
                                                         ctrl_vcf=ctrl_vcf)

            dataframe_sift = self.add_sift_score(dataframe_base, 'our')

            dataframe_sift['CHROM'] = pd.to_numeric(
                dataframe_sift[['CHROM']].apply(self.translate_chrom, axis=1))

            dataframe_cadd = dataframe_base

            dataframe_cadd = dataframe_cadd[[
                'CHROM', 'POS', 'ID', 'REF', 'ALT'
            ]]

            dataframe_cadd = self.add_cadd_score(dataframe_cadd)

            dataframe = dataframe_cadd
            dataframe[['SIFT_SCORE', 'SIFT_SUCC'
                       ]] = dataframe_sift[['SIFT_SCORE', 'SIFT_SUCC']]

            dataframe['SIFT_SUCC'] = dataframe['SIFT_SUCC'].fillna(value=0)
            dataframe['SIFT_SCORE'] = dataframe['SIFT_SCORE'].fillna(
                value=0.05)
            dataframe['CADD_SUCC'] = dataframe['CADD_SUCC'].fillna(value=0)
            dataframe['CADD_PHRED'] = dataframe['CADD_PHRED'].fillna(value=30)

            dataframe = self.encode_ref_alt(dataframe)

            dataframe = dataframe[[
                'CHROM', 'POS', 'VAR', 'CADD_PHRED', 'CADD_SUCC', 'SIFT_SCORE',
                'SIFT_SUCC'
            ]]

            cols = [
                'CHROM', 'POS', 'VAR', 'CADD_PHRED', 'CADD_SUCC', 'SIFT_SCORE',
                'SIFT_SUCC'
            ]

            dataframe[cols] = dataframe[cols].apply(pd.to_numeric,
                                                    errors='coerce',
                                                    axis=1)

            log.info(
                "Test set is stored as /input/download_cache/test.csv. "
                "To create a new test set for another vcf dataset, delete the previous one first.")

            dataframe.to_csv(file_name, sep='\t')

        # load stored test set
        else:
            log.info("load stored test set")
            dataframe = pd.read_csv(file_name, sep='\t', index_col='CPA_ID')

        return dataframe
Example No. 23
def process(*, case_vcf: Path, ctrl_vcf: Path, out_dir: Path):
    from idiva.io import open_maybe_gz
    from idiva.io import head

    with open_maybe_gz(case_vcf) as case_full, open_maybe_gz(ctrl_vcf) as ctrl_full:
        assert isinstance(case_full, io.TextIOBase)
        assert isinstance(ctrl_full, io.TextIOBase)

        with head(case_full) as case_head, head(ctrl_full) as ctrl_head:
            log.info("======================")
            log.info("Processing VCF (HEAD).")
            log.info("======================")
            post(process_vcf(case=ReadVCF(case_head), ctrl=ReadVCF(ctrl_head), out=(out_dir / "head")))

        log.info("======================")
        log.info("Processing VCF (FULL).")
        log.info("======================")
        post(process_vcf(case=ReadVCF(case_full), ctrl=ReadVCF(ctrl_full), out=(out_dir / "full")))
Example No. 24
        info_supp = c5_df(case)

    info_meta = []

    # # #

    from idiva.clf.placeholder import placeholder, failure
    from idiva.stat.vcf_to_fisher import vcf_to_fisher
    from idiva.db.sc2disease import allgwas
    from idiva.db.db_clf import db_classifier

    classifiers = [vcf_to_fisher, allgwas, db_classifier]

    for classifier in classifiers:
        with case.rewind_when_done:
            log.info(F"=> Invoking the annotation `{classifier.__name__}`.")
            try:
                response = classifier(case=case, ctrl=ctrl)
                info_meta.append(response.info)

                assert set(response.id_cols).issubset(set(info_supp.columns))
                df = response.df[set(response.id_cols) | set(response.info.keys())]

                info_supp = join(case=info_supp, ctrl=df, how="left", on=list(response.id_cols))

                del response
            except KeyboardInterrupt:
                raise
            except Exception as ex:
                log.exception(F"=> Annotation `{classifier.__name__}` failed ({ex}).")
            else:
Example No. 25
    def add_sift_score(self, dataframe: pd.DataFrame,
                       type: str) -> pd.DataFrame:
        """
        Appends SIFT scores and success flags to the dataframe.
        The dataframe needs at least the following columns: CHROM, POS, REF, ALT.
        """
        self.setup_sift()

        log.info("creating sift scores for " + type)
        # https://github.com/pauline-ng/SIFT4G_Annotator/raw/master/SIFT4G_Annotator.jar
        # make dataframe compatible for sift annotator
        if 'ID' not in dataframe:
            dataframe['ID'] = '.'
        if 'QUAL' not in dataframe:
            dataframe['QUAL'] = '.'
        if 'FILTER' not in dataframe:
            dataframe['FILTER'] = '.'
        if 'INFO' not in dataframe:
            dataframe['INFO'] = '.'

        dataframe = dataframe.fillna(value=".")

        cache = (Path(__file__).parent.parent.parent.parent /
                 "input/download_cache").resolve()
        assert cache.is_dir()

        file_path = str(cache) + "/" + type + "_sift.vcf"

        # create vcf file
        dataframe.rename(columns={
            'CHROM': '#CHROM'
        }).to_csv(file_path, sep='\t', index=False)

        file_name = type

        sift_folder = str(cache) + "/" + file_name + "_sift"

        cmd = 'java -jar ' + str(
            cache) + '/SIFT4G_Annotator.jar -c -i ' + file_path + ' -d ' + str(
                cache) + '/GRCh37.74 -r ' + sift_folder

        args = shlex.split(cmd)

        process = Popen(args)
        process.wait()

        sift_file = sift_folder + "/" + file_name + "_sift_SIFTannotations.xls"

        sift_dataframe = pd.read_table(sift_file)

        dataframe['SIFT_SCORE'] = np.nan
        dataframe['SIFT_SUCC'] = np.nan

        sift_iter = sift_dataframe.iterrows()
        next_sift = next(sift_iter, None)

        for idx, row in tqdm(dataframe.iterrows(),
                             total=len(dataframe),
                             postfix='inserting sift scores'):

            if next_sift is not None:
                if row['POS'] == next_sift[1]['POS']:
                    if not pd.isna(next_sift[1]['SIFT_SCORE']):
                        sift_score = next_sift[1]['SIFT_SCORE']
                        sift_succ = 1
                    else:
                        sift_score = np.nan
                        sift_succ = 0

                    next_sift = next(sift_iter, None)
                else:
                    sift_score = np.nan
                    sift_succ = 0
            else:
                sift_score = np.nan
                sift_succ = 0

            dataframe.loc[idx, 'SIFT_SCORE'] = sift_score
            dataframe.loc[idx, 'SIFT_SUCC'] = sift_succ

        return dataframe
Example No. 26
    def preprocess_clinvar(self, which='vcf_37'):

        log.info("preprocessing clinvar file")

        dataframe_base = self.get_clinvar_clf_extended()
        dataframe_base = dataframe_base.reset_index(drop=True)

        labels = dataframe_base['labels']
        dataframe_base = dataframe_base.drop('labels', axis=1)

        dataframe_sift = self.add_sift_score(dataframe_base, 'clinvar')

        dataframe_sift['CHROM'] = pd.to_numeric(
            dataframe_sift[['CHROM']].apply(self.translate_chrom, axis=1))

        dataframe_cadd = dataframe_base

        dataframe_cadd['CHROM'] = pd.to_numeric(
            dataframe_cadd[['CHROM']].apply(self.translate_chrom, axis=1))
        dataframe_cadd = dataframe_cadd[['CHROM', 'POS', 'ID', 'REF', 'ALT']]
        dataframe_cadd = dataframe_cadd.reset_index(drop=True)

        cache = (Path(__file__).parent.parent.parent.parent /
                 "input/download_cache").resolve()
        assert cache.is_dir()

        # get cadd annotations
        from idiva.download import download

        data = download(
            'https://polybox.ethz.ch/index.php/s/GRgDYHOAaw75D60/download').now
        with data.open() as fd:
            cadd_scores = pd.read_csv(fd,
                                      sep='\t',
                                      usecols=range(1, 6),
                                      comment='#',
                                      names=[
                                          'CHROM', 'POS', 'REF', 'ALT',
                                          'CADD_SCORE', 'CADD_PHRED'
                                      ])

        cadd_scores['CADD_SUCC'] = 1

        dataframe_cadd[['CADD_PHRED', 'CADD_SUCC'
                        ]] = cadd_scores[['CADD_PHRED', 'CADD_SUCC']]

        dataframe = dataframe_cadd
        dataframe[['SIFT_SCORE',
                   'SIFT_SUCC']] = dataframe_sift[['SIFT_SCORE', 'SIFT_SUCC']]

        dataframe['SIFT_SCORE'] = dataframe['SIFT_SCORE'].fillna(value=0.05)
        dataframe['CADD_SUCC'] = dataframe['CADD_SUCC'].fillna(value=0)
        dataframe['CADD_PHRED'] = dataframe['CADD_PHRED'].fillna(value=30)

        dataframe = self.encode_ref_alt(dataframe)

        dataframe = dataframe[[
            'CHROM', 'POS', 'VAR', 'CADD_PHRED', 'CADD_SUCC', 'SIFT_SCORE',
            'SIFT_SUCC'
        ]]

        dataframe.insert(7, 'label', labels)

        cols = [
            'CHROM', 'POS', 'VAR', 'CADD_PHRED', 'CADD_SUCC', 'SIFT_SCORE',
            'SIFT_SUCC', 'label'
        ]

        dataframe[cols] = dataframe[cols].apply(pd.to_numeric,
                                                errors='coerce',
                                                axis=1)

        file_path = str(cache) + "/training.csv"

        dataframe.to_csv(file_path, sep='\t', index=False)

        return dataframe
Example No. 27
# RA, 2020-12-04

# Template for a classifier interface to main.
# Make a copy for your implementation.

import idiva.io
from idiva import log


def placeholder(*, case: idiva.io.ReadVCF, ctrl: idiva.io.ReadVCF):
    from idiva.clf.df import c5_df

    log.info("Running the `placeholder`.")

    # DUMMY result dataframe
    result = c5_df(case).assign(Dummy1=1, Dummy2=2.0)

    # The result should contain the columns
    #   CHROM, POS, ID
    # or
    #   CHROM, POS, ALT
    # or
    #   CHROM, POS, REF, ALT
    # but can have fewer rows than `case`.
    #
    # Specify these columns in `id_cols`.

    class response:
        id_cols = ["CHROM", "POS", "ID"]

        info = {
Example No. 28
    def save_experiment(self, values: dict):
        log.info(f'Writing to experiment dataframe: {values}')
        values['experiment_uid'] = self.experiment_uid
        experiments_dataframe = self.experiments_dataframe.append(
            values, ignore_index=True)
        experiments_dataframe.to_csv(self.experiment_df, index=False)
Example No. 29
import typing
import re
import pandas
import idiva.io
import idiva.utils

from idiva import log
from tqdm import tqdm


def vcf_to_fisher(*, case: idiva.io.ReadVCF, ctrl: idiva.io.ReadVCF):
    from idiva.io import cache_df
    from idiva.clf.df import v0_df, join
    from idiva.stat import v0_fisher

    log.info("Creating a slim case/ctrl dataframe.")

    def df_maker1():
        # Note: v0_df is parallelized
        return join(case=v0_df(case), ctrl=v0_df(ctrl))

    cached_file_prefix = "case_ctrl__v0df"

    df = cache_df(cached_file_prefix,
                  key=[case.md5, ctrl.md5],
                  df_maker=df_maker1)
    log.debug(df)

    log.info("Computing the p-values case vs control.")

    def df_maker2():
Example No. 30
# HK, 2020-12-08

import idiva.io
from idiva import log
from idiva.clf.utils import get_trained_phenomenet
from idiva.dh import datahandler


def phenomenet_classifier_basic(*, case: idiva.io.ReadVCF,
                                ctrl: idiva.io.ReadVCF) -> object:
    """
    Classifies the case-control df with a pretrained classifier.
    """
    from idiva.clf.df import c5_df

    log.info("Running the phenomenet classifier.")

    model = get_trained_phenomenet(which='exp_2020_12_08_11_05_12_615223')

    case_control = c5_df(case)

    case_control['var'] = case_control[['REF', 'ALT']].apply(
        lambda x: datahandler.MAPPING[x[0]][x[1]], axis=1)
    clf_data = case_control[['CHROM', 'POS', 'var']]

    # columns need to be in the same order as they were for the training of the classifier:
    clf_data = clf_data.reindex(sorted(clf_data.columns), axis=1)
    predictions = model.predict(clf_data.to_numpy().astype('float32'))
    case_control['phenom_class'] = predictions

    class response: