Exemplo n.º 1
0
def generic_compute_shaps(fasta_files, genotype, classifier, n_samples, verb):
    """
    Collect genotype records and compute SHAP information for them with a classifier.

    Genotype records are gathered from the given (possibly gzipped) FASTA files and/or a
    genotype file, concatenated (FASTA-derived records first) and handed to the `get_shap`
    method of the loaded classifier. The result is added to a ShapHandler created from
    that classifier.

    :param fasta_files: A sized collection of FASTA file paths; may be empty or None.
    :param genotype: A genotype file path, or None.
    :param classifier: Classifier file, passed as `filename` to `load_classifier`.
    :param n_samples: Forwarded as `n_samples` to the classifier's `get_shap`.
    :param verb: Verbosity flag forwarded to FASTA annotation and classifier loading.
    :return: Tuple of the filled ShapHandler and the list of genotype records used.
    :raises RuntimeError: If neither FASTA files nor a genotype file are supplied, or if
                          the classifier's `get_shap` raises a TypeError.
    """
    # Treat None like an empty collection instead of failing on len(None).
    has_fastas = fasta_files is not None and len(fasta_files) > 0
    if not has_fastas and genotype is None:
        raise RuntimeError(
            'Must either supply FASTA file(s) or single genotype file for prediction.')

    grs_from_fasta = (
        fastas_to_grs(fasta_files, n_threads=None, verb=verb) if has_fastas else []
    )
    grs_from_file = load_genotype_file(genotype) if genotype is not None else []
    gr = grs_from_fasta + grs_from_file

    model = load_classifier(filename=classifier, verb=verb)
    sh = ShapHandler.from_clf(model)
    try:
        fs, sv, bv = model.get_shap(gr, n_samples=n_samples)
    except TypeError as err:
        # Keep the original TypeError attached as the explicit cause for debugging.
        raise RuntimeError(
            'This TrexClassifier is not capable of generating SHAP explanations.'
        ) from err
    sh.add_feature_data(sample_names=[x.identifier for x in gr],
                        features=fs, shaps=sv, base_value=bv)
    return sh, gr
Exemplo n.º 2
0
def get_weights(classifier, out):
    """
    Load a classifier and dump its feature weights to a flat file.

    :param classifier: Classifier file, passed as `filename` to `load_classifier`.
    :param out: Destination, passed as `weights_file` to `write_weights_file`.
    """
    from phenotrex.io.flat import write_weights_file
    from phenotrex.io.serialization import load_classifier

    model = load_classifier(filename=classifier, verb=True)
    write_weights_file(weights_file=out, weights=model.get_feature_weights())
Exemplo n.º 3
0
 def test_predict_from_fasta(self, trait_name, classifier_type, fasta_files):
     """
     Run `predict` on FASTA files using a re-saved copy of a stored classifier.

     For the 'XGB' classifier type, explanation output paths are passed as well and the
     files are expected to exist afterwards; for any other type both paths are None.
     """
     pickle_name = f'{trait_name}.{classifier_type.lower()}.pkl'
     model_path = MODELS_PATH/trait_name/pickle_name
     with TemporaryDirectory() as tmpdir:
         workdir = Path(tmpdir)
         tmp_path = workdir/'model.pkl'
         if classifier_type == 'XGB':
             summary_path = workdir/'summary.tsv'
             per_sample_path = workdir/'per_sample.tsv'
         else:
             summary_path = per_sample_path = None
         # Round-trip the model through save_classifier before predicting with it.
         save_classifier(load_classifier(model_path), tmp_path)
         pred = predict(
             fasta_files=fasta_files,
             classifier=tmp_path,
             out_explain_summary=summary_path,
             out_explain_per_sample=per_sample_path,
         )
         for explain_path in (summary_path, per_sample_path):
             assert explain_path is None or explain_path.is_file()
         print(pred)
Exemplo n.º 4
0
    def test_get_shap(self, trait_name, classifier_type):
        """
        Load training records and a stored classifier, then compute SHAP data.

        :param trait_name: Trait name; selects the flat-file and model directories and
                           is part of the file names.
        :param classifier_type: Classifier type; its lower-cased form is part of the
                                model file name.
        :return: Tuple of the first three training records, the ShapHandler created from
                 the classifier, and the three values returned by `clf.get_shap`.
        """
        trait_dir = FLAT_PATH / trait_name
        loaded = load_training_files(
            genotype_file=trait_dir / f"{trait_name}.genotype",
            phenotype_file=trait_dir / f"{trait_name}.phenotype",
            verb=True)
        # Only the training records are used; the other three results are ignored.
        training_records, _genotype, _phenotype, _group = loaded
        tr = training_records[:3]

        model_file = f'{trait_name}.{classifier_type.lower()}.pkl'
        clf = load_classifier(MODELS_PATH / trait_name / model_file, verb=True)
        sh = ShapHandler.from_clf(clf)
        fs, sv, bv = clf.get_shap(tr, n_samples=50)
        return tr, sh, fs, sv, bv
Exemplo n.º 5
0
def predict(input_files: List[str],
            classifier: str,
            min_proba=0.0,
            verb=True) -> pd.DataFrame:
    """
    Predict a trait for all genomes found in the given input files.

    The input files are split into FASTA and genotype files by `_determine_file_types`;
    genotype records from both sources are collected and passed to the loaded classifier.

    :param input_files: Paths of the input files; must not be empty.
    :param classifier: Path of the classifier file.
    :param min_proba: Predictions whose probability for the predicted class is below
                      this value get NaN as 'Trait Present'.
    :param verb: Verbosity flag forwarded to FASTA annotation and classifier loading.
    :return: DataFrame with the columns 'Genome', 'Model File', 'Feature Type',
             'Trait Name', 'Trait Present' and 'Confidence'.
    :raises RuntimeError: If `input_files` is empty.
    """
    if len(input_files) == 0:
        raise RuntimeError('Must supply input file(s) for prediction.')

    fasta_files, genotype_files = _determine_file_types(input_files)
    records = []
    if fasta_files:
        records.extend(fastas_to_grs(fasta_files, n_threads=None, verb=verb))
    for genotype_path in genotype_files:
        records.extend(load_genotype_file(genotype_path))

    model = load_classifier(filename=classifier, verb=verb)
    labels, class_probas = model.predict(X=records)

    # Invert the sign -> id mapping so predicted ids can be shown as signs.
    sign_by_trait_id = {}
    for trait_sign, trait_id in DEFAULT_TRAIT_SIGN_MAPPING.items():
        sign_by_trait_id[trait_id] = trait_sign

    rows = {}
    for record, label, probas in zip(records, labels, class_probas):
        confidence = probas[label]
        shown = np.nan if confidence < min_proba else sign_by_trait_id[label]
        rows[record.identifier] = {
            'Trait Present': shown,
            'Confidence': str(round(confidence, 4)),
        }

    table = pd.DataFrame.from_dict(rows).T
    table.index.name = 'Genome'
    table = table.reset_index()
    table['Trait Name'] = model.trait_name
    table['Feature Type'] = model.feature_type
    table['Model File'] = Path(classifier).name

    column_order = [
        'Genome', 'Model File', 'Feature Type', 'Trait Name', 'Trait Present',
        'Confidence',
    ]
    return table[column_order]
Exemplo n.º 6
0
def predict(fasta_files=tuple(),
            genotype=None,
            classifier=None,
            min_proba=0.5,
            out_explain_per_sample=None,
            out_explain_summary=None,
            shap_n_samples=None,
            n_max_explained_features=None,
            verb=False):
    """
    Predict phenotype from a set of (possibly gzipped) DNA or protein FASTA files
    and/or a single genotype file, and print the results as tab-separated lines.
    Optionally, compute SHAP explanations individually and/or summarily
    for the predicted samples.

    NB: Genotype computation is highly expensive and performed on the fly on FASTA files.
    For increased speed when predicting multiple phenotypes, create a .genotype file to reuse
    with the command `compute-genotype`.

    NB: As opposed to XGB models where they are trivially available, computing SHAP explanations
    on SVM models entails training a model-agnostic KernelExplainer which is highly costly (dozens
    to hundreds of seconds per sample if using a somewhat reasonable value for `shap_n_samples`).

    :param fasta_files: A sized collection of fasta file paths; may be empty or None.
    :param genotype: A genotype file path
    :param classifier: A pickled classifier file path
    :param min_proba: Predictions whose probability for the predicted class is below this
                      value are printed as "N/A".
    :param out_explain_per_sample: Where to save the most influential features by SHAP for each
                                   predicted sample.
    :param out_explain_summary: Where to save the SHAP summary of the predictions.
    :param shap_n_samples: The n_samples parameter -
                           only used by models which incorporate a `shap.KernelExplainer`.
    :param n_max_explained_features: How many of the most influential features by SHAP to consider.
    :param verb: Whether to show progress of fasta file annotation.
    :return: None; results are printed to standard output.
    :raises RuntimeError: If neither FASTA files nor a genotype file are supplied, or if
                          explanations are requested and the classifier's `get_shap`
                          raises a TypeError.
    """
    # Treat None like an empty collection instead of failing on len(None).
    has_fastas = fasta_files is not None and len(fasta_files) > 0
    if not has_fastas and genotype is None:
        raise RuntimeError(
            'Must supply FASTA file(s) and/or single genotype file for prediction.'
        )

    grs_from_fasta = (
        fastas_to_grs(fasta_files, n_threads=None, verb=verb) if has_fastas else []
    )
    grs_from_file = load_genotype_file(genotype) if genotype is not None else []
    gr = grs_from_fasta + grs_from_file

    model = load_classifier(filename=classifier, verb=verb)

    # SHAP values are only computed when at least one explanation output is requested.
    wants_explanations = (out_explain_per_sample is not None
                          or out_explain_summary is not None)
    if wants_explanations:
        try:
            fs, sv, bv = model.get_shap(gr, n_samples=shap_n_samples)
        except TypeError as err:
            # Keep the original TypeError attached as the explicit cause for debugging.
            raise RuntimeError(
                'This TrexClassifier is not capable of generating SHAP explanations.'
            ) from err
        sh = ShapHandler.from_clf(model)
        sh.add_feature_data(sample_names=[x.identifier for x in gr],
                            features=fs,
                            shaps=sv,
                            base_value=bv)
        if out_explain_per_sample is not None:
            per_sample_frames = [
                sh.get_shap_force(x.identifier,
                                  n_max_features=n_max_explained_features)
                for x in gr
            ]
            shap_df = pd.concat(per_sample_frames, axis=0)
            shap_df.to_csv(out_explain_per_sample, sep='\t', index=False)
        if out_explain_summary is not None:
            sum_df = sh.get_shap_summary(n_max_explained_features)
            sum_df.to_csv(out_explain_summary, sep='\t', index=False)

    preds, probas = model.predict(X=gr)
    # Invert the sign -> id mapping so predicted ids can be printed as signs.
    translate_output = {
        trait_id: trait_sign
        for trait_sign, trait_id in DEFAULT_TRAIT_SIGN_MAPPING.items()
    }
    print(f"# Trait: {model.trait_name}")
    print("Identifier\tTrait present\tConfidence")
    for record, result, probability in zip(gr, preds, probas):
        confidence = probability[result]
        if confidence < min_proba:
            result_disp = "N/A"
        else:
            result_disp = translate_output[result]
        print(
            f"{record.identifier}\t{result_disp}\t{str(round(confidence, 4))}"
        )
Exemplo n.º 7
0
 def test_ml_load(self):
     """
     Load the stored XGB classifier and check that predicting yields a result.

     NOTE(review): `trait_name` is neither a parameter nor a local of this method, so
     it has to come from an enclosing scope - confirm that it is defined there.
     """
     training_data = self.get_training_data()
     model_file = MODELS_PATH/trait_name/f'{trait_name}.xgb.pkl'
     classifier = load_classifier(model_file)
     assert classifier.predict(training_data) is not None