def generic_compute_shaps(fasta_files, genotype, classifier, n_samples, verb):
    """
    Given a genotype file and/or a collection of possibly gzipped FASTA files as well as a
    phenotrex classifier, collect genotype information from both, get SHAP information about
    the genotypes using the classifier, and return a finished ShapHandler object as well as
    the list of GenotypeRecords created.

    :param fasta_files: A sized collection of FASTA file paths; may be empty if `genotype`
                        is supplied.
    :param genotype: A genotype file path, or None if only FASTA files are supplied.
    :param classifier: The classifier file path, passed to `load_classifier`.
    :param n_samples: Forwarded as `n_samples` to the classifier's `get_shap` method.
    :param verb: Verbosity flag forwarded to `fastas_to_grs` and `load_classifier`.
    :return: A tuple `(sh, gr)` of the populated ShapHandler and the list of records
             (records from FASTA files first, then records from the genotype file).
    :raises RuntimeError: If neither FASTA files nor a genotype file are supplied, or if the
                          classifier's `get_shap` raises a TypeError.
    """
    if not len(fasta_files) and genotype is None:
        raise RuntimeError(
            'Must either supply FASTA file(s) or single genotype file for prediction.')

    if len(fasta_files):
        grs_from_fasta = fastas_to_grs(fasta_files, n_threads=None, verb=verb)
    else:
        grs_from_fasta = []
    grs_from_file = load_genotype_file(genotype) if genotype is not None else []
    gr = grs_from_fasta + grs_from_file

    model = load_classifier(filename=classifier, verb=verb)
    sh = ShapHandler.from_clf(model)
    try:
        fs, sv, bv = model.get_shap(gr, n_samples=n_samples)
    except TypeError as err:
        # Chain explicitly so the original TypeError stays attached as the cause.
        raise RuntimeError(
            'This TrexClassifier is not capable of generating SHAP explanations.') from err
    sh.add_feature_data(sample_names=[x.identifier for x in gr],
                        features=fs, shaps=sv, base_value=bv)
    return sh, gr
def get_weights(classifier, out):
    """
    Load a classifier and dump its feature weights into a flat file.

    :param classifier: The classifier file path, passed to `load_classifier`.
    :param out: The destination path, passed to `write_weights_file` as `weights_file`.
    """
    # Imports are kept local to the function, as in the original.
    from phenotrex.io.flat import write_weights_file
    from phenotrex.io.serialization import load_classifier

    model = load_classifier(filename=classifier, verb=True)
    write_weights_file(weights_file=out, weights=model.get_feature_weights())
def test_predict_from_fasta(self, trait_name, classifier_type, fasta_files):
    """
    Run `predict` on FASTA files with a re-saved copy of a stored model.

    SHAP explanation output paths are only requested when `classifier_type` is 'XGB';
    in that case both output files must exist after prediction.
    """
    model_path = MODELS_PATH/trait_name/f'{trait_name}.{classifier_type.lower()}.pkl'
    with TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        tmp_path = workdir/'model.pkl'
        if classifier_type == 'XGB':
            summary_path = workdir/'summary.tsv'
            per_sample_path = workdir/'per_sample.tsv'
        else:
            summary_path = None
            per_sample_path = None

        # Round-trip the model through save_classifier before predicting from the copy.
        save_classifier(load_classifier(model_path), tmp_path)
        pred = predict(
            fasta_files=fasta_files,
            classifier=tmp_path,
            out_explain_summary=summary_path,
            out_explain_per_sample=per_sample_path,
        )
        if summary_path is not None:
            assert summary_path.is_file()
        if per_sample_path is not None:
            assert per_sample_path.is_file()
    print(pred)
def test_get_shap(self, trait_name, classifier_type):
    """
    Get ShapHandler and SHAP data from classifier and genotype file.

    :param trait_name: Name of the trait; selects the genotype/phenotype files under
                       FLAT_PATH and the model file under MODELS_PATH.
    :param classifier_type: Classifier type; its lowercased form is part of the model
                            file name.
    :return: A tuple `(tr, sh, fs, sv, bv)`: the first three training records, the
             ShapHandler created from the classifier, and the three values returned
             by the classifier's `get_shap`.
    """
    flat_dir = FLAT_PATH / trait_name
    # Only the training records are used; the other returned values are ignored.
    training_records, _genotype, _phenotype, _group = load_training_files(
        genotype_file=flat_dir / f"{trait_name}.genotype",
        phenotype_file=flat_dir / f"{trait_name}.phenotype",
        verb=True,
    )
    tr = training_records[:3]

    model_file = f'{trait_name}.{classifier_type.lower()}.pkl'
    clf = load_classifier(MODELS_PATH / trait_name / model_file, verb=True)
    sh = ShapHandler.from_clf(clf)
    fs, sv, bv = clf.get_shap(tr, n_samples=50)
    return tr, sh, fs, sv, bv
def predict(input_files: List[str], classifier: str, min_proba=0.0, verb=True) -> pd.DataFrame:
    """
    Predict a trait for all records found in the given input files.

    :param input_files: Paths of input files; split into FASTA and genotype files by
                        `_determine_file_types`.
    :param classifier: The classifier file path, passed to `load_classifier`.
    :param min_proba: Predictions whose probability for the predicted class is below this
                      value get NaN as 'Trait Present'.
    :param verb: Verbosity flag forwarded to `fastas_to_grs` and `load_classifier`.
    :return: A DataFrame with the columns 'Genome', 'Model File', 'Feature Type',
             'Trait Name', 'Trait Present' and 'Confidence' (confidence stored as a string
             of the probability rounded to 4 digits).
    :raises RuntimeError: If `input_files` is empty.
    """
    if len(input_files) == 0:
        raise RuntimeError('Must supply input file(s) for prediction.')

    fasta_files, genotype_files = _determine_file_types(input_files)
    gr = []
    if fasta_files:
        gr.extend(fastas_to_grs(fasta_files, n_threads=None, verb=verb))
    for genotype_file in genotype_files:
        gr.extend(load_genotype_file(genotype_file))

    model = load_classifier(filename=classifier, verb=verb)
    preds, probas = model.predict(X=gr)

    # Invert the sign -> id mapping so predicted ids can be shown as signs.
    translate_output = {
        trait_id: trait_sign for trait_sign, trait_id in DEFAULT_TRAIT_SIGN_MAPPING.items()
    }

    out = {}
    for record, result, probability in zip(gr, preds, probas):
        confidence = probability[result]
        trait_present = np.nan if confidence < min_proba else translate_output[result]
        out[record.identifier] = {
            'Trait Present': trait_present,
            'Confidence': str(round(confidence, 4)),
        }

    column_order = [
        'Genome', 'Model File', 'Feature Type', 'Trait Name', 'Trait Present', 'Confidence'
    ]
    df = pd.DataFrame.from_dict(out).T
    df.index.name = 'Genome'
    df = df.reset_index()
    df['Trait Name'] = model.trait_name
    df['Feature Type'] = model.feature_type
    df['Model File'] = Path(classifier).name
    return df[column_order]
def predict(fasta_files=tuple(), genotype=None, classifier=None, min_proba=0.5,
            out_explain_per_sample=None, out_explain_summary=None,
            shap_n_samples=None, n_max_explained_features=None, verb=False):
    """
    Predict phenotype from a set of (possibly gzipped) DNA or protein FASTA files
    or a single genotype file. Optionally, compute SHAP explanations individually and/or
    summarily for the predicted samples.

    The predictions are printed to stdout as tab-separated lines (identifier, trait sign,
    confidence); nothing is returned.

    NB: Genotype computation is highly expensive and performed on the fly on FASTA files.
    For increased speed when predicting multiple phenotypes, create a .genotype file to reuse
    with the command `compute-genotype`.

    NB: As opposed to XGB models where they are trivially available, computing SHAP
    explanations on SVM models entails training a model-agnostic KernelExplainer which is
    highly costly (dozens to hundreds of seconds per sample if using a somewhat reasonable
    value for `shap_n_samples`).

    :param fasta_files: An iterable of fasta file paths
    :param genotype: A genotype file path
    :param classifier: A pickled classifier file path
    :param min_proba: Predictions whose probability for the predicted class is below this
                      value are displayed as "N/A".
    :param out_explain_per_sample: Where to save the most influential features by SHAP
                                   for each predicted sample.
    :param out_explain_summary: Where to save the SHAP summary of the predictions.
    :param shap_n_samples: The n_samples parameter -
                           only used by models which incorporate a `shap.KernelExplainer`.
    :param n_max_explained_features: How many of the most influential features by SHAP
                                     to consider.
    :param verb: Whether to show progress of fasta file annotation.
    :raises RuntimeError: If neither FASTA files nor a genotype file are supplied, or if
                          explanations are requested and the classifier's `get_shap` raises
                          a TypeError.
    """
    if not len(fasta_files) and genotype is None:
        raise RuntimeError(
            'Must supply FASTA file(s) and/or single genotype file for prediction.'
        )

    if len(fasta_files):
        grs_from_fasta = fastas_to_grs(fasta_files, n_threads=None, verb=verb)
    else:
        grs_from_fasta = []
    grs_from_file = load_genotype_file(genotype) if genotype is not None else []
    gr = grs_from_fasta + grs_from_file

    model = load_classifier(filename=classifier, verb=verb)

    # SHAP values are only computed when at least one explanation output is requested.
    if out_explain_per_sample is not None or out_explain_summary is not None:
        try:
            fs, sv, bv = model.get_shap(gr, n_samples=shap_n_samples)
        except TypeError as err:
            # Chain explicitly so the original TypeError stays attached as the cause.
            raise RuntimeError(
                'This TrexClassifier is not capable of generating SHAP explanations.'
            ) from err
        sh = ShapHandler.from_clf(model)
        sh.add_feature_data(sample_names=[x.identifier for x in gr],
                            features=fs, shaps=sv, base_value=bv)
        if out_explain_per_sample is not None:
            shap_df = pd.concat([
                sh.get_shap_force(x.identifier, n_max_features=n_max_explained_features)
                for x in gr
            ], axis=0)
            shap_df.to_csv(out_explain_per_sample, sep='\t', index=False)
        if out_explain_summary is not None:
            sum_df = sh.get_shap_summary(n_max_explained_features)
            sum_df.to_csv(out_explain_summary, sep='\t', index=False)

    preds, probas = model.predict(X=gr)
    # Invert the sign -> id mapping so predicted ids can be shown as signs.
    translate_output = {
        trait_id: trait_sign for trait_sign, trait_id in DEFAULT_TRAIT_SIGN_MAPPING.items()
    }
    print(f"# Trait: {model.trait_name}")
    print("Identifier\tTrait present\tConfidence")
    for record, result, probability in zip(gr, preds, probas):
        if probability[result] < min_proba:
            result_disp = "N/A"
        else:
            result_disp = translate_output[result]
        print(
            f"{record.identifier}\t{result_disp}\t{str(round(probability[result], 4))}"
        )
def test_ml_load(self):
    """Load the stored XGB classifier and check that predicting on the training data returns something."""
    # NOTE(review): `trait_name` is neither a parameter nor assigned in this function;
    # presumably it is defined at module level - confirm.
    training_data = self.get_training_data()
    model_file = MODELS_PATH/trait_name/f'{trait_name}.xgb.pkl'
    classifier = load_classifier(model_file)
    assert classifier.predict(training_data) is not None