def load_classifier(path=False):
    """ Load the ALLSorts classifier from a pickled file.
    ...

    Parameters
    __________
    path : str
        Path to a pickle object that holds the ALLSorts model.
        Default: "/models/allsorts/allsorts.pkl.gz"

    Returns
    __________
    allsorts_clf : ALLSorts object
        ALLSorts object, unpacked, ready to go.
    """

    if not path:
        path = str(root_dir()) + "/models/allsorts/allsorts.pkl.gz"

    message("Loading classifier...")
    allsorts_clf = joblib.load(path)

    return allsorts_clf
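# Usage sketch (illustrative; the custom path below is hypothetical):
#   allsorts_clf = load_classifier()  # loads the bundled model from /models/allsorts/
#   allsorts_clf = load_classifier("/path/to/retrained/allsorts.pkl.gz")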
def transform(self, counts, y=False):
    """ Pre-process input counts as per parameters determined by fit().

    Parameters
    __________
    counts : Pandas DataFrame
        The input counts (samples/rows x genes/columns).

    Returns
    __________
    counts : Pandas DataFrame
        The pre-processed counts.
    """

    counts.index = counts.index.astype("str")

    ''' Check for missing genes (before reindexing makes them indistinguishable) '''
    missing_genes = list(set(self.genes).difference(counts.columns))
    if len(missing_genes) > 0:
        message("Note: " + str(len(missing_genes)) +
                " genes not found in supplied samples, filling with zeroes.\n" +
                "This WILL impact classification performance.\n" +
                "Follow the counts guide on Github (http://) to resolve.",
                level="w")

    ''' Filter genes (missing genes are zero-filled, per the warning above) '''
    if self.filter_genes:
        counts = counts.reindex(self.genes, axis=1, fill_value=0)

    ''' Normalise with TMM '''
    if self.norm == "TMM":
        counts = self.tmm_norm.transform(counts)

    return counts
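# Note on the reindex above (an editorial observation, not in the original source):
# pd.DataFrame.reindex() introduces absent columns as NaN unless fill_value is given, e.g.
#   pd.DataFrame([[1, 2]], columns=["A", "B"]).reindex(["A", "B", "C"], axis=1)
# yields NaN for "C"; fill_value=0 makes the code match the "filling with zeroes" warning.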
def run_predictions(ui, allsorts):
    """ This is what we are here for. Use ALLSorts to make predictions!
    ...

    Parameters
    __________
    ui : User Input Class
        Carries all information required to execute ALLSorts, see UserInput class
        for further information.

    Output
    __________
    Probabilities.csv, Predictions.csv, Distributions.png, Waterfalls.png
    at the ui.destination path.
    """

    predictions, probabilities = get_predictions(ui.samples, allsorts, parents=ui.parents)
    probabilities["Pred"] = list(predictions["Prediction"])

    if not isinstance(ui.labels, bool):
        probabilities["True"] = ui.labels

    probabilities.round(3).to_csv(ui.destination + "/probabilities.csv")
    predictions.to_csv(ui.destination + "/predictions.csv")

    # Exclude the aggregate "B-ALL" column, if present, from the figures.
    if "B-ALL" in probabilities.columns:
        get_figures(ui.samples, allsorts, ui.destination, probabilities.drop("B-ALL", axis=1))
    else:
        get_figures(ui.samples, allsorts, ui.destination, probabilities)

    message("Finished. Thanks for using ALLSorts!")
def __init__(self):
    if self._is_cli():
        self.cli = True
        self.input = self._get_args()
        self.samples = self.input.samples
        self.labels = self.input.labels if self.input.labels else False
        self.model_dir = (str(root_dir()) + "/models/allsorts/"
                          if not self.input.model_dir else self.input.model_dir)
        self.destination = False if not self.input.destination else self.input.destination
        self.test = self.input.test
        self.train = False if not self.input.train else True
        self.comparison = False if not self.input.comparison else True
        self.n_jobs = 1 if not self.input.njobs else int(self.input.njobs)
        self.verbose = False if not self.input.verbose else True
        self.force = False if not self.input.force else True
        self.cv = 3 if not self.input.cv else int(self.input.cv)
        self.parents = False if not self.input.parents else True
        self.ball = self.input.ball
        self._input_checks()
        self._load_samples()
    else:
        message("No arguments supplied. Please use allsorts --help "
                "for further information about input.")
        sys.exit(0)
def get_figures(samples, allsorts, destination, probabilities,
                plots=["distributions", "waterfalls"]):
    """ Make figures of the results.
    ...

    Parameters
    __________
    samples : Pandas DataFrame
        Pandas DataFrame that represents the raw counts of your samples (rows) x genes (columns).
    allsorts : ALLSorts object
        A trained ALLSorts pipeline.
    destination : str
        Location of where the results should be saved.
    probabilities : Pandas DataFrame
        The result of running the get_predictions(samples, labels=False, parents=False) function.
        See function for further usage.
    plots : List
        List of plots required. Default: ["distributions", "waterfalls"]; "manifold" is also
        available. See https://github.com/Oshlack/AllSorts/ for examples.

    Output
    __________
    Distributions.png, Waterfalls.png, and (optionally) Manifold.png at the destination path.
    """

    message("Saving figures...")
    for plot in plots:

        if plot == "distributions":
            dist_plot = allsorts.predict_dist(probabilities, return_plot=True)
            dist_plot.savefig(destination + "/distributions.png")

        if plot == "waterfalls":
            # Only compare against the pre-built comparisons when no true labels are present.
            if "True" in probabilities.columns:
                comparisons = False
            else:
                comparisons = pd.read_csv(str(root_dir()) + "/models/allsorts/comparisons.csv",
                                          index_col=0)
            waterfall_plot = allsorts.predict_waterfall(probabilities, compare=comparisons,
                                                        return_plot=True)
            waterfall_plot.savefig(destination + "/waterfalls.png")

        if plot == "manifold":
            umap_plot = allsorts.predict_plot(samples, return_plot=True)
            umap_plot.savefig(destination + "/manifold.png")
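# Usage sketch (illustrative; the output path is hypothetical):
#   get_figures(samples, allsorts_clf, "/path/to/output", probabilities,
#               plots=["distributions", "waterfalls", "manifold"])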
def _input_checks(self):
    if self.train and not (self.labels and self.samples):
        message("Error: if -train is set, -labels/-l and -samples/-s must be set too. Exiting.")
        sys.exit()

    if not self.train and not self.destination:
        message("Error: if -train is not set, a destination (-d /path/to/output/) is required. Exiting.")
        sys.exit()
def run(ui=False):
    """ A function that runs ALLSorts in one of three modes: Training, Comparison adding, Prediction.

    - The Training mode will replace the model in the installed directory.
    - The Prediction mode will output a set of predictions and visualisations as per an input set
      of samples.
    - The Comparison mode will build comparisons from a supplied set of samples and labels to which
      to compare all new predictions, i.e. when no labels are used.
    ...

    Parameters
    __________
    ui : User Input Class
        Carries all information required to execute ALLSorts, see UserInput class
        for further information.
    """

    if not ui:
        ui = UserInput()
        message(allsorts_asci)

    if ui.train:
        message("Training Mode", level=1)
        train_time = time.time()
        train(ui=ui)
        message("Total Train time " + str(round(time.time() - train_time, 2)))  # Seconds

    elif ui.comparison:
        message("Rebuilding Comparisons", level=1)
        allsorts_clf = load_classifier()
        allsorts_clf = _set_njobs(ui.n_jobs, allsorts_clf)
        allsorts_clf.steps[-1][-1].filter_healthy = True if ui.ball == "True" else False
        run_comparison_builder(ui, allsorts_clf)

    else:
        message("Prediction Mode", level=1)
        allsorts_clf = load_classifier()
        allsorts_clf = _set_njobs(ui.n_jobs, allsorts_clf)
        allsorts_clf.steps[-1][-1].filter_healthy = True if ui.ball == "True" else False
        run_predictions(ui, allsorts_clf)
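# Invocation sketch (flags inferred from the UserInput class; confirm with allsorts --help):
#   allsorts -samples counts.csv -d /path/to/output                    # Prediction mode
#   allsorts -samples counts.csv -labels labels.csv -train             # Training mode
#   allsorts -samples counts.csv -labels labels.csv -comparison -d /path/to/output  # Comparisons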
def get_predictions(samples, allsorts, labels=False, parents=False):
    """ Given a set of samples, use ALLSorts to return a set of predictions and probabilities.
    ...

    Parameters
    __________
    samples : Pandas DataFrame
        Pandas DataFrame that represents the raw counts of your samples (rows) x genes (columns).
    allsorts : ALLSorts object
        A trained ALLSorts pipeline.
    labels : Pandas Series
        Pandas Series that has a label associated with each sample.
    parents : bool
        True/False as to whether to include parents in the hierarchy in the output, i.e. Ph Group.

    Returns
    __________
    predictions : Pandas DataFrame
        A prediction for each inputted sample.
    probabilities : Pandas DataFrame
        Probabilities returned by ALLSorts for each prediction - samples (rows) x
        subtype/meta-subtype (columns).
        Note: These do not have to add to 1 column-wise - see paper (when it is released!)
    """

    message("Making predictions...")
    probabilities = allsorts.predict_proba(samples, parents=parents)

    # Exclude the aggregate "B-ALL" column, if present, before calling predict().
    if "B-ALL" in probabilities.columns:
        predictions = pd.DataFrame(
            allsorts.predict(probabilities.drop("B-ALL", axis=1),
                             probabilities=True, parents=parents),
            columns=["Prediction"], index=samples.index)
    else:
        predictions = pd.DataFrame(
            allsorts.predict(probabilities, probabilities=True, parents=parents),
            columns=["Prediction"], index=samples.index)

    if isinstance(labels, pd.Series):
        probabilities["True"] = labels

    return predictions, probabilities
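# Usage sketch (illustrative; "sample_1" is a hypothetical index label):
#   predictions, probabilities = get_predictions(samples, allsorts_clf, parents=True)
#   predictions.loc["sample_1", "Prediction"]   # the called subtype for that sample
#   probabilities.loc["sample_1"]               # per-subtype probabilities for that sample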
def run_comparison_builder(ui, allsorts):
    """ Build comparison results to compare to future predictions.

    I.e. what the waterfall plot displays in addition to the predicted sample probabilities.
    ...

    Parameters
    __________
    ui : User Input Class
        Carries all information required to execute ALLSorts, see UserInput class
        for further information.
    """

    predictions, probabilities = get_predictions(ui.samples, allsorts,
                                                 labels=ui.labels, parents=True)
    probabilities["Pred"] = list(predictions["Prediction"])

    message("Building comparisons...")
    rebuild_comparisons(allsorts, probabilities, ui)
    message("Finished.")
def train(ui=False):
    '''
    TRAINING A MODEL (OUTER LOOP)
    --
    This operation requires three steps:
    1. With a tuned estimator returned from the inner loop, calibrate optimal thresholds.
    2. Score this method.
    3. Train a final model, using the average of the final thresholds.
    '''

    message("Cross Validation (this will take a while):", level=2)

    # Create results path
    search_path = ui.model_dir + "gridsearch/"
    create_dir([ui.model_dir, search_path])

    # CV results
    subtypes = list(ui.labels.unique())
    thresholds_cv = {}
    results_cv = {"accuracy": [], "precision": [], "recall": [], "f1": []}

    # First we need to figure out the model of choice,
    # then we need to figure out the thresholds.
    for fold in range(1, ui.cv + 1):

        message("Fold: " + str(fold))
        seed = np.random.randint(1, 1000)
        x_train, x_test, y_train, y_test = train_test_split(
            ui.samples, ui.labels, stratify=ui.labels, test_size=0.2, random_state=seed)

        # Inner loop (hyperparameter tuning)
        allsorts_clf_fold = _tune(ui, x_train, y_train, fold=fold)
        probabilities = allsorts_clf_fold.predict_proba(x_test, parents=True)
        f_hierarchy = allsorts_clf_fold.steps[-1][-1].f_hierarchy

        # Optimise prediction thresholds
        thresholds = fit_thresholds(probabilities, f_hierarchy, y_test)
        allsorts_clf_fold.steps[-1][-1].thresholds = thresholds

        for subtype, fold_thresh in thresholds.items():
            if subtype in thresholds_cv.keys():
                thresholds_cv[subtype].append(fold_thresh)
            else:
                thresholds_cv[subtype] = [fold_thresh]

        # Score fold. The f_hierarchy from the fitted classifier is reused here,
        # rather than re-flattening an identical hard-coded hierarchy.
        y_pred = allsorts_clf_fold.predict(x_test, parents=True)
        fold_preds(y_test, probabilities, f_hierarchy=f_hierarchy)

        results_cv["accuracy"].append(round(accuracy_score(y_test, y_pred), 4))
        results_cv["precision"].append(round(precision_score(
            y_test, y_pred, average="weighted", zero_division=0, labels=subtypes), 4))
        results_cv["recall"].append(round(recall_score(
            y_test, y_pred, average="weighted", zero_division=0, labels=subtypes), 4))
        results_cv["f1"].append(round(f1_score(
            y_test, y_pred, average="weighted", zero_division=0, labels=subtypes), 4))

    # Train final model using all samples
    allsorts_clf = _tune(ui, ui.samples, ui.labels)

    # Save the normalised training counts for inspection
    test = allsorts_clf.transform(ui.samples)
    test["True"] = ui.labels
    test["counts"].to_csv("normed_counts.csv")

    # Average thresholds
    thresholds = {}
    for subtype, sub_thresh in thresholds_cv.items():
        thresholds[subtype] = round(sum(sub_thresh) / len(sub_thresh), 4)
    allsorts_clf.steps[-1][-1].thresholds = thresholds

    # Save results and model
    scores = pd.DataFrame(results_cv, index=list(range(1, ui.cv + 1)))
    scores.to_csv(ui.model_dir + "cross_val_results.csv")

    save_path_model = ui.model_dir + "allsorts.pkl.gz"
    message("Saving model to: " + save_path_model)
    allsorts_clf.save(path=save_path_model)
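# _flat_hierarchy() is defined elsewhere in the package; below is a minimal sketch of
# the assumed behaviour (not the actual implementation) - recursively flatten the nested
# hierarchy so every node maps to a list of its children, or False for leaves:
def _flat_hierarchy_sketch(hierarchy, flat_hierarchy):
    for parent, children in hierarchy.items():
        flat_hierarchy[parent] = list(children) if children else False
        if children:
            _flat_hierarchy_sketch(children, flat_hierarchy)
    return flat_hierarchy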
def _tune(ui, x_train, y_train, fold="all"):
    '''
    TUNING A MODEL (INNER LOOP)
    --
    This operation requires two steps:
    1. Construct a pipeline using the ALLSorts class (a Sklearn pipeline extension).
    2. Gridsearch the parameter space that is outlined.

    Currently this is achieved by editing this function. Although, in future, this will
    be included within a passable JSON file that contains the below. Given that this is
    likely only to be run once in a blue moon, this is not a priority.

    For those wishing to use the ALLSorts model, with some substitutions of algorithms,
    simply edit this file after making a copy of the original (save it somewhere so you
    can always revert).

    Note, setting up an ALLSorts pipeline and grid search is identical to setting up a
    usual sklearn pipeline. For more information on how to achieve this visit:
    https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
    https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    '''

    hierarchy = {
        "High Sig": {"High hyperdiploid": False, "Low hyperdiploid": False, "Near haploid": False},
        "Low hypodiploid": False,
        "iAMP21": False,
        "NUTM1": False,
        "BCL2/MYC": False,
        "TCF3-PBX1": False,
        "MEF2D": False,
        "HLF": False,
        "IKZF1 N159Y": False,
        "PAX5 P80R": False,
        "Ph Group": {"Ph-like": False, "Ph": False},
        "PAX5alt": False,
        "ETV6-RUNX1 Group": {"ETV6-RUNX1": False, "ETV6-RUNX1-like": False},
        "ZNF384 Group": False,
        "KMT2A Group": False,
        "DUX4": False
    }

    f_hierarchy = _flat_hierarchy(hierarchy, flat_hierarchy={})

    # Add the fusion features we want
    fusion_list: List[Any] = ["BCR_ABL1", "ETV6_RUNX1", "TCF3_PBX1", "TCF3_HLF"]

    # Set parameters to be used in GridSearchCV
    lr = LogisticRegression(penalty="l1", solver="liblinear", max_iter=500,
                            multi_class="auto", class_weight="balanced")
    lr_params = [{"C": 0.3}]
    standard_params = []
    classifier = HierarchicalClassifier()

    # Create parameter grid for input into GridSearchCV
    allsorts_params = [
        {  # Hierarchy
            'standardisation': [Scaler(scaler="std")],
            'centroids': [CentroidCreate(hierarchy=hierarchy,
                                         distance_function=euclidean_distances)],
            'feature_select': [FeatureSelection(hierarchy=hierarchy, method="all", test=False)],
            'feature_create__chrom_feature': [True],
            'feature_create__iamp21_feature': [True],
            'feature_create__fusion_feature': [True],
            'train_model__hierarchy': [hierarchy],
            'train_model__model': [lr],
            'train_model__params': lr_params
        }
    ]

    # Note: once a benchmark per CPU is made, estimate compute time and distribute accordingly.
    training_x_models = len(list(ParameterGrid(allsorts_params)))
    grid_search_cv = 2
    if training_x_models * grid_search_cv >= ui.n_jobs:
        grid_jobs = ui.n_jobs
        stage_jobs = 1
    else:
        grid_jobs = 1
        stage_jobs = ui.n_jobs

    # Create Pipeline
    allsorts_pipe = ALLSorts([
        ("preprocess", Preprocessing(filter_genes=True, norm="TMM")),
        ("feature_create", FeatureCreation(n_jobs=stage_jobs, kernel_div=30,
                                           fusions=fusion_list)),
        ("standardisation", Scaler()),
        ("feature_select", FeatureSelection()),
        ("centroids", CentroidCreate()),
        ("train_model", classifier)
    ], verbose=ui.verbose)

    # Inform the user how many models will be trained
    if fold == 1:
        message("Important: Training " + str(training_x_models) + " models (" +
                str(grid_search_cv * ui.cv * training_x_models) + " with cross validation).",
                important=True)

    # Perform grid search - likely to take a lot of time.
    allsorts_grid = GridSearchCV(
        allsorts_pipe, param_grid=allsorts_params, cv=grid_search_cv,
        n_jobs=grid_jobs, scoring="balanced_accuracy").fit(x_train, y_train)

    grid_results = _grid_save(allsorts_grid)
    grid_results.to_csv(ui.model_dir + "gridsearch/gridsearch_fold" + str(fold) + ".csv")

    # Pick the estimator that maximised the score in our gridsearchcv
    allsorts_clf = allsorts_grid.best_estimator_

    return allsorts_clf
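# To search a wider hyperparameter space, widen the grid before fitting, e.g.
# (illustrative values only):
#   lr_params = [{"C": 0.1}, {"C": 0.3}, {"C": 1.0}]
# GridSearchCV will then fit one pipeline per combination in allsorts_params.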