def main(parser):
    with open(parser.args["roi_masks_path"], "r") as f:
        raw_roi_data = json.load(f)

    # Validate the raw ROI records and set aside ROIs already excluded upstream
    roi_data = SparseAndDenseROISchema(many=True).load(raw_roi_data)
    roi_data, excluded_rois = filter_excluded_rois(roi_data)
    rois, metadata, traces, _ = _munge_data(parser, roi_data)
    # TODO: add neuropil traces later
    logger.info(f"Extracting features from '{len(rois)}' ROIs")
    features = FeatureExtractor(rois, traces, metadata).run()
    logger.info(f"Using the following classifier model: "
                f"{parser.args['classifier_model_path']}")
    model = load_model(parser.args["classifier_model_path"])
    logger.info("Classifying ROIs with features")
    predictions = model.predict(features)
    if len(predictions) != len(roi_data):
        raise ValueError(
            f"Expected the number of predictions ({len(predictions)}) to "
            f"equal the number of input ROIs ({len(roi_data)}), but they "
            "are not the same.")

    # Mark ROIs predicted as non-cells (prediction == 0) as invalid
    for obj, prediction in zip(roi_data, predictions):
        if prediction == 0:
            obj["exclusion_labels"].append(NOT_CELL_EXCLUSION_LABEL)
            obj["valid_roi"] = False

    # Re-append the previously excluded ROIs so the output covers every input ROI
    roi_data.extend(excluded_rois)

    output_data = {
        "classified_rois": roi_data,
        "classifier_model_path": parser.args["classifier_model_path"]
    }
    parser.output(output_data)
    logger.info("ROI classification successfully completed!")
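# A hedged sketch (not part of the module) of how downstream code might read
# the output written by `main` above. The file name "classified_rois.json" is
# a hypothetical placeholder for whatever output path the parser is configured
# with; the keys and fields used below come from `output_data` above.
import json

with open("classified_rois.json", "r") as f:
    output = json.load(f)

n_valid = sum(1 for roi in output["classified_rois"] if roi["valid_roi"])
print(f"{n_valid} of {len(output['classified_rois'])} ROIs kept as valid cells")
print(f"Model used: {output['classifier_model_path']}")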
def test_feat_fns_by_source(sources, expected, function_class):
    my_class = function_class()
    actual = fx._feat_fns_by_source(my_class, sources=sources)
    # Pytest has no built-in helpers for comparing dicts of lists,
    # so compare the contents manually
    for k, vals in expected.items():
        assert k in actual
        assert len(actual[k]) == len(vals)
        for v in vals:
            assert v in actual[k]
def test_train_classifier_binary_preds(model, train_data, test_data,
                                       drop_cols, tmp_path):
    """Tests that `train_classifier()` generates a classifier that makes
    binary predictions.
    """
    clf, _, _, _ = train.train_classifier(model=model,
                                          training_data_path=train_data,
                                          test_data_path=test_data,
                                          scorer='roc_auc',
                                          max_iter=2,
                                          optimizer="rand",
                                          n_folds=2,
                                          refit=True,
                                          drop_cols=drop_cols)
    # Load the testing ROIs and generate input features
    with open(test_data, 'r') as open_testing:
        testing_data = json.load(open_testing)
    features = FeatureExtractor.from_list_of_dict(testing_data).run()
    predictions = clf.predict(features)
    assert set(predictions).issubset({0, 1})
def train_classifier(model: str, training_data_path: str, test_data_path: str,
                     scorer: Union[str, Callable], max_iter: int,
                     optimizer: str,
                     test_metrics: Optional[List[str]] = None,
                     n_folds: int = 5, seed: int = 42, refit: bool = True,
                     drop_cols: List[str] = None):
    """Tunes and trains a model using hyperopt to optimize hyperparameters.

    Internally uses k-fold cross validation to compute optimization metrics.
    Uses `feature_pipeline` to preprocess data. The trained and tuned
    classifier is appended onto the pipeline as the final step.

    Parameters
    ----------
    model: str
        The model algorithm to use. One of "RandomForestClassifier" or
        "LogisticRegression". See `TrainingSchema.MODEL_CHOICES`.
    training_data_path: str
        Local path or s3 URI to training data in json format
    test_data_path: str
        Local path or s3 URI to test data in json format
    scorer: str or Callable
        A str (see sklearn model evaluation documentation) or callable.
        Will optimize for this scorer.
    max_iter: int
        Maximum number of iterations to evaluate hyperparameters
    optimizer: str
        Optimizer to use. One of "rand" or "suggest". See
        `TrainingSchema.OPTIMIZER_CHOICES`.
    test_metrics: List of str (or Callable)
        List of scorers to report metrics for the model's performance on
        the test data set. See `scorer`.
    n_folds: int, default=5
        Number of folds over which to compute training metrics during
        hyperparameter optimization.
    seed: int, default=42
        Random seed
    refit: bool, default=True
        Whether to refit the model on the train and test set combined
    drop_cols: List of str, optional
        Feature columns to drop before training (passed to
        `feature_pipeline`).

    Returns
    -------
    4-tuple of:
        Pipeline
            The trained model, in a Pipeline object
        Optimized metric score (cross-validated)
            A tuple of (<scorer_name>, <scores>). The scores are for each
            fold in the training data.
        Test metrics
            A dictionary of score_name: score_value. These scores are
            computed on the test set only.
        Confusion matrix
            A confusion matrix in dictionary format with the following keys:
            "TN", "TP", "FN", "FP" (corresponding to "true negative",
            "true positive", "false negative", "false positive",
            respectively).

    Notes
    -----
    When training a model for inference in the Allen Institute's production
    ophys pipeline, the data fed to FeatureExtractor should match between
    here and the production pipeline inference using the model:
    https://github.com/AllenInstitute/ophys_etl_pipelines/blob/37a03ec8d944b688c75da73e201824627d7f7df9/src/ophys_etl/transforms/classification.py#L426-L439    # noqa
    One consideration is that the production inference currently downsamples
    traces from 31Hz to 4Hz. Another consideration is the format. This
    training is written to use the FeatureExtractor.from_list_of_dict()
    method, while production inference uses the default constructor.
""" logger.info("Reading training data and extracting features.") training_data = json_load_local_or_s3(training_data_path) test_data = json_load_local_or_s3(test_data_path) features = FeatureExtractor.from_list_of_dict(training_data).run() test_features = FeatureExtractor.from_list_of_dict(test_data).run() labels = [r["label"] for r in training_data] test_labels = [r["label"] for r in test_data] # Instantiate model and (static) preprocessing pipeline pipeline = feature_pipeline(drop_cols=drop_cols) tuner_cls = TrainingSchema.MODEL_CHOICES[model] optimizer = TrainingSchema.OPTIMIZER_CHOICES[optimizer] tuner = tuner_cls(features, labels, pipeline, n_splits=n_folds, scorer=scorer, opt_algo=optimizer, seed=seed) # Tune with CV and fit to training data logger.info(f"Fitting {model} to data (n={len(features)}).") start_time = time.time() logger.info(f"Optimizing '{scorer}' over {max_iter} iterations...") best_params = tuner.fmax(max_iter) end_time = time.time() logger.info(f"Considered {max_iter} trials over " f"{end_time-start_time} seconds. " f"Best Score: {-tuner.trials.best_trial['result']['loss']}\n" f"Best Params: {best_params}") clf = tuner.classifier(**best_params) pipeline.steps.append(("classifier", clf)) logger.info(f"Fitting {model} model to training data with {best_params}") pipeline.fit(features, labels) # Compute cross-validation score for optimization metric # Note that this metric will be an optimistic estimate of performance, # since we are not using nested cross-validation cv = KFold(n_splits=n_folds, shuffle=True, random_state=seed) opt_score = (scorer, cross_val_score(pipeline, features, labels, scoring=scorer, cv=cv)) # Compute metrics for test data # Default to just computing the optimization metric on test data test_metrics = test_metrics or [scorer] test_metrics = { f"test_{metric}": get_scorer(metric)(pipeline, test_features, test_labels) for metric in test_metrics } # Compute confusion matrix for arbitrary metric calculation cmat = _binary_confusion_dict(test_labels, pipeline.predict(test_features), "test_") # Refit on full data if applicable if refit: full_features = pd.concat([features, test_features], axis=0) full_labels = labels + test_labels pipeline.fit(full_features, full_labels) return pipeline, opt_score, test_metrics, cmat
def test_area(roi, expected):
    assert fx._feat_area(roi) == expected
def test_ellipticalness(roi, expected):
    assert fx._feat_ellipticalness(roi) == expected
def test_feat_compactness(roi, expected):
    assert fx._feat_compactness(roi) == expected
def test_feat_max_to_avg_f_ratio(trace, expected):
    assert fx._feat_max_to_avg_f_ratio(trace) == expected
    assert fx._feat_max_to_avg_f_minus_np_ratio(trace) == expected
def test_apply_fns(xs, fns, expected):
    assert expected == fx._apply_functions(xs, *fns)
def test_neighborhood_info(roi, expected):
    actual = fx._neighborhood_info(roi)
    for ix, val in enumerate(expected):
        assert val == actual[ix]
def test_roi_width(roi, expected):
    assert expected == fx._feat_roi_width(roi)
def test_roi_height(roi, expected):
    assert expected == fx._feat_roi_height(roi)
def test_last_tenth_trace_skew(trace, expected):
    assert expected == fx._feat_last_tenth_trace_skew(trace)