def main(parser):

    with open(parser.args["roi_masks_path"], "r") as f:
        raw_roi_data = json.load(f)

    roi_data = SparseAndDenseROISchema(many=True).load(raw_roi_data)
    roi_data, excluded_rois = filter_excluded_rois(roi_data)
    rois, metadata, traces, _ = _munge_data(parser, roi_data)
    # TODO: add neuropil traces later
    logger.info(f"Extracting features from '{len(rois)}' ROIs")
    features = FeatureExtractor(rois, traces, metadata).run()
    logger.info(f"Using the following classifier model: "
                f"{parser.args['classifier_model_path']}")
    model = load_model(parser.args["classifier_model_path"])
    logger.info("Classifying ROIs with features")
    predictions = model.predict(features)
    if len(predictions) != len(roi_data):
        raise ValueError(
            f"Expected the number of predictions ({len(predictions)}) to  "
            f"equal the number of input ROIs ({len(roi_data)}), but they "
            "are not the same.")
    for obj, prediction in zip(roi_data, predictions):
        if prediction == 0:
            obj["exclusion_labels"].append(NOT_CELL_EXCLUSION_LABEL)
            obj["valid_roi"] = False

    roi_data.extend(excluded_rois)

    output_data = {
        "classified_rois": roi_data,
        "classifier_model_path": parser.args["classifier_model_path"]
    }
    parser.output(output_data)
    logger.info("ROI classification successfully completed!")
Example #2
def test_feat_fns_by_source(sources, expected, function_class):
    my_class = function_class()
    actual = fx._feat_fns_by_source(my_class, sources=sources)
    # pytest has no built-in comparison helper for dicts of lists,
    # so check keys and membership manually
    for k, vals in expected.items():
        assert k in actual
        assert len(actual[k]) == len(vals)
        for v in vals:
            assert v in actual[k]
Example #3
def test_train_classifier_binary_preds(model, train_data, test_data, drop_cols,
                                       tmp_path):
    """tests that `train_classifier()` generates a classifier that
    makes binary predictions.
    """
    clf, _, _, _ = train.train_classifier(model=model,
                                          training_data_path=train_data,
                                          test_data_path=test_data,
                                          scorer='roc_auc',
                                          max_iter=2,
                                          optimizer="rand",
                                          n_folds=2,
                                          refit=True,
                                          drop_cols=drop_cols)

    # load testing roi and generate input features
    with open(test_data, 'r') as open_testing:
        testing_data = json.load(open_testing)
    features = FeatureExtractor.from_list_of_dict(testing_data).run()

    predictions = clf.predict(features)
    assert set(predictions).issubset({0, 1})
Example #4
def train_classifier(model: str,
                     training_data_path: str,
                     test_data_path: str,
                     scorer: Union[str, Callable],
                     max_iter: int,
                     optimizer: str,
                     test_metrics: Optional[List[str]] = None,
                     n_folds: int = 5,
                     seed: int = 42,
                     refit: bool = True,
                     drop_cols: Optional[List[str]] = None):
    """Tunes and trains a model using hyperopt to optimize
    hyperparameters. Internally uses k-fold cross validation to
    compute optimization metrics. Uses `feature_pipeline` to
    preprocess data. The trained and tuned classifier is appended
    onto the pipeline as the final step.

    Parameters
    ----------
    model: str
        The model algorithm to use. One of "RandomForestClassifier"
        or "LogisticRegression". See `TrainingSchema.MODEL_CHOICES`.
    training_data_path: str
        local path or s3 URI to training data in json format
    test_data_path: str
        local path or s3 URI to test data in json format
    scorer: str or Callable
        A scorer name (see the sklearn model evaluation documentation)
        or a callable scorer. The hyperparameter search optimizes
        this metric.
    max_iter: int
        Maximum number of iterations to evaluate hyperparameters
    optimizer: str
        Optimizer to use. One of "rand" or "suggest". See
        `TrainingSchema.OPTIMIZER_CHOICES`.
    test_metrics: List of str, optional
        Scorers used to report the model's performance on the test
        data set. Defaults to `[scorer]`. See `scorer`.
    n_folds: int, default=5
        Number of folds over which to compute training metrics during
        hyperparameter optimization.
    seed: int, default=42
        Random seed
    refit: bool, default=True
        Whether to refit the model on the combined training and test
        data after evaluation
    drop_cols: List of str, optional
        Feature columns to drop in the preprocessing pipeline (passed
        to `feature_pipeline`)

    Returns
    -------
    4-tuple of:
        Pipeline
            the trained model, in a Pipeline object
        Optimized metric score (cross-validated)
            A tuple of (<scorer_name>, <scores>). The scores are for
            each fold in the training data.
        Test metrics
            A dictionary of score_name: score_value. These scores
            are computed on the test set only.
        confusion matrix
            A confusion matrix in dictionary format with the following
            keys: "TN", "TP", "FN", "FP" (corresponding to
            "true negative", "true positive", "false negative",
            "false positive", respectively).

    Notes
    -----
    When training a model for inference in the Allen Institute's production
    ophys pipeline, the data fed to FeatureExtractor should match between
    here and the production pipeline inference using the model:
    https://github.com/AllenInstitute/ophys_etl_pipelines/blob/37a03ec8d944b688c75da73e201824627d7f7df9/src/ophys_etl/transforms/classification.py#L426-L439  # noqa
    One consideration is that the production inference currently downsamples
    traces from 31Hz to 4Hz.
    Another consideration is the format. This training is written to use
    the FeatureExtractor.from_list_of_dict() method, while production
    inference uses the default constructor.

    """
    logger.info("Reading training data and extracting features.")
    training_data = json_load_local_or_s3(training_data_path)
    test_data = json_load_local_or_s3(test_data_path)
    features = FeatureExtractor.from_list_of_dict(training_data).run()
    test_features = FeatureExtractor.from_list_of_dict(test_data).run()
    labels = [r["label"] for r in training_data]
    test_labels = [r["label"] for r in test_data]

    # Instantiate model and (static) preprocessing pipeline
    pipeline = feature_pipeline(drop_cols=drop_cols)
    tuner_cls = TrainingSchema.MODEL_CHOICES[model]
    optimizer = TrainingSchema.OPTIMIZER_CHOICES[optimizer]
    tuner = tuner_cls(features,
                      labels,
                      pipeline,
                      n_splits=n_folds,
                      scorer=scorer,
                      opt_algo=optimizer,
                      seed=seed)

    # Tune with CV and fit to training data
    logger.info(f"Fitting {model} to data (n={len(features)}).")
    start_time = time.time()
    logger.info(f"Optimizing '{scorer}' over {max_iter} iterations...")
    best_params = tuner.fmax(max_iter)
    end_time = time.time()
    logger.info(f"Considered {max_iter} trials over "
                f"{end_time-start_time} seconds. "
                f"Best Score: {-tuner.trials.best_trial['result']['loss']}\n"
                f"Best Params: {best_params}")
    clf = tuner.classifier(**best_params)
    pipeline.steps.append(("classifier", clf))
    logger.info(f"Fitting {model} model to training data with {best_params}")
    pipeline.fit(features, labels)

    # Compute cross-validation score for optimization metric
    # Note that this metric will be an optimistic estimate of performance,
    # since we are not using nested cross-validation
    cv = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    opt_score = (scorer,
                 cross_val_score(pipeline,
                                 features,
                                 labels,
                                 scoring=scorer,
                                 cv=cv))

    # Compute metrics for test data
    # Default to just computing the optimization metric on test data
    test_metrics = test_metrics or [scorer]
    test_metrics = {
        f"test_{metric}": get_scorer(metric)(pipeline, test_features,
                                             test_labels)
        for metric in test_metrics
    }

    # Compute confusion matrix for arbitrary metric calculation
    cmat = _binary_confusion_dict(test_labels, pipeline.predict(test_features),
                                  "test_")

    # Refit on full data if applicable
    if refit:
        full_features = pd.concat([features, test_features], axis=0)
        full_labels = labels + test_labels
        pipeline.fit(full_features, full_labels)
    return pipeline, opt_score, test_metrics, cmat
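For reference, a hedged usage sketch of `train_classifier` using the argument values exercised in the test above; the file paths and the joblib persistence step are illustrative assumptions, not part of this module:

import joblib  # assumed only for persisting the returned sklearn Pipeline

pipeline, opt_score, test_metrics, cmat = train_classifier(
    model="RandomForestClassifier",         # see TrainingSchema.MODEL_CHOICES
    training_data_path="train_rois.json",   # hypothetical local path
    test_data_path="test_rois.json",        # hypothetical local path
    scorer="roc_auc",
    max_iter=50,
    optimizer="rand",                       # see TrainingSchema.OPTIMIZER_CHOICES
    n_folds=5,
    refit=True,
)
joblib.dump(pipeline, "classifier_model.joblib")  # hypothetical output path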
Example #5
def test_area(roi, expected):
    assert fx._feat_area(roi) == expected
Example #6
def test_ellipticalness(roi, expected):
    assert fx._feat_ellipticalness(roi) == expected
Example #7
def test_feat_compactness(roi, expected):
    assert fx._feat_compactness(roi) == expected
Example #8
def test_feat_max_to_avg_f_ratio(trace, expected):
    assert fx._feat_max_to_avg_f_ratio(trace) == expected
    assert fx._feat_max_to_avg_f_minus_np_ratio(trace) == expected
Example #9
def test_apply_fns(xs, fns, expected):
    assert expected == fx._apply_functions(xs, *fns)
Example #10
def test_neighborhood_info(roi, expected):
    actual = fx._neighborhood_info(roi)
    for ix, val in enumerate(expected):
        assert val == actual[ix]
Example #11
def test_roi_width(roi, expected):
    assert expected == fx._feat_roi_width(roi)
Example #12
def test_roi_height(roi, expected):
    assert expected == fx._feat_roi_height(roi)
Example #13
def test_last_tenth_trace_skew(trace, expected):
    assert expected == fx._feat_last_tenth_trace_skew(trace)