Example #1
import numpy as np

# create_folds, fit_nn, and NeuralCvModel come from the surrounding project.
def fit_cv(X,
           y,
           nepochs=100,
           batch_size=10,
           val_pct=0.1,
           verbose=False,
           lr=3e-4,
           weight_decay=0.01,
           model_type='nonlinear',
           nfolds=5):
    folds = create_folds(X, nfolds)
    models = []
    for fold_idx, fold in enumerate(folds):
        if verbose:
            print('Fitting model {}'.format(fold_idx))
        mask = np.ones(X.shape[0], dtype=bool)
        mask[fold] = False
        models.append(
            fit_nn(X[mask],
                   y[mask],
                   nepochs=nepochs,
                   batch_size=batch_size,
                   val_pct=val_pct,
                   verbose=verbose,
                   lr=lr,
                   weight_decay=weight_decay,
                   model_type=model_type))
    return NeuralCvModel(models, folds)
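The `create_folds` helper used throughout these examples is not shown; judging by its use (it returns one array of row indices per fold), a minimal sketch might look like this, with the shuffling step an assumption:

import numpy as np

def create_folds(X, nfolds):
    # Partition the row indices into nfolds roughly equal-sized index arrays.
    # Shuffling first is an assumption; the original may use a fixed order.
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    return np.array_split(indices, nfolds)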
Example #2
import os

import torch

# load_ccle, ccle_feature_filter, create_folds, fit_cv, fit_elastic_net_ccle,
# CvModel, and print_top_features come from the surrounding project.
def load_plx4720(verbose=False, feature_type="mutation"):
    drug_target = "PLX4720"
    if verbose:
        print("Loading data")
    X_drugs, y_drugs, drugs, cells, features = load_ccle(
        feature_type=feature_type, drug_target=drug_target, normalize=True
    )
    drug_idx = drugs.get_loc(drug_target)
    if verbose:
        print("Drug {}".format(drugs[drug_idx]))
    X_drug, y_drug = X_drugs[drug_idx], y_drugs[drug_idx]

    # Specific to PLX4720: filter out all features with Pearson correlation less than 0.1 in magnitude.
    ccle_expected_features = [
        "C11orf85",
        "FXYD4",
        "SLC28A2",
        "MAML3_MUT",
        "RAD51L1_MUT",
        "GAPDHS",
        "BRAF_MUT",
    ]
    if verbose:
        print("Filtering by correlation with signal first")
    ccle_selected, corrs = ccle_feature_filter(X_drug, y_drug)
    for plx4720_feat in [f for f in ccle_expected_features if f in features]:
        idx = features.get_loc(plx4720_feat)
        ccle_selected[idx] = True
        if verbose:
            print("Correlation for {}: {:.4f}".format(plx4720_feat, corrs[idx]))
    ccle_features = features[ccle_selected]

    # Split the data into 10 folds where each fold contains at least 1 observation of each drug
    nfolds = 10
    drug_folds = [create_folds(x_i, nfolds) for x_i in X_drugs]
    folds = drug_folds[drug_idx]

    # Load or fit the model
    MODEL_PATH = "data/model.pt"
    if os.path.exists(MODEL_PATH):
        elastic_model = torch.load(MODEL_PATH)
    else:
        elastic_model = CvModel(
            fit_cv(X_drug, y_drug, folds, fit_elastic_net_ccle, selected=ccle_selected),
            folds,
            "Elastic Net",
            selected=ccle_selected,
        )
        torch.save(elastic_model, MODEL_PATH)

        # Plot the fit to show it's pretty good
        # plot_ccle_predictions(elastic_model, X_drug, y_drug)

        # Show the features selected by the heuristic if we fit this way (may differ slightly from the paper)
        if verbose:
            print_top_features(elastic_model, ccle_expected_features)

    return X_drug, y_drug, features, ccle_features, elastic_model
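The `ccle_feature_filter` helper is not shown. Based on the comment above (drop features whose Pearson correlation with the signal is below 0.1 in magnitude), a plausible sketch follows; the threshold argument and the (mask, correlations) return convention are assumptions:

import numpy as np

def ccle_feature_filter(X, y, threshold=0.1):
    # Pearson correlation of each feature column with the response, plus a
    # boolean mask keeping features with |r| >= threshold.
    Xc = X - X.mean(axis=0)
    yc = y - y.mean()
    denom = np.sqrt((Xc ** 2).sum(axis=0) * (yc ** 2).sum())
    corrs = Xc.T.dot(yc) / np.maximum(denom, 1e-12)
    return np.abs(corrs) >= threshold, corrs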
Example #3
import os

import joblib

# create_folds, CvModel, fit_cv, fit_extratrees, fit_forest, and
# plot_predictions come from the surrounding project.
def load_or_fit_model(descriptor, X, Y):
    nfolds = 10
    y = Y[descriptor]
    y = y[y.notnull()]
    x = X.loc[y.index].values
    y = y.values

    model_path = 'data/{}.pt'.format(descriptor)
    if os.path.exists(model_path):
        forest_model = joblib.load(model_path)
    else:
        print('Fitting {}'.format(descriptor))
        folds = create_folds(x, nfolds)
        if descriptor == 'Intensity':
            forest_model = CvModel(fit_cv(x, y, folds, fit_extratrees), folds,
                                   'ExtraTrees')
        else:
            forest_model = CvModel(fit_cv(x, y, folds, fit_forest), folds,
                                   'RandomForest')

        plot_predictions(forest_model, x, y, descriptor)
        joblib.dump(forest_model, model_path)

    return x, y, forest_model
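Examples #2 and #3 both follow the same load-or-fit caching pattern; factored out, it amounts to the sketch below (the `cached_fit` name is hypothetical):

import os

import joblib

def cached_fit(path, fit_fn, *args, **kwargs):
    # Return the model cached at path if present; otherwise fit and cache it.
    if os.path.exists(path):
        return joblib.load(path)
    model = fit_fn(*args, **kwargs)
    joblib.dump(model, path)
    return model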
Example #4
import numpy as np

# create_folds, DiscreteBootstrapConditionalModel, fit_classifier, tv_test,
# CrossValidationSampler, and HoldoutSampler come from the surrounding project.
def calibrate_discrete(
    X,
    feature,
    X_test=None,
    nquantiles=101,
    nbootstraps=100,
    nfolds=5,
    tv_threshold=0.005,
    p_threshold=0.0,
    use_cv=False,
):
    """Calibrates a bootstrap confidence interval conditional model for a given feature."""
    classes = np.unique(X[:, feature])
    nclasses = len(classes)

    # Search over a linear grid of quantiles
    quantile_range = np.linspace(0, 100, nquantiles)

    jmask = np.ones(X.shape[1], dtype=bool)
    jmask[feature] = False
    if X_test is None and use_cv:
        # Use k-fold cross-validation to generate conditional probability estimates for X_j
        print("Fitting using {} bootstrap resamples and {} folds".format(
            nbootstraps, nfolds))
        probs = np.zeros((nquantiles, X.shape[0], nclasses))
        proposals = []
        folds = create_folds(X, nfolds)
        for fold_idx, fold in enumerate(folds):
            print('Fitting fold {}'.format(fold_idx))
            imask = np.ones(X.shape[0], dtype=bool)
            imask[fold] = False
            model = DiscreteBootstrapConditionalModel(
                X[imask][:, jmask],
                X[imask][:, feature],
                fit_classifier,
                nbootstraps=nbootstraps,
            )
            # probs[:,fold] = model.pmf_quantiles(X[fold][:,jmask], X[fold][:,feature], quantile_range, axis=0)
            # Index classes by position so non-contiguous class labels work,
            # matching the held-out branch below.
            for cidx, c in enumerate(classes):
                probs[:, fold, cidx] = model.pmf_quantiles(X[fold][:, jmask],
                                                           c,
                                                           quantile_range,
                                                           axis=0)
            proposals.append(model)
        # sampler = lambda l, u: sample_cv(X[:,jmask], proposals, folds, l, u)
        sampler = CrossValidationSampler(X[:, jmask], proposals, folds)
        outcomes = np.array([(X[:, feature] == c).mean() for c in classes])
    else:
        if X_test is None:
            print("Using training set as testing set.")
            X_test = X
        # Use a held-out test set
        print(
            "Fitting using {} bootstrap resamples and a {}/{} train/test split"
            .format(nbootstraps, X.shape[0], X_test.shape[0]))
        model = DiscreteBootstrapConditionalModel(X[:, jmask],
                                                  X[:, feature],
                                                  fit_classifier,
                                                  nbootstraps=nbootstraps)
        probs = np.zeros((nquantiles, X_test.shape[0], nclasses))
        for cidx, c in enumerate(classes):
            probs[:, :, cidx] = model.pmf_quantiles(X_test[:, jmask],
                                                    c,
                                                    quantile_range,
                                                    axis=0)
        # sampler = lambda l, u: sample_holdout(X_test[:,jmask], model, l, u)
        sampler = HoldoutSampler(X_test[:, jmask], model)
        outcomes = np.array([(X_test[:, feature] == c).mean()
                             for c in classes])

    # Find the lower quantile that forms a sufficient lower bound on the observed probabilities
    for i in range(1, nquantiles // 2):
        lower = quantile_range[nquantiles // 2 - i]
        qlower = probs[nquantiles // 2 - i]
        tv_lower = (qlower.mean(axis=0) - outcomes).clip(0, np.inf).sum()
        tv_pvalue = tv_test(tv_lower, outcomes, probs.shape[1])
        # print('Lower: {} TV: {} p: {}'.format(lower, tv_lower, tv_pvalue))

        # Allow some error tolerance due to noise/finite data
        if tv_lower <= tv_threshold or tv_pvalue <= p_threshold:
            break

    # Find the upper quantile
    for i in range(1, nquantiles // 2):
        upper = quantile_range[nquantiles // 2 + i]
        qupper = probs[nquantiles // 2 + i]
        tv_upper = (outcomes - qupper.mean(axis=0)).clip(0, np.inf).sum()

        tv_pvalue = tv_test(tv_upper, outcomes, probs.shape[1])
        # print('Upper: {} TV: {} p: {}'.format(upper, tv_upper, tv_pvalue))

        # Allow some error tolerance due to noise/finite data
        if tv_upper <= tv_threshold or tv_pvalue <= p_threshold:
            break

    # Our TV-distance is the worst-case of the two bounds
    tv_stat = np.max([tv_lower, tv_upper])
    sampler.quantiles = np.array([lower, upper])

    # TODO: how do we get a p-value estimate here? no clear notion of null...

    print("Selected intervals: [{},{}]".format(lower, upper))

    return {
        "model": model,
        "probs": probs,
        "tv_stat": tv_stat,
        "upper": upper,
        "lower": lower,
        "qupper": qupper,
        "qlower": qlower,
        "quantiles": quantile_range,
        "sampler": sampler,
    }
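Both quantile searches above rate a candidate quantile by a one-sided total-variation gap between the mean predicted pmf at that quantile and the empirical class frequencies (the upper search flips the sign of the difference). Factored out of the loops, with a hypothetical name:

import numpy as np

def one_sided_tv(pmf_at_quantile, outcomes):
    # pmf_at_quantile: (nsamples, nclasses) predicted pmf at one quantile level.
    # outcomes: (nclasses,) empirical class frequencies.
    # Sum only the positive gaps, matching .clip(0, np.inf).sum() above.
    return (pmf_at_quantile.mean(axis=0) - outcomes).clip(0, np.inf).sum()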
Example #5
import numpy as np

# create_folds, BootstrapConditionalModel, fit_mdn, ks_test,
# CrossValidationSampler, and HoldoutSampler come from the surrounding project.
def calibrate_continuous(X, feature,
                         X_test=None, nquantiles=101, nbootstraps=100,
                         nfolds=5, ks_threshold=0.005, p_threshold=0.,
                         use_cv=False):
    '''Calibrates a bootstrap confidence interval conditional model for a given feature.'''
    # Search over a linear grid of quantiles
    quantile_range = np.linspace(0, 100, nquantiles)

    jmask = np.ones(X.shape[1], dtype=bool)
    jmask[feature] = False
    if X_test is None and use_cv:
        # Use k-fold cross-validation to generate conditional density estimates for X_j
        print('Fitting using {} bootstrap resamples and {} folds'.format(nbootstraps, nfolds))
        cdfs = np.zeros((nquantiles, X.shape[0]))
        proposals = []
        folds = create_folds(X, nfolds)
        for fold_idx, fold in enumerate(folds):
            imask = np.ones(X.shape[0], dtype=bool)
            imask[fold] = False
            model = BootstrapConditionalModel(X[imask][:,jmask], X[imask][:,feature], fit_mdn, nbootstraps=nbootstraps)
            cdfs[:,fold] = model.cdf_quantiles(X[fold][:,jmask], X[fold][:,feature], quantile_range, axis=0)
            proposals.append(model)
        sampler = CrossValidationSampler(X[:,jmask], proposals, folds)
    else:
        if X_test is None:
            print('Using training set as testing set.')
            X_test = X
        # Use a held-out test set
        print('Fitting using {} bootstrap resamples and a {}/{} train/test split'.format(nbootstraps, X.shape[0], X_test.shape[0]))
        model = BootstrapConditionalModel(X[:,jmask], X[:,feature], fit_mdn, nbootstraps=nbootstraps)
        cdfs = model.cdf_quantiles(X_test[:,jmask], X_test[:,feature], quantile_range, axis=0)
        sampler = HoldoutSampler(X_test[:,jmask], model)

    # Look at the bounds of the CDF along a discrete grid of points
    ks_grid = np.linspace(1e-6,1-1e-6,1001)

    # Find the lower quantile that forms a sufficient upper bound on the uniform CDF
    for i in range(1,nquantiles//2):
        lower = quantile_range[nquantiles//2 - i]
        qlower = cdfs[nquantiles//2 - i]
        
        # The U(0,1) CDF is the identity line from (0,0) to (1,1). So at every
        # point q on the grid, a well-calibrated model should have q*N points
        # with CDF value at most q. Here we are looking for an upper bound, so
        # we measure the KS distance as the maximum amount by which the U(0,1)
        # CDF sits above the predicted CDF.
        ks_lower = 0
        for ks_point in ks_grid:
            ks_lower = max(ks_lower, ks_point - (qlower <= ks_point).mean())

        ks_pvalue = ks_test(ks_lower, cdfs.shape[1])
        # print('Lower: {} KS: {} p: {}'.format(lower, ks_lower, ks_pvalue))

        # Allow some error tolerance due to noise/finite data
        if ks_lower <= ks_threshold or ks_pvalue <= p_threshold:
            break

    # Find the upper quantile
    for i in range(1,nquantiles//2):
        upper = quantile_range[nquantiles//2+i]
        qupper = cdfs[nquantiles//2 + i]

        # As above, the U(0,1) CDF is the identity line from (0,0) to (1,1).
        # Here we are looking for a lower bound, so we measure the KS distance
        # as the maximum amount by which the U(0,1) CDF sits below the
        # predicted CDF.
        ks_upper = 0
        for ks_point in ks_grid:
            ks_upper = max(ks_upper, (qupper <= ks_point).mean() - ks_point)

        ks_pvalue = ks_test(ks_upper, cdfs.shape[1])
        # print('Upper: {} KS: {} p: {}'.format(upper, ks_upper, ks_pvalue))

        # Allow some error tolerance due to noise/finite data
        if ks_upper <= ks_threshold or ks_pvalue <= p_threshold:
            break
        

    # Set the sampler to the chosen regions
    sampler.quantiles = np.array([lower, upper])

    # Our KS-distance is the worst-case of the two bounds
    ks_stat = np.max([ks_lower, ks_upper])

    # The p-value on the KS test that the bounded distribution is different
    # from the Uniform distribution
    ks_pvalue = ks_test(ks_stat, cdfs.shape[1])

    print('Selected interval: [{},{}]'.format(lower, upper))

    return {'model': model,
            'cdfs': cdfs,
            'ks_stat': ks_stat,
            'ks_pvalue': ks_pvalue,
            'upper': upper,
            'lower': lower,
            'qupper': qupper,
            'qlower': qlower,
            'quantiles': quantile_range,
            'sampler': sampler
            }
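The `ks_test` helper is not shown. A plausible form, assuming it returns the asymptotic one-sample Kolmogorov-Smirnov p-value for a statistic computed from n points, is:

import numpy as np
from scipy.stats import kstwobign

def ks_test(ks_stat, n):
    # Survival function of the asymptotic Kolmogorov distribution at
    # sqrt(n) * D. This form is an assumption about the original helper.
    return kstwobign.sf(ks_stat * np.sqrt(n))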
Example #6
import numpy as np

# load_or_create_dataset, ModelInfo, get_model, get_conditional, hrt, and the
# fit_* model constructors come from the surrounding project.
def run(trial, feature, reset=False):
    N = 500  # total number of samples
    P = 500  # number of features
    S = 40  # number of signal features
    nperms = 5000
    fdr_threshold = 0.1
    nfolds = 5

    X, y, truth = load_or_create_dataset(trial, N, P, S)

    np.random.seed(trial * P + feature)

    infos = [
        ModelInfo(trial, "Partial Least Squares", fit_pls, "pls"),
        ModelInfo(trial, "Lasso", fit_lasso_cv, "lasso"),
        ModelInfo(trial, "Elastic Net", fit_elastic_net_cv, "enet"),
        ModelInfo(trial, "Bayesian Ridge", fit_bridge, "bridge"),
        ModelInfo(trial, "Polynomial Kernel Ridge", fit_kridge, "kridge"),
        ModelInfo(trial, "RBF Support Vector", fit_svr, "svr"),
        ModelInfo(trial, "Random Forest", fit_forest, "rf")
        # ModelInfo(trial, 'Extra Trees', fit_extratrees, 'xtrees')
    ]

    folds = get_model(infos[0], X, y, create_folds(X, nfolds), reset).folds
    models = [get_model(info, X, y, folds, reset) for info in infos]

    # Create the test statistic for each model
    # tstats = [(lambda X_target: ((y - model.predict(X_target))**2).mean()) for model in models]

    # Load the conditional model for this feature
    conditional = get_conditional(trial, feature)

    # Run the normal CVRT for the first model, but save the null samples to
    # avoid recomputing them for the rest of the models.
    info, model = infos[0], models[0]
    tstat = lambda X_target: ((y - model.predict(X_target))**2).mean()
    print("Running CVRT for {}".format(info.name))
    results = hrt(
        feature,
        tstat,
        X,
        nperms=nperms,
        conditional=conditional,
        lower=conditional.quantiles[0],
        upper=conditional.quantiles[1],
        save_nulls=True,
    )
    p_value = results["p_value"]
    print("p={}".format(p_value))
    np.save("data/{}/{}_{}.npy".format(trial, info.prefix, feature), p_value)

    # Get the relevant values from the full CVRT on the first model
    t_true = results["t_stat"]
    X_nulls = results["samples_null"]
    quantile_nulls = results["quantiles_null"]

    # Run the CVRTs for the remaining models using the same null samples
    X_null = np.copy(X)
    for info, model in zip(infos[1:], models[1:]):
        print("Running cached CVRT for {}".format(info.name))
        t_weights = np.full(nperms, np.nan)
        t_null = np.full(nperms, np.nan)
        tstat = lambda X_target: ((y - model.predict(X_target))**2).mean()
        t_true = tstat(X)
        for perm in range(nperms):
            if (perm % 500) == 0:
                print("Trial {}".format(perm))

            # Get the test-statistic under the null
            X_null[:, feature] = X_nulls[perm]
            t_null[perm] = tstat(X_null)
            if t_null[perm] <= t_true:
                # Over-estimate the likelihood
                t_weights[perm] = quantile_nulls[perm, 1]
            else:
                # Under-estimate the likelihood
                t_weights[perm] = quantile_nulls[perm, 0]

        p_value = t_weights[t_null <= t_true].sum() / t_weights.sum()
        print("p={}".format(p_value))
        np.save("data/{}/{}_{}.npy".format(trial, info.prefix, feature),
                p_value)
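The weighted p-value computed at the end of the loop is the importance-weighted analogue of a plain permutation p-value: permutations whose null statistic is no larger than the observed one contribute their weight (here, the calibrated quantile bounds) instead of a flat count. Factored out, with a hypothetical name:

import numpy as np

def weighted_pvalue(t_null, t_true, t_weights):
    # Weighted fraction of null statistics that are at least as small as the
    # observed statistic (a smaller held-out error means the null sample did
    # at least as well as the real feature).
    return t_weights[t_null <= t_true].sum() / t_weights.sum()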