# Assumed imports (omitted from the snippet); GridSearchCV here may equally be
# dask_ml.model_selection.GridSearchCV, which talks to the Dask client directly.
from dask.distributed import Client, LocalCluster
from sklearn.datasets import load_digits
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split as tts
from sklearn.svm import SVC


def main():
    param_space = {
        'C': [1e-4, 1, 1e4],
        'gamma': [1e-3, 1, 1e3],
        'class_weight': [None, 'balanced']
    }

    model = SVC(kernel='rbf')

    digits = load_digits()

    X_train, X_test, y_train, y_test = tts(digits.data,
                                           digits.target,
                                           test_size=0.3)

    print("Starting local cluster")
    cluster = LocalCluster()
    client = Client(cluster)
    print(client)

    print("Start searching")
    search = GridSearchCV(model, param_space, cv=3)
    search.fit(X_train, y_train)

    print("Prepare report")
    print(
        classification_report(y_true=y_test,
                              y_pred=search.best_estimator_.predict(X_test)))
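The snippet above starts a Dask LocalCluster but, if GridSearchCV is scikit-learn's, the search only uses the cluster when the fit is routed through joblib's Dask backend (dask_ml's GridSearchCV is the drop-in alternative). A minimal sketch of the joblib pattern, under that assumption:

import joblib
from dask.distributed import Client, LocalCluster
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

digits = load_digits()
client = Client(LocalCluster())   # importing dask.distributed registers the "dask" joblib backend
search = GridSearchCV(SVC(kernel="rbf"), {"C": [1e-4, 1, 1e4]}, cv=3, n_jobs=-1)
with joblib.parallel_backend("dask"):   # scikit-learn's joblib tasks now run on the Dask workers
    search.fit(digits.data, digits.target)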
Example #2
def getprobabilities(X, y, p_grid, cv, ac_sens):
    """getprobabilities(X,y,p_grid, cv, ac_sens)
    
    - X and y: Inputs and outputs
    - p_grid: grid of parameters to search over
    - cv: Number of cross-validation folds (this is different from the outer number of x-val folds)
    
    Gets the probability of picking each of the options provided by p_grid given the data in X and y.
    The algorithm is as follows:
     - Find the sensitivity of the SSE for each parameter-values
     - Find the SSE of each parameter-values
     - Find the probability of selecting those parameter-values
    """
    kern = GPy.kern.RBF(2.0, lengthscale=25.0, variance=1.0)
    errorlimit = ac_sens * 4.0

    ####find sensitivity of the SSE (for each param combo)
    #this call gets the sensitivities, not the scores:
    #TODO This probably should be done locally as it's quick.
    clf = GridSearchCV(estimator=DPCloaking(sensitivity=ac_sens,
                                            inducing=4,
                                            getxvalfoldsensitivities=True,
                                            kern=kern,
                                            errorlimit=errorlimit),
                       param_grid=p_grid,
                       cv=cv)
    clf.fit(X, y)

    nparamcombos = len(clf.cv_results_['mean_test_score'])
    temp_sens = np.zeros([clf.cv, nparamcombos])
    for k in range(clf.cv):
        temp_sens[k, :] = clf.cv_results_['split%d_test_score' % k]
    #sensitivity of the sum squared error:
    print(np.sort(temp_sens, axis=0))
    sse_sens = ac_sens**2 + 2 * ac_sens * errorlimit + ac_sens**2 * np.max(
        np.sum(np.sort(temp_sens, axis=0)[0:clf.cv - 1, :], 0))

    ####find the SSE (for each param combo)
    clf = GridSearchCV(estimator=DPCloaking(sensitivity=ac_sens,
                                            inducing=4,
                                            getxvalfoldsensitivities=False,
                                            kern=kern,
                                            errorlimit=errorlimit),
                       param_grid=p_grid,
                       cv=cv)
    clf.fit(X, y)

    nparamcombos = len(clf.cv_results_['mean_test_score'])
    temp_scores = np.zeros([clf.cv, nparamcombos])
    for k in range(clf.cv):
        temp_scores[k, :] = clf.cv_results_['split%d_test_score' % k]
    scores = np.sum(temp_scores, 0)

    ####compute the probability of selecting that param combo using the exponential mechanism
    selection_epsilon = 1
    param_probabilities = np.exp(selection_epsilon * scores / (2 * sse_sens))
    param_probabilities = param_probabilities / np.sum(param_probabilities)

    return param_probabilities
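A hedged usage sketch of the returned distribution (the weights below are placeholders, not real outputs): the exponential mechanism then selects one parameter combination by sampling from these probabilities.

import numpy as np

# Placeholder weights standing in for exp(selection_epsilon * score / (2 * sse_sens)).
weights = np.array([1.0, 6.0, 3.0])
probs = weights / weights.sum()            # what getprobabilities returns
rng = np.random.default_rng(0)
chosen = rng.choice(len(probs), p=probs)   # one draw of the exponential mechanism
print("selected parameter combination index:", chosen)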
Example #3
def getscores(X, y, p_grid, cv, ac_sens):
    """
    Compute the negative RMSE of each of the fold/param combos
    """
    kern = GPy.kern.RBF(2.0, lengthscale=25.0, variance=1.0)

    clf = GridSearchCV(estimator=DPCloaking(sensitivity=ac_sens,
                                            inducing=4,
                                            getxvalfoldsensitivities=False,
                                            kern=kern),
                       scoring='neg_mean_squared_error',
                       param_grid=p_grid,
                       cv=cv)
    clf.fit(X, y)

    nparamcombos = len(clf.cv_results_['mean_test_score'])
    scores = np.zeros([clf.cv, nparamcombos])
    for k in range(clf.cv):
        scores[k, :] = clf.cv_results_['split%d_test_score' % k]
    return scores
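The matrix returned by getscores has one row per CV fold and one column per parameter combination; a small sketch (with a placeholder matrix) of the usual aggregation into a single score per combination, as getprobabilities does above:

import numpy as np

fold_scores = np.array([[-1.2, -0.8],   # rows: CV folds
                        [-1.1, -0.9],   # columns: parameter combinations
                        [-1.3, -0.7]])
per_combo = fold_scores.sum(axis=0)      # summed negative MSE per combination
print("best combination index:", per_combo.argmax())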
Example #4
def main():

    param_space = {
        "C": [1e-4, 1, 1e4],
        "gamma": [1e-3, 1, 1e3],
        "class_weight": [None, "balanced"],
        "kernel": ["linear", "poly", "rbf", "sigmoid"]
    }
    model = SVC()

    #     param_space = {"n_estimators":[100, 200, 300, 400, 500],
    #                                    "criterion":["gini", "entropy"],
    #                                    "max_features":["auto", "sqrt", "log2"],
    #                                    "max_depth":[2, 3, 4, 5, 6, 7, 8]}
    #     model = RandomForestClassifier()

    digits = load_digits()

    classifier = GridSearchCV(model, param_space, n_jobs=-1, cv=5)

    classifier.fit(digits.data, digits.target)

    print("Grid Scores:")
    means = classifier.cv_results_["mean_test_score"]
    standard_deviations = classifier.cv_results_["std_test_score"]
    for mean, standard_deviation, parameter in zip(
            means, standard_deviations, classifier.cv_results_["params"]):
        print("%0.3f (+/-%0.03f) for %r" %
              (mean, standard_deviation * 2, parameter))
    print()

    print("Best Score: %0.3f" % (classifier.best_score_))
    print()
    print("Best Parameters:")
    print(classifier.best_params_)
    print()
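A hedged follow-up that could be appended to main() above (assuming pandas is available alongside the snippet's other imports): the full cv_results_ dict is easier to scan as a DataFrame sorted by rank.

import pandas as pd

results = (pd.DataFrame(classifier.cv_results_)
           .sort_values("rank_test_score")
           [["params", "mean_test_score", "std_test_score", "rank_test_score"]])
print(results.head(10))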
def evaluate_classifier(X,
                        y,
                        list_of_queries,
                        set_k_range, k_function,
                        alpha_range, 
                        l1_ratio):
    
    ''' Run a classifier setup on a set of queries.
    
        Loop through each query; train and test the classifier using the
        hyperparameters passed in as arguments; populate the metrics dictionary
        with some metrics of which parameters were selected and how well
        the classifier did for that query.
    '''
    
    # A dictionary to hold the performance metrics.
    metrics_dict = {}
    
    # Loop through each query; train and test the classifier; populate the metrics dictionary.
    for query in list_of_queries:
        num_samples = query[2]['total']
        num_positives = query[2]['positive']
        
        # Subset by gene.
        y_query = y[query[0]]
        # Subset by diseases.
        disease_cols = [col for col in covariates.columns if col.endswith(tuple(query[1]))]
        has_disease = covariates[disease_cols].max(axis=1) > 0
        covariates_query = covariates[has_disease]
        X_query = X[X.index.isin(covariates_query.index)]
        y_query = y_query[y_query.index.isin(covariates_query.index)]
                
        # Train/test split
        test_size = 0.2
        X_train, X_test, y_train, y_test = train_test_split(X_query, y_query, stratify=y_query, test_size=test_size, random_state=RANDOMSEED)
        # PCA.
        scaler = StandardScaler()
        if query[2]['total']*(1-test_size)*(1-(1/3)) > 350:
            n_comp = 350
        else:
            n_comp = int(query[2]['total']*(1-test_size) - 1)
        pca = PCA(n_components = n_comp, random_state = RANDOMSEED)
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        pca.fit(X_train_scaled)
        X_train = pca.transform(X_train_scaled)
        X_test_scaled = scaler.transform(X_test)
        X_test = pca.transform(X_test_scaled)
        
        if set_k_range:
            k_range = set_k_range
        else:
            k_range = k_function(num_samples=num_samples,
                                 num_positives=num_positives,
                                 )     
        # Parameter Sweep for Hyperparameters
        param_grid = {
            'select__k': k_range,
            'classify__loss': ['log'],
            'classify__penalty': ['elasticnet'],
            'classify__alpha': alpha_range,
            'classify__l1_ratio': l1_ratio,
        }
        pipeline = Pipeline(steps=[
            ('select', SelectKBest(variance_scorer)),
            ('classify', SGDClassifier(random_state=RANDOMSEED, class_weight='balanced'))
        ])
        cv_pipeline = GridSearchCV(estimator=pipeline, 
                                   param_grid=param_grid,
                                   n_jobs=1, 
                                   scoring='roc_auc')
        cv_pipeline.fit(X=X_train, y=y_train)
        y_pred_train = cv_pipeline.decision_function(X_train)
        y_pred_test = cv_pipeline.decision_function(X_test)
        # Get ROC info.
        def get_threshold_metrics(y_true, y_pred):
            roc_columns = ['fpr', 'tpr', 'threshold']
            roc_items = zip(roc_columns, roc_curve(y_true, y_pred))
            roc_df = pd.DataFrame(dict(roc_items))  # DataFrame.from_items was removed in newer pandas
            auroc = roc_auc_score(y_true, y_pred)
            return {'auroc': auroc, 'roc_df': roc_df}
        metrics_train = get_threshold_metrics(y_train, y_pred_train)
        metrics_test = get_threshold_metrics(y_test, y_pred_test)

        # Populate the metrics dictionary.
        # Get metrics for the classifier.
        overfit = metrics_train['auroc'] - metrics_test['auroc']
        # Understand how the parameter grid worked... any params at the edge?
        if cv_pipeline.best_params_['select__k'] == min(param_grid['select__k']):
            n_comp_status = 'min'
        elif cv_pipeline.best_params_['select__k'] == max(param_grid['select__k']):
            n_comp_status = 'max'
        else:
            n_comp_status = 'OK'
        if cv_pipeline.best_params_['classify__alpha'] == min(param_grid['classify__alpha']):
            alpha_status = 'min'
        elif cv_pipeline.best_params_['classify__alpha'] == max(param_grid['classify__alpha']):
            alpha_status = 'max'
        else:
            alpha_status = 'OK'
        metrics = {'num_samples': num_samples,
                   'num_positive': num_positives,
                   'balance': num_positives/num_samples,
                   'train_auroc': metrics_train['auroc'], 
                   'test_auroc': metrics_test['auroc'],
                   'n_components': cv_pipeline.best_params_['select__k'], 
                   'alpha': cv_pipeline.best_params_['classify__alpha'],
                   'overfit': overfit,
                   'n_comp_status': n_comp_status,
                   'alpha_status': alpha_status
                  }
        # Add the metrics to the dictionary.
        metrics_dict[query[0]+str(query[2]['total'])] = metrics
    # Change the metrics dict into a formatted pandas dataframe.
    metrics_df = pd.DataFrame(metrics_dict)
    metrics_df = metrics_df.T
    metrics_df.sort_values(by='num_positive', ascending=True, inplace=True)
    metrics_df = metrics_df[['num_samples', 'num_positive', 'balance', 'n_components','n_comp_status', 'alpha', 'alpha_status','train_auroc', 'test_auroc', 'overfit']]
    
    return(metrics_df)
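SelectKBest(variance_scorer) relies on a variance_scorer helper that is not shown in this excerpt; a minimal sketch, assuming it simply ranks features by their variance (SelectKBest accepts a score function that returns one score per feature):

import numpy as np

def variance_scorer(X, y):
    """Score each feature by its variance; y is ignored but required by SelectKBest's API."""
    return np.var(np.asarray(X), axis=0)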
Example #6
clf_parameters = {
    'classify__loss': ['log'],
    'classify__penalty': ['elasticnet'],
    'classify__alpha': alphas,
    'classify__l1_ratio': l1_ratios
}

estimator = Pipeline(steps=[(
    'classify',
    SGDClassifier(random_state=0, class_weight='balanced', loss='log'))])

cv_pipeline = GridSearchCV(estimator=estimator,
                           param_grid=clf_parameters,
                           n_jobs=-1,
                           cv=folds,
                           scoring='roc_auc')
cv_pipeline.fit(X=x_train, y=y_train)

cv_results = pd.concat([
    pd.DataFrame(cv_pipeline.cv_results_).drop('params', axis=1),
    pd.DataFrame.from_records(cv_pipeline.cv_results_['params'])
],
                       axis=1)

# Cross-validated performance heatmap
cv_score_mat = pd.pivot_table(cv_results,
                              values='mean_test_score',
                              index='classify__l1_ratio',
                              columns='classify__alpha')
ax = sns.heatmap(cv_score_mat, annot=True, fmt='.1%')
ax.set_xlabel('Regularization strength multiplier (alpha)')
ax.set_ylabel('Elastic net mixing parameter (l1_ratio)')
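To persist the heatmap above (assuming matplotlib is imported alongside seaborn), the figure can be pulled from the returned Axes:

import matplotlib.pyplot as plt

fig = ax.get_figure()                     # `ax` is the Axes returned by sns.heatmap above
fig.tight_layout()
fig.savefig("cv_heatmap.png", dpi=300)    # hypothetical output path
plt.close(fig)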
def evaluate_classifier(X_train, X_test, y, y_train_allgenes, y_test_allgenes,
                        list_of_genes, set_k_range, k_function, alpha_range,
                        l1_ratio):
    ''' Run a classifier setup on a set of queries.
    
        Loop through each query; train and test the classifier using the
        hyperparameters passed in as arguments; populate the metrics dictionary
        with some metrics of which parameters were selected and how well
        the classifier did for that query.
    '''

    # A dictionary to hold the performance metrics.
    metrics_dict = {}

    # Loop through each query; train and test the classifier; populate the metrics dictionary.
    for gene in list_of_genes:

        # Train and test the classifier.

        y_gene = y[gene]
        y_train = y_train_allgenes[gene]
        y_test = y_test_allgenes[gene]
        num_positives = int(y_gene.value_counts(True)[1] * len(y_gene))
        if set_k_range:
            k_range = set_k_range
        else:
            k_range = k_function(num_positives)
        # Parameter Sweep for Hyperparameters
        param_grid = {
            'select__k': k_range,
            'classify__loss': ['log'],
            'classify__penalty': ['elasticnet'],
            'classify__alpha': alpha_range,
            'classify__l1_ratio': l1_ratio,
        }
        pipeline = Pipeline(steps=[('select', SelectKBest(variance_scorer)),
                                   ('classify',
                                    SGDClassifier(random_state=RANDOMSEED,
                                                  class_weight='balanced'))])
        cv_pipeline = GridSearchCV(estimator=pipeline,
                                   param_grid=param_grid,
                                   n_jobs=1,
                                   scoring='roc_auc')
        cv_pipeline.fit(X=X_train, y=y_train)
        y_pred_train = cv_pipeline.decision_function(X_train)
        y_pred_test = cv_pipeline.decision_function(X_test)

        # Get ROC info.
        def get_threshold_metrics(y_true, y_pred):
            roc_columns = ['fpr', 'tpr', 'threshold']
            roc_items = zip(roc_columns, roc_curve(y_true, y_pred))
            roc_df = pd.DataFrame(dict(roc_items))  # DataFrame.from_items was removed in newer pandas
            auroc = roc_auc_score(y_true, y_pred)
            return {'auroc': auroc, 'roc_df': roc_df}

        metrics_train = get_threshold_metrics(y_train, y_pred_train)
        metrics_test = get_threshold_metrics(y_test, y_pred_test)

        # Populate the metrics dictionary.

        # Get metrics for the classifier.
        overfit = metrics_train['auroc'] - metrics_test['auroc']
        # Understand how the parameter grid worked... any params at the edge?
        if cv_pipeline.best_params_['select__k'] == min(
                param_grid['select__k']):
            n_comp_status = 'min'
        elif cv_pipeline.best_params_['select__k'] == max(
                param_grid['select__k']):
            n_comp_status = 'max'
        else:
            n_comp_status = 'OK'
        if cv_pipeline.best_params_['classify__alpha'] == min(
                param_grid['classify__alpha']):
            alpha_status = 'min'
        elif cv_pipeline.best_params_['classify__alpha'] == max(
                param_grid['classify__alpha']):
            alpha_status = 'max'
        else:
            alpha_status = 'OK'
        metrics = {
            'num_positive': num_positives,
            'train_auroc': metrics_train['auroc'],
            'test_auroc': metrics_test['auroc'],
            'n_components': cv_pipeline.best_params_['select__k'],
            'alpha': cv_pipeline.best_params_['classify__alpha'],
            'overfit': overfit,
            'n_comp_status': n_comp_status,
            'alpha_status': alpha_status
        }
        # Add the metrics to the dictionary.
        metrics_dict[gene] = metrics
    # Change the metrics dict into a formatted pandas dataframe.
    metrics_df = pd.DataFrame(metrics_dict)
    metrics_df = metrics_df.T
    metrics_df.sort_values(by='num_positive', ascending=True, inplace=True)
    metrics_df = metrics_df[[
        'num_positive', 'n_components', 'n_comp_status', 'alpha',
        'alpha_status', 'train_auroc', 'test_auroc', 'overfit'
    ]]

    return (metrics_df)
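get_threshold_metrics returns the ROC points alongside the AUROC; a small hedged sketch (standalone, with synthetic labels and scores standing in for y_test / y_pred_test) of plotting that curve with matplotlib:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve

y_true = np.array([0, 0, 1, 1, 0, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.9])

fpr, tpr, thresholds = roc_curve(y_true, y_score)
roc_df = pd.DataFrame({"fpr": fpr, "tpr": tpr, "threshold": thresholds})
plt.plot(roc_df["fpr"], roc_df["tpr"],
         label=f"AUROC = {roc_auc_score(y_true, y_score):.3f}")
plt.plot([0, 1], [0, 1], linestyle="--")   # chance line
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend()
plt.show()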
def evaluate_classifier(X_train, X_test,
                        y, y_train_allgenes, y_test_allgenes,
                        list_of_genes,
                        set_k_range, k_function,
                        alpha_range, 
                        l1_ratio):
    
    ''' Run a classifier setup on a set of queries.
    
        Loop through each query; train and test the classifier using the
        hyperparameters passed in as arguments; populate the metrics dictionary
        with some metrics of which parameters were selected and how well
        the classifier did for that query.
    '''
    
    # A dictionary to hold the performance metrics.
    metrics_dict = {}
    
    # Loop through each query; train and test the classifier; populate the metrics dictionary.
    for gene in list_of_genes:
        
        # Train and test the classifier.
        
        y_gene = y[gene]
        y_train = y_train_allgenes[gene]
        y_test = y_test_allgenes[gene]
        num_positives = int(y_gene.value_counts(True)[1]*len(y_gene))
        if set_k_range:
            k_range = set_k_range
        else:
            k_range = k_function(num_positives)     
        # Parameter Sweep for Hyperparameters
        param_grid = {
            'select__k': k_range,
            'classify__loss': ['log'],
            'classify__penalty': ['elasticnet'],
            'classify__alpha': alpha_range,
            'classify__l1_ratio': l1_ratio,
        }
        pipeline = Pipeline(steps=[
            ('select', SelectKBest(variance_scorer)),
            ('classify', SGDClassifier(random_state=RANDOMSEED, class_weight='balanced'))
        ])
        cv_pipeline = GridSearchCV(estimator=pipeline, 
                                   param_grid=param_grid,
                                   n_jobs=1, 
                                   scoring='roc_auc')
        cv_pipeline.fit(X=X_train, y=y_train)
        y_pred_train = cv_pipeline.decision_function(X_train)
        y_pred_test = cv_pipeline.decision_function(X_test)
        # Get ROC info.
        def get_threshold_metrics(y_true, y_pred):
            roc_columns = ['fpr', 'tpr', 'threshold']
            roc_items = zip(roc_columns, roc_curve(y_true, y_pred))
            roc_df = pd.DataFrame(dict(roc_items))  # DataFrame.from_items was removed in newer pandas
            auroc = roc_auc_score(y_true, y_pred)
            return {'auroc': auroc, 'roc_df': roc_df}
        metrics_train = get_threshold_metrics(y_train, y_pred_train)
        metrics_test = get_threshold_metrics(y_test, y_pred_test)

        # Populate the metrics dictionary.

        # Get metrics for the classifier.
        overfit = metrics_train['auroc'] - metrics_test['auroc']
        # Understand how the parameter grid worked... any params at the edge?
        if cv_pipeline.best_params_['select__k'] == min(param_grid['select__k']):
            n_comp_status = 'min'
        elif cv_pipeline.best_params_['select__k'] == max(param_grid['select__k']):
            n_comp_status = 'max'
        else:
            n_comp_status = 'OK'
        if cv_pipeline.best_params_['classify__alpha'] == min(param_grid['classify__alpha']):
            alpha_status = 'min'
        elif cv_pipeline.best_params_['classify__alpha'] == max(param_grid['classify__alpha']):
            alpha_status = 'max'
        else:
            alpha_status = 'OK'
        metrics = {'num_positive': num_positives,
                   'train_auroc': metrics_train['auroc'], 
                   'test_auroc': metrics_test['auroc'],
                   'n_components': cv_pipeline.best_params_['select__k'], 
                   'alpha': cv_pipeline.best_params_['classify__alpha'],
                   'overfit': overfit,
                   'n_comp_status': n_comp_status,
                   'alpha_status': alpha_status
                  }
        # Add the metrics to the dictionary.
        metrics_dict[gene] = metrics
    # Change the metrics dict into a formatted pandas dataframe.
    metrics_df = pd.DataFrame(metrics_dict)
    metrics_df = metrics_df.T
    metrics_df.sort_values(by='num_positive', ascending=True, inplace=True)
    metrics_df = metrics_df[['num_positive', 'n_components','n_comp_status', 'alpha', 'alpha_status','train_auroc', 'test_auroc', 'overfit']]
    
    return(metrics_df)
Example #9
class Classifier(Base):
    '''Classifier base class.
    '''
    def __init__(self, clf, scale=False, n_jobs=1):
        super().__init__()

        # A classifier is built using a `Pipeline` for convenience of chaining
        # multiple preprocessing steps before the classifier
        pipeline = []
        # Centre data by scaling to zero mean and unit variance
        if scale is True:
            pipeline.append(('standard_scaler', StandardScaler()))
        # Add the `clf` estimator and build the `Pipeline`
        pipeline.append(('estimator', clf))
        self.clf = Pipeline(pipeline)
        self.n_jobs = n_jobs

    def __name__(self):
        return self.__class__.__name__

    def grid_search(self,
                    X,
                    y,
                    parameters,
                    scoring=None,
                    cv=5,
                    refit=True,
                    verbose=False):
        '''Perform an exhaustive search over hyperparameter combinations.

        # Arguments
            X: np.ndarray, features
            y: np.ndarray, labels
            parameters: dict, hyperparameter ranges
            scoring: str, callable, or dict of scorers, e.g. {'acc': 'accuracy', ...}
            cv: int, number of cross-validation folds
            refit: bool, fit an estimator with the best parameters if True
            verbose: bool, if not True, UserWarnings raised during the search are suppressed
        '''
        self.grid_search_parameters = {
            'estimator__estimator__' + k: v
            for k, v in parameters.items()
        }
        clf = self.clf

        if verbose is not True:
            warnings.filterwarnings("ignore", category=UserWarning)

        self.clf_grid_search = GridSearchCV(clf,
                                            self.grid_search_parameters,
                                            cv=cv,
                                            scoring=scoring,
                                            refit=refit,
                                            n_jobs=self.n_jobs)

        self.clf_grid_search.fit(X, y)
        print('\n`clf.best_estimator_`:\n',
              self.clf_grid_search.best_estimator_,
              '\n',
              sep='')

    def fit(self, X, y):
        '''Fit the estimator.

        # Arguments
            X: np.ndarray, features
            y: np.ndarray, labels
        '''
        # Fit classifier using the best parameters from GridSearchCV
        try:
            getattr(self.clf_grid_search, 'best_estimator_')
            fit_using = "clf_grid_search"
        except AttributeError:
            # Fit classifier from __init__
            fit_using = "clf"
            self.clf.fit(X, y)
        finally:
            print(f'\nFit using `{fit_using}`')

    def get_clf(self):
        '''Get the best estimator.

        If a grid search has been performed, then the `best_estimator_` is
        returned, else the estimator used to initialise the object is returned.

        # Returns
            clf: sklearn estimator
        '''
        try:
            return self.clf_grid_search.best_estimator_
        except AttributeError:
            return self.clf

    def predict(self, X):
        '''Predict the classes of samples using features.

        # Arguments
            X: np.ndarray, features

        # Returns
            predictions: np.ndarray, class predictions
        '''
        self.predictions = self.get_clf().predict(X)
        return self.predictions

    def predict_proba(self, X):
        '''Predict the class-membership probabilities of samples.

        # Arguments
            X: np.ndarray, features

        # Returns
            probabilities: np.ndarray, class probabilities
        '''
        self.probabilities = self.get_clf().predict_proba(X)
        return self.probabilities

    def decision_function(self, X):
        '''Decision function.

        # Arguments
            X: np.ndarray, features

        # Returns
            decisions: np.ndarray, distances of samples to the decision
                boundary
        '''
        try:
            self.decisions = self.get_clf().decision_function(X)
            return self.decisions
        except AttributeError as err:
            raise AttributeError(
                f'decision_function is not implemented for {self.__name__()}')\
                from None

    def score(self, X, y):
        '''Mean accuracy score on test data.

        # Arguments
            X: np.ndarray, test features
            y: np.ndarray, test labels
        '''
        return self.get_clf().score(X, y)

    def accuracy(self, y_true):
        '''Accuracy score.

        # Arguments
            y_true: np.ndarray, true labels

        # Returns
            accuracy_score: float
        '''
        self.accuracy_score = accuracy_score(y_true, self.predictions)
        return self.accuracy_score
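Because grid_search prefixes keys with 'estimator__estimator__', the wrapped clf is expected to expose its model through an estimator parameter (e.g. OneVsRestClassifier). A hedged usage sketch under that assumption, also assuming Base needs no constructor arguments and the class's module imports Pipeline, StandardScaler, GridSearchCV and warnings:

from sklearn.datasets import load_digits
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

digits = load_digits()

# The wrapper gives SVC's parameters the path estimator__C, so the grid key
# 'C' below expands to estimator__estimator__C on the pipeline.
clf = Classifier(OneVsRestClassifier(SVC()), scale=True, n_jobs=-1)
clf.grid_search(digits.data, digits.target,
                parameters={'C': [0.1, 1, 10]},
                scoring='accuracy', cv=3)
predictions = clf.predict(digits.data[:5])
print(predictions, clf.score(digits.data, digits.target))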
def evaluate_classifier(X, y, list_of_queries, set_k_range, k_function,
                        alpha_range, l1_ratio):
    ''' Run a classifier setup on a set of queries.
    
        Loop through each query; train and test the classifier using the
        hyperparameters passed in as arguments; populate the metrics dictionary
        with some metrics of which parameters were selected and how well
        the classifier did for that query.
    '''

    # A dictionary to hold the performance metrics.
    metrics_dict = {}

    # Loop through each query; train and test the classifier; populate the metrics dictionary.
    for query in list_of_queries:
        num_samples = query[2]['total']
        num_positives = query[2]['positive']

        # Subset by gene.
        y_query = y[query[0]]
        # Subset by diseases.
        disease_cols = [
            col for col in covariates.columns if col.endswith(tuple(query[1]))
        ]
        has_disease = covariates[disease_cols].max(axis=1) > 0
        covariates_query = covariates[has_disease]
        X_query = X[X.index.isin(covariates_query.index)]
        y_query = y_query[y_query.index.isin(covariates_query.index)]

        # Train/test split
        test_size = 0.2
        X_train, X_test, y_train, y_test = train_test_split(
            X_query,
            y_query,
            stratify=y_query,
            test_size=test_size,
            random_state=RANDOMSEED)
        # PCA.
        scaler = StandardScaler()
        if query[2]['total'] * (1 - test_size) * (1 - (1 / 3)) > 350:
            n_comp = 350
        else:
            n_comp = int(query[2]['total'] * (1 - test_size) - 1)
        pca = PCA(n_components=n_comp, random_state=RANDOMSEED)
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        pca.fit(X_train_scaled)
        X_train = pca.transform(X_train_scaled)
        X_test_scaled = scaler.transform(X_test)
        X_test = pca.transform(X_test_scaled)

        if set_k_range:
            k_range = set_k_range
        else:
            k_range = k_function(
                num_samples=num_samples,
                num_positives=num_positives,
            )
        # Parameter Sweep for Hyperparameters
        param_grid = {
            'select__k': k_range,
            'classify__loss': ['log'],
            'classify__penalty': ['elasticnet'],
            'classify__alpha': alpha_range,
            'classify__l1_ratio': l1_ratio,
        }
        pipeline = Pipeline(steps=[('select', SelectKBest(variance_scorer)),
                                   ('classify',
                                    SGDClassifier(random_state=RANDOMSEED,
                                                  class_weight='balanced'))])
        cv_pipeline = GridSearchCV(estimator=pipeline,
                                   param_grid=param_grid,
                                   n_jobs=1,
                                   scoring='roc_auc')
        cv_pipeline.fit(X=X_train, y=y_train)
        y_pred_train = cv_pipeline.decision_function(X_train)
        y_pred_test = cv_pipeline.decision_function(X_test)

        # Get ROC info.
        def get_threshold_metrics(y_true, y_pred):
            roc_columns = ['fpr', 'tpr', 'threshold']
            roc_items = zip(roc_columns, roc_curve(y_true, y_pred))
            roc_df = pd.DataFrame(dict(roc_items))  # DataFrame.from_items was removed in newer pandas
            auroc = roc_auc_score(y_true, y_pred)
            return {'auroc': auroc, 'roc_df': roc_df}

        metrics_train = get_threshold_metrics(y_train, y_pred_train)
        metrics_test = get_threshold_metrics(y_test, y_pred_test)

        # Populate the metrics dictionary.
        # Get metrics for the classifier.
        overfit = metrics_train['auroc'] - metrics_test['auroc']
        # Understand how the parameter grid worked... any params at the edge?
        if cv_pipeline.best_params_['select__k'] == min(
                param_grid['select__k']):
            n_comp_status = 'min'
        elif cv_pipeline.best_params_['select__k'] == max(
                param_grid['select__k']):
            n_comp_status = 'max'
        else:
            n_comp_status = 'OK'
        if cv_pipeline.best_params_['classify__alpha'] == min(
                param_grid['classify__alpha']):
            alpha_status = 'min'
        elif cv_pipeline.best_params_['classify__alpha'] == max(
                param_grid['classify__alpha']):
            alpha_status = 'max'
        else:
            alpha_status = 'OK'
        metrics = {
            'num_samples': num_samples,
            'num_positive': num_positives,
            'balance': num_positives / num_samples,
            'train_auroc': metrics_train['auroc'],
            'test_auroc': metrics_test['auroc'],
            'n_components': cv_pipeline.best_params_['select__k'],
            'alpha': cv_pipeline.best_params_['classify__alpha'],
            'overfit': overfit,
            'n_comp_status': n_comp_status,
            'alpha_status': alpha_status
        }
        # Add the metrics to the dictionary.
        metrics_dict[query[0] + str(query[2]['total'])] = metrics
    # Change the metrics dict into a formatted pandas dataframe.
    metrics_df = pd.DataFrame(metrics_dict)
    metrics_df = metrics_df.T
    metrics_df.sort_values(by='num_positive', ascending=True, inplace=True)
    metrics_df = metrics_df[[
        'num_samples', 'num_positive', 'balance', 'n_components',
        'n_comp_status', 'alpha', 'alpha_status', 'train_auroc', 'test_auroc',
        'overfit'
    ]]

    return (metrics_df)
Example #11
cv_pipeline = GridSearchCV(estimator=estimator,
                           param_grid=clf_parameters,
                           n_jobs=-1,
                           cv=n_folds,
                           scoring='average_precision',
                           return_train_score=True)

shuffle_cv_pipeline = GridSearchCV(estimator=estimator,
                                   param_grid=clf_parameters,
                                   n_jobs=-1,
                                   cv=n_folds,
                                   scoring='average_precision',
                                   return_train_score=True)

# In[10]:

# Fit Regular Pipeline
cv_pipeline.fit(X=x_train_df, y=y_train_df.status.tolist())

# In[11]:

# Fit Shuffled Data Pipeline
x_train_shuffled_df = x_train_df.apply(shuffle_columns,
                                       axis=0,
                                       result_type="broadcast")
shuffle_cv_pipeline.fit(X=x_train_shuffled_df, y=y_train_df.status.tolist())
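
# `shuffle_columns` is not defined in this excerpt. A minimal sketch of what it
# presumably does (assumption): independently permute each column so the
# shuffled pipeline estimates a permutation-baseline score, e.g.
#
#     def shuffle_columns(column):
#         """Return the column's values in a random order."""
#         return np.random.permutation(column.values)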

# ## Visualize Cross Validation Results

# In[12]:

cv_heatmap_file = pathlib.Path("figures", "cross_validation",
                               "cv_example_heatmap.png")