# Imports assumed by this example (the original excerpt omits them). Whether
# GridSearchCV comes from scikit-learn or from dask_ml's drop-in replacement is
# not shown in the excerpt; scikit-learn's import is assumed here.
from dask.distributed import Client, LocalCluster
from sklearn.datasets import load_digits
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split as tts
from sklearn.svm import SVC


def main():
    param_space = {
        'C': [1e-4, 1, 1e4],
        'gamma': [1e-3, 1, 1e3],
        'class_weight': [None, 'balanced']
    }

    model = SVC(kernel='rbf')

    digits = load_digits()

    X_train, X_test, y_train, y_test = tts(digits.data,
                                           digits.target,
                                           test_size=0.3)

    print("Starting local cluster")
    cluster = LocalCluster()
    client = Client(cluster)
    print(client)

    print("Start searching")
    search = GridSearchCV(model, param_space, cv=3)
    search.fit(X_train, y_train)

    print("Prepare report")
    print(
        classification_report(y_true=y_test,
                              y_pred=search.best_estimator_.predict(X_test)))
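# If GridSearchCV above is scikit-learn's (rather than dask_ml's drop-in), the
# LocalCluster is only used when the fit runs inside Dask's joblib backend. A
# minimal, self-contained sketch of that pattern (an assumption, not part of the
# original example):
def dask_backend_search_demo():
    import joblib
    from dask.distributed import Client, LocalCluster
    from sklearn.datasets import load_digits
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    client = Client(LocalCluster())
    data = load_digits()
    search = GridSearchCV(SVC(kernel='rbf'), {'C': [1e-4, 1, 1e4]}, cv=3, n_jobs=-1)
    # The 'dask' joblib backend ships the individual CV fits to the cluster workers.
    with joblib.parallel_backend('dask'):
        search.fit(data.data, data.target)
    print(search.best_params_)
    client.close()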
Example #2
def getprobabilities(X, y, p_grid, cv, ac_sens):
    """getprobabilities(X,y,p_grid, cv, ac_sens)
    
    - X and y: Inputs and outputs
    - p_grid: grid of parameters to search over
    - cv: Number of cross-validation folds (this is different from the outer number of x-val folds)
    
    Gets the probability of picking each of the options provided by p_grid given the data in X and y.
    The algorithm is as follows:
     - Find the sensitivity of the SSE for each parameter-values
     - Find the SSE of each parameter-values
     - Find the probability of selecting those parameter-values
    """
    kern = GPy.kern.RBF(2.0, lengthscale=25.0, variance=1.0)
    errorlimit = ac_sens * 4.0

    ####find sensitivity of the SSE (for each param combo)
    # this call gets the sensitivities, not the scores:
    #TODO This probably should be done locally as it's quick.
    clf = GridSearchCV(estimator=DPCloaking(sensitivity=ac_sens,
                                            inducing=4,
                                            getxvalfoldsensitivities=True,
                                            kern=kern,
                                            errorlimit=errorlimit),
                       param_grid=p_grid,
                       cv=cv)
    clf.fit(X, y)

    nparamcombos = len(clf.cv_results_['mean_test_score'])
    temp_sens = np.zeros([clf.cv, nparamcombos])
    for k in range(clf.cv):
        temp_sens[k, :] = clf.cv_results_['split%d_test_score' % k]
    #sensitivity of the sum squared error:
    print(np.sort(temp_sens, axis=0))
    sse_sens = ac_sens**2 + 2 * ac_sens * errorlimit + ac_sens**2 * np.max(
        np.sum(np.sort(temp_sens, axis=0)[0:clf.cv - 1, :], 0))

    ####find the SSE (for each param combo)
    clf = GridSearchCV(estimator=DPCloaking(sensitivity=ac_sens,
                                            inducing=4,
                                            getxvalfoldsensitivities=False,
                                            kern=kern,
                                            errorlimit=errorlimit),
                       param_grid=p_grid,
                       cv=cv)
    clf.fit(X, y)

    nparamcombos = len(clf.cv_results_['mean_test_score'])
    temp_scores = np.zeros([clf.cv, nparamcombos])
    for k in range(clf.cv):
        temp_scores[k, :] = clf.cv_results_['split%d_test_score' % k]
    scores = np.sum(temp_scores, 0)

    ####compute the probability of selecting that param combo using the exponential mechanism
    selection_epsilon = 1
    param_probabilities = np.exp(selection_epsilon * scores / (2 * sse_sens))
    param_probabilities = param_probabilities / np.sum(param_probabilities)

    return param_probabilities
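# The selection step above is the exponential mechanism. A minimal, self-contained
# sketch of that mechanism (NumPy only; the utility scores and sensitivity below
# are made-up illustrative values, not taken from the original code):
import numpy as np


def exponential_mechanism_probabilities(scores, sensitivity, epsilon=1.0):
    # P(option i) is proportional to exp(epsilon * score_i / (2 * sensitivity)).
    logits = epsilon * np.asarray(scores, dtype=float) / (2.0 * sensitivity)
    logits -= logits.max()  # subtract the max for numerical stability
    probs = np.exp(logits)
    return probs / probs.sum()


# Three candidate parameter combinations with utilities -4, -2 and -9:
print(exponential_mechanism_probabilities([-4.0, -2.0, -9.0], sensitivity=3.0))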
Example #3
def getscores(X, y, p_grid, cv, ac_sens):
    """
    Compute the negative mean squared error of each of the fold/parameter combos
    """
    kern = GPy.kern.RBF(2.0, lengthscale=25.0, variance=1.0)

    clf = GridSearchCV(estimator=DPCloaking(sensitivity=ac_sens,
                                            inducing=4,
                                            getxvalfoldsensitivities=False,
                                            kern=kern),
                       scoring='neg_mean_squared_error',
                       param_grid=p_grid,
                       cv=cv)
    clf.fit(X, y)

    nparamcombos = len(clf.cv_results_['mean_test_score'])
    scores = np.zeros([clf.cv, nparamcombos])
    for k in range(clf.cv):
        scores[k, :] = clf.cv_results_['split%d_test_score' % k]
    return scores
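# The per-fold extraction above relies on scikit-learn's 'split<k>_test_score'
# keys in cv_results_. A small standalone illustration (iris and a toy SVC grid
# are stand-ins, not part of the original code):
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

X_demo, y_demo = load_iris(return_X_y=True)
n_folds = 3
demo_search = GridSearchCV(SVC(), {'C': [0.1, 1.0, 10.0]}, cv=n_folds)
demo_search.fit(X_demo, y_demo)
# One row per fold, one column per parameter combination.
fold_scores = np.vstack([demo_search.cv_results_['split%d_test_score' % k]
                         for k in range(n_folds)])
print(fold_scores.shape)  # (3, 3)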
Example #4
    def grid_search(self,
                    X,
                    y,
                    parameters,
                    scoring=None,
                    cv=5,
                    refit=True,
                    verbose=False):
        '''Perform an exhaustive search over hyperparameter combinations.

        # Arguments
            X: np.ndarray, features
            y: np.ndarray, labels
            parameters: dict, hyperparameter ranges
            scoring: dict, scoring functions, e.g. {'acc': make_scorer(accuracy_score), ...}
            cv: int, number of cross-validation folds
            refit: bool, fit an estimator with the best parameters if True
            verbose: bool, if False, suppress UserWarning messages during the search
        '''
        self.grid_search_parameters = {
            'estimator__estimator__' + k: v
            for k, v in parameters.items()
        }
        clf = self.clf

        if verbose is not True:
            warnings.filterwarnings("ignore", category=UserWarning)

        self.clf_grid_search = GridSearchCV(clf,
                                            self.grid_search_parameters,
                                            cv=cv,
                                            scoring=scoring,
                                            refit=refit,
                                            n_jobs=self.n_jobs)

        self.clf_grid_search.fit(X, y)
        print('\n`clf.best_estimator_`:\n',
              self.clf_grid_search.best_estimator_,
              '\n',
              sep='')
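# The 'estimator__estimator__' prefix above assumes a pipeline step named
# 'estimator' that itself wraps another estimator. A minimal sketch of that
# naming scheme (OneVsRestClassifier and the iris data are only illustrations,
# not taken from the original code):
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline

X_demo, y_demo = load_iris(return_X_y=True)
pipe = Pipeline([('estimator', OneVsRestClassifier(LogisticRegression(max_iter=200)))])
# 'estimator' (pipeline step) + 'estimator' (OneVsRestClassifier parameter) + 'C'
params = {'estimator__estimator__C': [0.1, 1.0, 10.0]}
GridSearchCV(pipe, params, cv=3).fit(X_demo, y_demo)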
Example #5
# Imports assumed by this example (the original excerpt omits them):
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
# from sklearn.ensemble import RandomForestClassifier  # only for the commented-out alternative below


def main():

    param_space = {
        "C": [1e-4, 1, 1e4],
        "gamma": [1e-3, 1, 1e3],
        "class_weight": [None, "balanced"],
        "kernel": ["linear", "poly", "rbf", "sigmoid"]
    }
    model = SVC()

    #     param_space = {"n_estimators":[100, 200, 300, 400, 500],
    #                                    "criterion":["gini", "entropy"],
    #                                    "max_features":["auto", "sqrt", "log2"],
    #                                    "max_depth":[2, 3, 4, 5, 6, 7, 8]}
    #     model = RandomForestClassifier()

    digits = load_digits()

    classifier = GridSearchCV(model, param_space, n_jobs=-1, cv=5)

    classifier.fit(digits.data, digits.target)

    print("Grid Scores:")
    means = classifier.cv_results_["mean_test_score"]
    standard_deviations = classifier.cv_results_["std_test_score"]
    for mean, standard_deviation, parameter in zip(
            means, standard_deviations, classifier.cv_results_["params"]):
        print("%0.3f (+/-%0.03f) for %r" %
              (mean, standard_deviation * 2, parameter))
    print()

    print("Best Score: %0.3f" % (classifier.best_score_))
    print()
    print("Best Parameters:")
    print(classifier.best_params_)
    print()
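    # Optional alternative (not part of the original example): the same report
    # as a pandas DataFrame, assuming pandas is available.
    import pandas as pd
    results_df = pd.DataFrame(classifier.cv_results_)
    print(results_df[['mean_test_score', 'std_test_score', 'params']]
          .sort_values('mean_test_score', ascending=False)
          .to_string(index=False))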
def evaluate_classifier(X,
                        y,
                        list_of_queries,
                        set_k_range, k_function,
                        alpha_range, 
                        l1_ratio):
    
    ''' Run a classifier setup on a set of queries.

        For each query: train and test the classifier using the
        hyperparameter ranges passed in as arguments, then populate the
        metrics dictionary with which parameters were selected and how
        well the classifier performed for that query.
    '''
    
    # A dictionary to hold the performance metrics.
    metrics_dict = {}
    
    # Loop through each query; train and test the classifier; populate the metrics dictionary.
    for query in list_of_queries:
        num_samples = query[2]['total']
        num_positives = query[2]['positive']
        
        # Subset by gene.
        y_query = y[query[0]]
        # Subset by diseases.
        disease_cols = [col for col in covariates.columns if col.endswith(tuple(query[1]))]
        has_disease = covariates[disease_cols].max(axis=1) > 0
        covariates_query = covariates[has_disease]
        X_query = X[X.index.isin(covariates_query.index)]
        y_query = y_query[y_query.index.isin(covariates_query.index)]
                
        # Test Train split
        test_size = 0.2
        X_train, X_test, y_train, y_test = train_test_split(X_query, y_query, stratify=y_query, test_size=test_size, random_state=RANDOMSEED)
        # PCA.
        scaler = StandardScaler()
        if query[2]['total']*(1-test_size)*(1-(1/3)) > 350:
            n_comp = 350
        else:
            n_comp = int(query[2]['total']*(1-test_size) - 1)
        pca = PCA(n_components = n_comp, random_state = RANDOMSEED)
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        pca.fit(X_train_scaled)
        X_train = pca.transform(X_train_scaled)
        X_test_scaled = scaler.transform(X_test)
        X_test = pca.transform(X_test_scaled)
        
        if set_k_range:
            k_range = set_k_range
        else:
            k_range = k_function(num_samples=num_samples,
                                 num_positives=num_positives,
                                 )     
        # Parameter Sweep for Hyperparameters
        param_grid = {
            'select__k': k_range,
            'classify__loss': ['log'],
            'classify__penalty': ['elasticnet'],
            'classify__alpha': alpha_range,
            'classify__l1_ratio': l1_ratio,
        }
        pipeline = Pipeline(steps=[
            ('select', SelectKBest(variance_scorer)),
            ('classify', SGDClassifier(random_state=RANDOMSEED, class_weight='balanced'))
        ])
        cv_pipeline = GridSearchCV(estimator=pipeline, 
                                   param_grid=param_grid,
                                   n_jobs=1, 
                                   scoring='roc_auc')
        cv_pipeline.fit(X=X_train, y=y_train)
        y_pred_train = cv_pipeline.decision_function(X_train)
        y_pred_test = cv_pipeline.decision_function(X_test)
        # Get ROC info.
        def get_threshold_metrics(y_true, y_pred):
            roc_columns = ['fpr', 'tpr', 'threshold']
            roc_items = zip(roc_columns, roc_curve(y_true, y_pred))
            # pd.DataFrame.from_items was removed in pandas 1.0; build from a dict instead
            roc_df = pd.DataFrame(dict(roc_items))
            auroc = roc_auc_score(y_true, y_pred)
            return {'auroc': auroc, 'roc_df': roc_df}
        metrics_train = get_threshold_metrics(y_train, y_pred_train)
        metrics_test = get_threshold_metrics(y_test, y_pred_test)

        # Populate the metrics dictionary.
        # Get metrics for the classifier.
        overfit = metrics_train['auroc'] - metrics_test['auroc']
        # Understand how the parameter grid worked... any params at the edge?
        if cv_pipeline.best_params_['select__k'] == min(param_grid['select__k']):
            n_comp_status = 'min'
        elif cv_pipeline.best_params_['select__k'] == max(param_grid['select__k']):
            n_comp_status = 'max'
        else:
            n_comp_status = 'OK'
        if cv_pipeline.best_params_['classify__alpha'] == min(param_grid['classify__alpha']):
            alpha_status = 'min'
        elif cv_pipeline.best_params_['classify__alpha'] == max(param_grid['classify__alpha']):
            alpha_status = 'max'
        else:
            alpha_status = 'OK'
        metrics = {'num_samples': num_samples,
                   'num_positive': num_positives,
                   'balance': num_positives/num_samples,
                   'train_auroc': metrics_train['auroc'], 
                   'test_auroc': metrics_test['auroc'],
                   'n_components': cv_pipeline.best_params_['select__k'], 
                   'alpha': cv_pipeline.best_params_['classify__alpha'],
                   'overfit': overfit,
                   'n_comp_status': n_comp_status,
                   'alpha_status': alpha_status
                  }
        # Add the metrics to the dictionary.
        metrics_dict[query[0]+str(query[2]['total'])] = metrics
    # Change the metrics dict into a formatted pandas dataframe.
    metrics_df = pd.DataFrame(metrics_dict)
    metrics_df = metrics_df.T
    metrics_df.sort_values(by='num_positive', ascending=True, inplace=True)
    metrics_df = metrics_df[['num_samples', 'num_positive', 'balance', 'n_components','n_comp_status', 'alpha', 'alpha_status','train_auroc', 'test_auroc', 'overfit']]
    
    return(metrics_df)
Example #7
                                                    stratify=strat)

clf_parameters = {
    'classify__loss': ['log'],
    'classify__penalty': ['elasticnet'],
    'classify__alpha': alphas,
    'classify__l1_ratio': l1_ratios
}

estimator = Pipeline(steps=[(
    'classify',
    SGDClassifier(random_state=0, class_weight='balanced', loss='log'))])

cv_pipeline = GridSearchCV(estimator=estimator,
                           param_grid=clf_parameters,
                           n_jobs=-1,
                           cv=folds,
                           scoring='roc_auc')
cv_pipeline.fit(X=x_train, y=y_train)

cv_results = pd.concat([
    pd.DataFrame(cv_pipeline.cv_results_).drop('params', axis=1),
    pd.DataFrame.from_records(cv_pipeline.cv_results_['params'])
],
                       axis=1)

# Cross-validated performance heatmap
cv_score_mat = pd.pivot_table(cv_results,
                              values='mean_test_score',
                              index='classify__l1_ratio',
                              columns='classify__alpha')
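# The excerpt stops at the pivot table; a plotting step along these lines usually
# follows (seaborn/matplotlib are assumptions, not shown in the original):
import matplotlib.pyplot as plt
import seaborn as sns

ax = sns.heatmap(cv_score_mat, annot=True, fmt='.2f')
ax.set_xlabel('Regularization strength (alpha)')
ax.set_ylabel('Elastic net mixing parameter (l1_ratio)')
plt.show()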
Example #8
    'expressions':
    Pipeline([('features', FeatureUnion([('expressions', expression_features)
                                         ])), ('classify', classifier)]),
    'covariates':
    Pipeline([('features', FeatureUnion([('covariates', covariate_features)])),
              ('classify', classifier)])
}

# Construct cross-validated grid searches
cv_pipelines = dict()
for model, pipeline in pipeline_definitions.items():
    # random_state dropped: it has no effect when shuffle=False (and newer scikit-learn rejects it)
    cv = StratifiedKFold(n_splits=3)
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[model],
        cv=cv,
        n_jobs=1,
        scoring='roc_auc',
    )
    cv_pipelines[model] = grid_search

# In[13]:

# Fit the models
for model, pipeline in cv_pipelines.items():
    print('Fitting CV for model: {0}'.format(model))
    start_time = time.perf_counter()
    pipeline.fit(X=X_train, y=y_train)
    end_time = time.perf_counter()
    elapsed = datetime.timedelta(seconds=end_time - start_time)
    print('\truntime: {}'.format(elapsed))
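# A short follow-up (not in the original excerpt): compare the fitted searches.
for model, fitted_search in cv_pipelines.items():
    print('{0}: best CV AUROC = {1:.3f}'.format(model, fitted_search.best_score_))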
def evaluate_classifier(X_train, X_test, y, y_train_allgenes, y_test_allgenes,
                        list_of_genes, set_k_range, k_function, alpha_range,
                        l1_ratio):
    ''' Run a classifier setup on a set of genes.

        For each gene: train and test the classifier using the
        hyperparameter ranges passed in as arguments, then populate the
        metrics dictionary with which parameters were selected and how
        well the classifier performed for that gene.
    '''

    # A dictionary to hold the performance metrics.
    metrics_dict = {}

    # Loop through each gene; train and test the classifier; populate the metrics dictionary.
    for gene in list_of_genes:

        # Train and test the classifier.

        y_gene = y[gene]
        y_train = y_train_allgenes[gene]
        y_test = y_test_allgenes[gene]
        num_positives = int(y_gene.value_counts(True)[1] * len(y_gene))
        if set_k_range:
            k_range = set_k_range
        else:
            k_range = k_function(num_positives)
        # Parameter Sweep for Hyperparameters
        param_grid = {
            'select__k': k_range,
            'classify__loss': ['log'],
            'classify__penalty': ['elasticnet'],
            'classify__alpha': alpha_range,
            'classify__l1_ratio': l1_ratio,
        }
        pipeline = Pipeline(steps=[('select', SelectKBest(variance_scorer)),
                                   ('classify',
                                    SGDClassifier(random_state=RANDOMSEED,
                                                  class_weight='balanced'))])
        cv_pipeline = GridSearchCV(estimator=pipeline,
                                   param_grid=param_grid,
                                   n_jobs=1,
                                   scoring='roc_auc')
        cv_pipeline.fit(X=X_train, y=y_train)
        y_pred_train = cv_pipeline.decision_function(X_train)
        y_pred_test = cv_pipeline.decision_function(X_test)

        # Get ROC info.
        def get_threshold_metrics(y_true, y_pred):
            roc_columns = ['fpr', 'tpr', 'threshold']
            roc_items = zip(roc_columns, roc_curve(y_true, y_pred))
            # pd.DataFrame.from_items was removed in pandas 1.0; build from a dict instead
            roc_df = pd.DataFrame(dict(roc_items))
            auroc = roc_auc_score(y_true, y_pred)
            return {'auroc': auroc, 'roc_df': roc_df}

        metrics_train = get_threshold_metrics(y_train, y_pred_train)
        metrics_test = get_threshold_metrics(y_test, y_pred_test)

        # Populate the metrics dictionary.

        # Get metrics for the classifier.
        overfit = metrics_train['auroc'] - metrics_test['auroc']
        # Understand how the parameter grid worked... any params at the edge?
        if cv_pipeline.best_params_['select__k'] == min(
                param_grid['select__k']):
            n_comp_status = 'min'
        elif cv_pipeline.best_params_['select__k'] == max(
                param_grid['select__k']):
            n_comp_status = 'max'
        else:
            n_comp_status = 'OK'
        if cv_pipeline.best_params_['classify__alpha'] == min(
                param_grid['classify__alpha']):
            alpha_status = 'min'
        elif cv_pipeline.best_params_['classify__alpha'] == max(
                param_grid['classify__alpha']):
            alpha_status = 'max'
        else:
            alpha_status = 'OK'
        metrics = {
            'num_positive': num_positives,
            'train_auroc': metrics_train['auroc'],
            'test_auroc': metrics_test['auroc'],
            'n_components': cv_pipeline.best_params_['select__k'],
            'alpha': cv_pipeline.best_params_['classify__alpha'],
            'overfit': overfit,
            'n_comp_status': n_comp_status,
            'alpha_status': alpha_status
        }
        # Add the metrics to the dictionary.
        metrics_dict[gene] = metrics
    # Change the metrics dict into a formatted pandas dataframe.
    metrics_df = pd.DataFrame(metrics_dict)
    metrics_df = metrics_df.T
    metrics_df.sort_values(by='num_positive', ascending=True, inplace=True)
    metrics_df = metrics_df[[
        'num_positive', 'n_components', 'n_comp_status', 'alpha',
        'alpha_status', 'train_auroc', 'test_auroc', 'overfit'
    ]]

    return (metrics_df)
def evaluate_classifier(X_train, X_test,
                        y, y_train_allgenes, y_test_allgenes,
                        list_of_genes,
                        set_k_range, k_function,
                        alpha_range, 
                        l1_ratio):
    
    ''' Run a classifier setup on a set of genes.

        For each gene: train and test the classifier using the
        hyperparameter ranges passed in as arguments, then populate the
        metrics dictionary with which parameters were selected and how
        well the classifier performed for that gene.
    '''
    
    # A dictionary to hold the performance metrics.
    metrics_dict = {}
    
    # Loop through each gene; train and test the classifier; populate the metrics dictionary.
    for gene in list_of_genes:
        
        # Train and test the classifier.
        
        y_gene = y[gene]
        y_train = y_train_allgenes[gene]
        y_test = y_test_allgenes[gene]
        num_positives = int(y_gene.value_counts(True)[1]*len(y_gene))
        if set_k_range:
            k_range = set_k_range
        else:
            k_range = k_function(num_positives)     
        # Parameter Sweep for Hyperparameters
        param_grid = {
            'select__k': k_range,
            'classify__loss': ['log'],
            'classify__penalty': ['elasticnet'],
            'classify__alpha': alpha_range,
            'classify__l1_ratio': l1_ratio,
        }
        pipeline = Pipeline(steps=[
            ('select', SelectKBest(variance_scorer)),
            ('classify', SGDClassifier(random_state=RANDOMSEED, class_weight='balanced'))
        ])
        cv_pipeline = GridSearchCV(estimator=pipeline, 
                                   param_grid=param_grid,
                                   n_jobs=1, 
                                   scoring='roc_auc')
        cv_pipeline.fit(X=X_train, y=y_train)
        y_pred_train = cv_pipeline.decision_function(X_train)
        y_pred_test = cv_pipeline.decision_function(X_test)
        # Get ROC info.
        def get_threshold_metrics(y_true, y_pred):
            roc_columns = ['fpr', 'tpr', 'threshold']
            roc_items = zip(roc_columns, roc_curve(y_true, y_pred))
            # pd.DataFrame.from_items was removed in pandas 1.0; build from a dict instead
            roc_df = pd.DataFrame(dict(roc_items))
            auroc = roc_auc_score(y_true, y_pred)
            return {'auroc': auroc, 'roc_df': roc_df}
        metrics_train = get_threshold_metrics(y_train, y_pred_train)
        metrics_test = get_threshold_metrics(y_test, y_pred_test)

        # Populate the metrics dictionary.

        # Get metrics for the classifier.
        overfit = metrics_train['auroc'] - metrics_test['auroc']
        # Understand how the parameter grid worked... any params at the edge?
        if cv_pipeline.best_params_['select__k'] == min(param_grid['select__k']):
            n_comp_status = 'min'
        elif cv_pipeline.best_params_['select__k'] == max(param_grid['select__k']):
            n_comp_status = 'max'
        else:
            n_comp_status = 'OK'
        if cv_pipeline.best_params_['classify__alpha'] == min(param_grid['classify__alpha']):
            alpha_status = 'min'
        elif cv_pipeline.best_params_['classify__alpha'] == max(param_grid['classify__alpha']):
            alpha_status = 'max'
        else:
            alpha_status = 'OK'
        metrics = {'num_positive': num_positives,
                   'train_auroc': metrics_train['auroc'], 
                   'test_auroc': metrics_test['auroc'],
                   'n_components': cv_pipeline.best_params_['select__k'], 
                   'alpha': cv_pipeline.best_params_['classify__alpha'],
                   'overfit': overfit,
                   'n_comp_status': n_comp_status,
                   'alpha_status': alpha_status
                  }
        # Add the metrics to the dictionary.
        metrics_dict[gene] = metrics
    # Change the metrics dict into a formatted pandas dataframe.
    metrics_df = pd.DataFrame(metrics_dict)
    metrics_df = metrics_df.T
    metrics_df.sort_values(by='num_positive', ascending=True, inplace=True)
    metrics_df = metrics_df[['num_positive', 'n_components','n_comp_status', 'alpha', 'alpha_status','train_auroc', 'test_auroc', 'overfit']]
    
    return(metrics_df)
    if model_choice == 'random forests':
        print("\n\n")
        print("\t\t Special grid search for random forests")
        print("\n\n")
        c = Client(scheduler_address, set_as_default=True)
        grid_search = DaskRandomizedSearchCV(model,
                                             param_grid,
                                             cv=cv_temp,
                                             get=c.get)
    else:
        c = Client(scheduler_address, set_as_default=True)
        grid_search = GridSearchCV(model, param_grid, cv=cv_temp)

    grid_search.fit(X, y)
    time_elapse = time()-t0
    runing_time.append([model_choice, cv_temp, time_elapse])
    print(" running time: ", time_elapse)
    num_graph = len(grid_search.dask_graph_)
    print(" size of graph: ", num_graph)

    
    runing_time_df = pd.DataFrame(data=runing_time, columns=['model', 'cv', 'time'])
    runing_time_df['n_workers'] = n_workers
    runing_time_df['n_graph'] = num_graph
    runing_time_df['sample'] = sample
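# Hedged note: `get=c.get` comes from the older dask-searchcv API. With current
# dask_ml, the search picks up the client registered via `set_as_default=True`
# on its own, so a rough modern equivalent of the branch above (dask_ml assumed,
# variable names reused from the surrounding fragment) is:
from dask_ml.model_selection import GridSearchCV as DaskGridSearchCV

c = Client(scheduler_address, set_as_default=True)
dask_grid_search = DaskGridSearchCV(model, param_grid, cv=cv_temp)
dask_grid_search.fit(X, y)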
Example #12
class Classifier(Base):
    '''Classifier base class.
    '''
    def __init__(self, clf, scale=False, n_jobs=1):
        super().__init__()

        # A classifier is built using a `Pipeline` for convenience of chaining
        # multiple preprocessing steps before the classifier
        pipeline = []
        # Centre data by scaling to zero mean and unit variance
        if scale is True:
            pipeline.append(('standard_scaler', StandardScaler()))
        # Add the `clf` estimator and build the `Pipeline`
        pipeline.append(('estimator', clf))
        self.clf = Pipeline(pipeline)
        self.n_jobs = n_jobs

    def __name__(self):
        return self.__class__.__name__

    def grid_search(self,
                    X,
                    y,
                    parameters,
                    scoring=None,
                    cv=5,
                    refit=True,
                    verbose=False):
        '''Perform an exhaustive search over hyperparameter combinations.

        # Arguments
            X: np.ndarray, features
            y: np.ndarray, labels
            parameters: dict, hyperparameter ranges
            scoring: dict, scoring functions, e.g. {'acc': make_scorer(accuracy_score), ...}
            cv: int, number of cross-validation folds
            refit: bool, fit an estimator with the best parameters if True
            verbose: bool, if False, suppress UserWarning messages during the search
        '''
        self.grid_search_parameters = {
            'estimator__estimator__' + k: v
            for k, v in parameters.items()
        }
        clf = self.clf

        if verbose is not True:
            warnings.filterwarnings("ignore", category=UserWarning)

        self.clf_grid_search = GridSearchCV(clf,
                                            self.grid_search_parameters,
                                            cv=cv,
                                            scoring=scoring,
                                            refit=refit,
                                            n_jobs=self.n_jobs)

        self.clf_grid_search.fit(X, y)
        print('\n`clf.best_estimator_`:\n',
              self.clf_grid_search.best_estimator_,
              '\n',
              sep='')

    def fit(self, X, y):
        '''Fit the estimator.

        # Arguments
            X: np.ndarray, features
            y: np.ndarray, labels
        '''
        # Fit classifier using the best parameters from GridSearchCV
        try:
            getattr(self.clf_grid_search, 'best_estimator_')
            fit_using = "clf_grid_search"
        except AttributeError:
            # Fit classifier from __init__
            fit_using = "clf"
            self.clf.fit(X, y)
        finally:
            print(f'\nFit using `{fit_using}`')

    def get_clf(self):
        '''Get the best estimator.

        If a grid search has been performed, then the `best_estimator_` is
        returned, else the estimator used to initialise the object is returned.

        # Returns
            clf: sklearn estimator
        '''
        try:
            return self.clf_grid_search.best_estimator_
        except AttributeError:
            return self.clf

    def predict(self, X):
        '''Predict the classes of samples using features.

        # Arguments
            X: np.ndarray, features

        # Returns
            predictions: np.ndarray, class predictions
        '''
        self.predictions = self.get_clf().predict(X)
        return self.predictions

    def predict_proba(self, X):
        '''Predict the class-membership probabilities of samples.

        # Arguments
            X: np.ndarray, features

        # Returns
            probabilities: np.ndarray, class probabilities
        '''
        self.probabilities = self.get_clf().predict_proba(X)
        return self.probabilities

    def decision_function(self, X):
        '''Decision function.

        # Arguments
            X: np.ndarray, features

        # Returns
            decisions: np.ndarray, distances of samples to the decision
                boundary
        '''
        try:
            self.decisions = self.get_clf().decision_function(X)
            return self.decisions
        except AttributeError as err:
            raise AttributeError(
                f'decision_function is not implemented for {self.__name__()}')\
                from None

    def score(self, X, y):
        '''Mean accuracy score on test data.

        # Arguments
            X: np.ndarray, test features
            y: np.ndarray, test labels
        '''
        return self.get_clf().score(X, y)

    def accuracy(self, y_true):
        '''Accuracy score.

        # Arguments
            y_true: np.ndarry, true labels

        # Returns
            accuracy_score: float
        '''
        self.accuracy_score = accuracy_score(y_true, self.predictions)
        return self.accuracy_score
Example #13
clf_parameters = {'classify__C': cs, 'classify__penalty': penalties}

estimator = Pipeline(steps=[('classify',
                             LogisticRegression(random_state=123,
                                                class_weight='balanced',
                                                multi_class='ovr',
                                                max_iter=100,
                                                solver='saga'))])

# Custom scorer that optimizes f1 score weighted by class proportion
weighted_f1_scorer = make_scorer(f1_score, average='weighted')

# Cross validation pipeline
cv_pipeline = GridSearchCV(estimator=estimator,
                           param_grid=clf_parameters,
                           n_jobs=-1,
                           cv=5,
                           return_train_score=True,
                           scoring=weighted_f1_scorer)

# ### Fit Model
#
# This takes a couple of minutes to train. For many parameter settings, sklearn will raise convergence warnings, meaning an optimal solution was not found within 100 iterations. Suppress the warnings so they are not printed repeatedly.

# In[8]:

get_ipython().run_cell_magic(
    'time', '',
    'with warnings.catch_warnings():\n    warnings.simplefilter("ignore")\n    cv_pipeline.fit(X=x_df, y=y_df.ras_status)'
)
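# For reference (not an additional step): outside IPython, the timed cell above
# is just the following plain Python.
import warnings

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    cv_pipeline.fit(X=x_df, y=y_df.ras_status)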

# In[9]:
    'expressions':
    Pipeline([('features', FeatureUnion([('expressions', expression_features)
                                         ])), ('classify', classifier)]),
    'covariates':
    Pipeline([('features', FeatureUnion([('covariates', covariate_features)])),
              ('classify', classifier)])
}

# Construct cross-validated grid searches
cv_pipelines = dict()
for model, pipeline in pipeline_definitions.items():
    # random_state dropped: it has no effect when shuffle=False (and newer scikit-learn rejects it)
    cv = StratifiedKFold(n_splits=3)
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[model],
        cv=cv,
        n_jobs=1,
        scoring='roc_auc',
    )
    cv_pipelines[model] = grid_search

# In[13]:

# Fit the models
for model, pipeline in cv_pipelines.items():
    print('Fitting CV for model: {0}'.format(model))
    start_time = time.perf_counter()
    pipeline.fit(X=X_train, y=y_train)
    end_time = time.perf_counter()
    elapsed = datetime.timedelta(seconds=end_time - start_time)
    print('\truntime: {}'.format(elapsed))
def evaluate_classifier(X, y, list_of_queries, set_k_range, k_function,
                        alpha_range, l1_ratio):
    ''' Run a classifier setup on a set of queries.

        For each query: train and test the classifier using the
        hyperparameter ranges passed in as arguments, then populate the
        metrics dictionary with which parameters were selected and how
        well the classifier performed for that query.
    '''

    # A dictionary to hold the performance metrics.
    metrics_dict = {}

    # Loop through each query; train and test the classifier; populate the metrics dictionary.
    for query in list_of_queries:
        num_samples = query[2]['total']
        num_positives = query[2]['positive']

        # Subset by gene.
        y_query = y[query[0]]
        # Subset by diseases.
        disease_cols = [
            col for col in covariates.columns if col.endswith(tuple(query[1]))
        ]
        has_disease = covariates[disease_cols].max(axis=1) > 0
        covariates_query = covariates[has_disease]
        X_query = X[X.index.isin(covariates_query.index)]
        y_query = y_query[y_query.index.isin(covariates_query.index)]

        # Test Train split
        test_size = 0.2
        X_train, X_test, y_train, y_test = train_test_split(
            X_query,
            y_query,
            stratify=y_query,
            test_size=test_size,
            random_state=RANDOMSEED)
        # PCA.
        scaler = StandardScaler()
        if query[2]['total'] * (1 - test_size) * (1 - (1 / 3)) > 350:
            n_comp = 350
        else:
            n_comp = int(query[2]['total'] * (1 - test_size) - 1)
        pca = PCA(n_components=n_comp, random_state=RANDOMSEED)
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        pca.fit(X_train_scaled)
        X_train = pca.transform(X_train_scaled)
        X_test_scaled = scaler.transform(X_test)
        X_test = pca.transform(X_test_scaled)

        if set_k_range:
            k_range = set_k_range
        else:
            k_range = k_function(
                num_samples=num_samples,
                num_positives=num_positives,
            )
        # Parameter Sweep for Hyperparameters
        param_grid = {
            'select__k': k_range,
            'classify__loss': ['log'],
            'classify__penalty': ['elasticnet'],
            'classify__alpha': alpha_range,
            'classify__l1_ratio': l1_ratio,
        }
        pipeline = Pipeline(steps=[('select', SelectKBest(variance_scorer)),
                                   ('classify',
                                    SGDClassifier(random_state=RANDOMSEED,
                                                  class_weight='balanced'))])
        cv_pipeline = GridSearchCV(estimator=pipeline,
                                   param_grid=param_grid,
                                   n_jobs=1,
                                   scoring='roc_auc')
        cv_pipeline.fit(X=X_train, y=y_train)
        y_pred_train = cv_pipeline.decision_function(X_train)
        y_pred_test = cv_pipeline.decision_function(X_test)

        # Get ROC info.
        def get_threshold_metrics(y_true, y_pred):
            roc_columns = ['fpr', 'tpr', 'threshold']
            roc_items = zip(roc_columns, roc_curve(y_true, y_pred))
            # pd.DataFrame.from_items was removed in pandas 1.0; build from a dict instead
            roc_df = pd.DataFrame(dict(roc_items))
            auroc = roc_auc_score(y_true, y_pred)
            return {'auroc': auroc, 'roc_df': roc_df}

        metrics_train = get_threshold_metrics(y_train, y_pred_train)
        metrics_test = get_threshold_metrics(y_test, y_pred_test)

        # Populate the metrics dictionary.
        # Get metrics for the classifier.
        overfit = metrics_train['auroc'] - metrics_test['auroc']
        # Understand how the parameter grid worked... any params at the edge?
        if cv_pipeline.best_params_['select__k'] == min(
                param_grid['select__k']):
            n_comp_status = 'min'
        elif cv_pipeline.best_params_['select__k'] == max(
                param_grid['select__k']):
            n_comp_status = 'max'
        else:
            n_comp_status = 'OK'
        if cv_pipeline.best_params_['classify__alpha'] == min(
                param_grid['classify__alpha']):
            alpha_status = 'min'
        elif cv_pipeline.best_params_['classify__alpha'] == max(
                param_grid['classify__alpha']):
            alpha_status = 'max'
        else:
            alpha_status = 'OK'
        metrics = {
            'num_samples': num_samples,
            'num_positive': num_positives,
            'balance': num_positives / num_samples,
            'train_auroc': metrics_train['auroc'],
            'test_auroc': metrics_test['auroc'],
            'n_components': cv_pipeline.best_params_['select__k'],
            'alpha': cv_pipeline.best_params_['classify__alpha'],
            'overfit': overfit,
            'n_comp_status': n_comp_status,
            'alpha_status': alpha_status
        }
        # Add the metrics to the dictionary.
        metrics_dict[query[0] + str(query[2]['total'])] = metrics
    # Change the metrics dict into a formatted pandas dataframe.
    metrics_df = pd.DataFrame(metrics_dict)
    metrics_df = metrics_df.T
    metrics_df.sort_values(by='num_positive', ascending=True, inplace=True)
    metrics_df = metrics_df[[
        'num_samples', 'num_positive', 'balance', 'n_components',
        'n_comp_status', 'alpha', 'alpha_status', 'train_auroc', 'test_auroc',
        'overfit'
    ]]

    return (metrics_df)
Example #16
clf_parameters = {
    'classify__loss': ['log'],
    'classify__penalty': ['elasticnet'],
    'classify__alpha': alphas,
    'classify__l1_ratio': l1_ratios
}

estimator = Pipeline(steps=[('classify',
                             SGDClassifier(random_state=0,
                                           class_weight='balanced',
                                           loss='log',
                                           max_iter=50,
                                           tol=1e-3))])

cv_pipeline = GridSearchCV(estimator=estimator,
                           param_grid=clf_parameters,
                           n_jobs=-1,
                           cv=n_folds,
                           scoring='average_precision',
                           return_train_score=True)

shuffle_cv_pipeline = GridSearchCV(estimator=estimator,
                                   param_grid=clf_parameters,
                                   n_jobs=-1,
                                   cv=n_folds,
                                   scoring='average_precision',
                                   return_train_score=True)

# In[10]:

# Fit Regular Pipeline
cv_pipeline.fit(X=x_train_df, y=y_train_df.status.tolist())
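# The excerpt ends before `shuffle_cv_pipeline` is fitted. Judging by its name it
# serves as a permuted-label control; a hedged sketch of such a fit (the label
# permutation here is an assumption, not shown in the original):
import numpy as np

y_shuffled = np.random.permutation(y_train_df.status.tolist())
shuffle_cv_pipeline.fit(X=x_train_df, y=y_shuffled)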