Example #1
# Long-format table of grid-search results: CV scores plus one column per
# searched hyperparameter (assumed reconstruction of the truncated statement).
cv_results = pd.concat([
    pd.DataFrame(cv_pipeline.cv_results_).drop('params', axis=1),
    pd.DataFrame.from_records(cv_pipeline.cv_results_['params'])
], axis=1)

# Cross-validated performance heatmap
cv_score_mat = pd.pivot_table(cv_results,
                              values='mean_test_score',
                              index='classify__l1_ratio',
                              columns='classify__alpha')
ax = sns.heatmap(cv_score_mat, annot=True, fmt='.1%')
ax.set_xlabel('Regularization strength multiplier (alpha)')
ax.set_ylabel('Elastic net mixing parameter (l1_ratio)')
plt.tight_layout()
plt.savefig(cv_heatmap_file, dpi=600, bbox_inches='tight')
plt.close()

# Get predictions
y_predict_train = cv_pipeline.decision_function(x_train)
y_predict_test = cv_pipeline.decision_function(x_test)
metrics_train = get_threshold_metrics(y_train,
                                      y_predict_train,
                                      drop_intermediate=keep_inter)
metrics_test = get_threshold_metrics(y_test,
                                     y_predict_test,
                                     drop_intermediate=keep_inter)

# Rerun "cross validation" for the best hyperparameter set to define
# cross-validation disease-specific performance. Each sample prediction is
# based on the fold that the sample was in the testing partition
y_cv = cross_val_predict(cv_pipeline.best_estimator_,
                         X=x_train,
                         y=y_train,
                         cv=folds,
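# A plausible next step (sketch, not part of the original snippet): score the
# cross-validated predictions with the same helper used for the train/test sets.
metrics_cv = get_threshold_metrics(y_train, y_cv, drop_intermediate=keep_inter)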
def evaluate_classifier(X_train, X_test, y, y_train_allgenes, y_test_allgenes,
                        list_of_genes, set_k_range, k_function, alpha_range,
                        l1_ratio):
    ''' Run a classifier setup on a set of genes.

        For each gene, train and tune the classifier with the hyperparameter
        ranges passed as arguments, then record which hyperparameters were
        selected and how well the classifier performed for that gene.
    '''

    # A dictionary to hold the performance metrics.
    metrics_dict = {}

    # Loop through each gene: train and test the classifier, then populate the metrics dictionary.
    for gene in list_of_genes:

        # Train and test the classifier.

        y_gene = y[gene]
        y_train = y_train_allgenes[gene]
        y_test = y_test_allgenes[gene]
        # Number of positive samples for this gene.
        num_positives = int(y_gene.value_counts(True)[1] * len(y_gene))
        if set_k_range:
            k_range = set_k_range
        else:
            k_range = k_function(num_positives)
        # Parameter Sweep for Hyperparameters
        param_grid = {
            'select__k': k_range,
            'classify__loss': ['log'],
            'classify__penalty': ['elasticnet'],
            'classify__alpha': alpha_range,
            'classify__l1_ratio': l1_ratio,
        }
        pipeline = Pipeline(steps=[('select', SelectKBest(variance_scorer)),
                                   ('classify',
                                    SGDClassifier(random_state=RANDOMSEED,
                                                  class_weight='balanced'))])
        cv_pipeline = GridSearchCV(estimator=pipeline,
                                   param_grid=param_grid,
                                   n_jobs=1,
                                   scoring='roc_auc')
        cv_pipeline.fit(X=X_train, y=y_train)
        y_pred_train = cv_pipeline.decision_function(X_train)
        y_pred_test = cv_pipeline.decision_function(X_test)

        # Get ROC info.
        def get_threshold_metrics(y_true, y_pred):
            roc_columns = ['fpr', 'tpr', 'threshold']
            roc_items = zip(roc_columns, roc_curve(y_true, y_pred))
            roc_df = pd.DataFrame(dict(roc_items))
            auroc = roc_auc_score(y_true, y_pred)
            return {'auroc': auroc, 'roc_df': roc_df}

        metrics_train = get_threshold_metrics(y_train, y_pred_train)
        metrics_test = get_threshold_metrics(y_test, y_pred_test)

        # Populate the metrics dictionary.

        # Get metrics for the classifier.
        overfit = metrics_train['auroc'] - metrics_test['auroc']
        # Understand how the parameter grid worked... any params at the edge?
        if cv_pipeline.best_params_['select__k'] == min(
                param_grid['select__k']):
            n_comp_status = 'min'
        elif cv_pipeline.best_params_['select__k'] == max(
                param_grid['select__k']):
            n_comp_status = 'max'
        else:
            n_comp_status = 'OK'
        if cv_pipeline.best_params_['classify__alpha'] == min(
                param_grid['classify__alpha']):
            alpha_status = 'min'
        elif cv_pipeline.best_params_['classify__alpha'] == max(
                param_grid['classify__alpha']):
            alpha_status = 'max'
        else:
            alpha_status = 'OK'
        metrics = {
            'num_positive': num_positives,
            'train_auroc': metrics_train['auroc'],
            'test_auroc': metrics_test['auroc'],
            'n_components': cv_pipeline.best_params_['select__k'],
            'alpha': cv_pipeline.best_params_['classify__alpha'],
            'overfit': overfit,
            'n_comp_status': n_comp_status,
            'alpha_status': alpha_status
        }
        # Add the metrics to the dictionary.
        metrics_dict[gene] = metrics
    # Change the metrics dict into a formatted pandas dataframe.
    metrics_df = pd.DataFrame(metrics_dict)
    metrics_df = metrics_df.T
    metrics_df.sort_values(by='num_positive', ascending=True, inplace=True)
    metrics_df = metrics_df[[
        'num_positive', 'n_components', 'n_comp_status', 'alpha',
        'alpha_status', 'train_auroc', 'test_auroc', 'overfit'
    ]]

    return metrics_df
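# Example call (a sketch with hypothetical inputs, not from the original code).
# X_train/X_test are feature matrices; y, y_train_allgenes and y_test_allgenes are
# label DataFrames with one column per gene symbol; `variance_scorer` and
# RANDOMSEED must already be defined in the enclosing scope.
example_metrics_df = evaluate_classifier(X_train, X_test,
                                         y, y_train_allgenes, y_test_allgenes,
                                         list_of_genes=['TP53', 'KRAS'],    # hypothetical genes
                                         set_k_range=[500, 1000, 2000],     # fixed k grid for SelectKBest
                                         k_function=None,                   # ignored when set_k_range is given
                                         alpha_range=[1e-3, 1e-2, 1e-1],
                                         l1_ratio=[0.0, 0.15, 1.0])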
# Reconstructed from a truncated fragment; assumes the same `pipeline` and
# `param_grid` used for cv_pipeline10MAD below.
cv_pipeline30 = GridSearchCV(estimator=pipeline,
                             param_grid=param_grid,
                             n_jobs=1,
                             scoring='roc_auc')
cv_pipeline10MAD = GridSearchCV(estimator=pipeline,
                                param_grid=param_grid,
                                n_jobs=1,
                                scoring='roc_auc')

# In[28]:

# Timed cell: fit each grid search on its corresponding training matrix.
get_ipython().run_cell_magic(
    'time', '',
    'cv_pipeline10.fit(X=X10_train, y=y_train)\n'
    'cv_pipeline30.fit(X=X30_train, y=y_train)\n'
    'cv_pipeline10MAD.fit(X=X10MAD_train, y=y_train)')

# In[29]:

y10_pred_train = cv_pipeline10.decision_function(X10_train)
y10_pred_test = cv_pipeline10.decision_function(X10_test)

y30_pred_train = cv_pipeline30.decision_function(X30_train)
y30_pred_test = cv_pipeline30.decision_function(X30_test)

y10MAD_pred_train = cv_pipeline10MAD.decision_function(X10MAD_train)
y10MAD_pred_test = cv_pipeline10MAD.decision_function(X10MAD_test)


def get_threshold_metrics(y_true, y_pred, tissue='all'):
    roc_columns = ['fpr', 'tpr', 'threshold']
    roc_items = zip(roc_columns, roc_curve(y_true, y_pred))
    roc_df = pd.DataFrame(dict(roc_items))
    auroc = roc_auc_score(y_true, y_pred)
    return {'auroc': auroc, 'roc_df': roc_df, 'tissue': tissue}
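# Sketch (assumed names: y_test and matplotlib's `plt` from earlier cells; not part
# of the original notebook): compare the three feature sets by plotting held-out
# ROC curves from the returned 'roc_df' tables.
for label, scores in [('10', y10_pred_test),
                      ('30', y30_pred_test),
                      ('10 MAD', y10MAD_pred_test)]:
    m = get_threshold_metrics(y_test, scores)
    plt.plot(m['roc_df'].fpr, m['roc_df'].tpr,
             label='{} (AUROC = {:.3f})'.format(label, m['auroc']))
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()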
def evaluate_classifier(X, y, list_of_queries, set_k_range, k_function,
                        alpha_range, l1_ratio):
    ''' Run a classifier setup on a set of queries.

        For each query, train and tune the classifier with the hyperparameter
        ranges passed as arguments, then record which hyperparameters were
        selected and how well the classifier performed for that query.
    '''

    # A dictionary to hold the performance metrics.
    metrics_dict = {}

    # Loop through each query: train and test the classifier, then populate the metrics dictionary.
    for query in list_of_queries:
        num_samples = query[2]['total']
        num_positives = query[2]['positive']

        # Subset by gene.
        y_query = y[query[0]]
        # Subset by diseases.
        disease_cols = [
            col for col in covariates.columns if col.endswith(tuple(query[1]))
        ]
        has_disease = covariates[disease_cols].max(axis=1) > 0
        covariates_query = covariates[has_disease]
        X_query = X[X.index.isin(covariates_query.index)]
        y_query = y_query[y_query.index.isin(covariates_query.index)]

        # Test Train split
        test_size = 0.2
        X_train, X_test, y_train, y_test = train_test_split(
            X_query,
            y_query,
            stratify=y_query,
            test_size=test_size,
            random_state=RANDOMSEED)
        # Standardize, then reduce dimensionality with PCA. Use at most 350
        # components; for small queries, fall back to one fewer than the number
        # of training samples.
        scaler = StandardScaler()
        if query[2]['total'] * (1 - test_size) * (1 - (1 / 3)) > 350:
            n_comp = 350
        else:
            n_comp = int(query[2]['total'] * (1 - test_size) - 1)
        pca = PCA(n_components=n_comp, random_state=RANDOMSEED)
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        pca.fit(X_train_scaled)
        X_train = pca.transform(X_train_scaled)
        X_test_scaled = scaler.transform(X_test)
        X_test = pca.transform(X_test_scaled)

        if set_k_range:
            k_range = set_k_range
        else:
            k_range = k_function(
                num_samples=num_samples,
                num_positives=num_positives,
            )
        # Parameter Sweep for Hyperparameters
        param_grid = {
            'select__k': k_range,
            'classify__loss': ['log'],
            'classify__penalty': ['elasticnet'],
            'classify__alpha': alpha_range,
            'classify__l1_ratio': l1_ratio,
        }
        pipeline = Pipeline(steps=[('select', SelectKBest(variance_scorer)),
                                   ('classify',
                                    SGDClassifier(random_state=RANDOMSEED,
                                                  class_weight='balanced'))])
        cv_pipeline = GridSearchCV(estimator=pipeline,
                                   param_grid=param_grid,
                                   n_jobs=1,
                                   scoring='roc_auc')
        cv_pipeline.fit(X=X_train, y=y_train)
        y_pred_train = cv_pipeline.decision_function(X_train)
        y_pred_test = cv_pipeline.decision_function(X_test)

        # Get ROC info.
        def get_threshold_metrics(y_true, y_pred):
            roc_columns = ['fpr', 'tpr', 'threshold']
            roc_items = zip(roc_columns, roc_curve(y_true, y_pred))
            roc_df = pd.DataFrame(dict(roc_items))
            auroc = roc_auc_score(y_true, y_pred)
            return {'auroc': auroc, 'roc_df': roc_df}

        metrics_train = get_threshold_metrics(y_train, y_pred_train)
        metrics_test = get_threshold_metrics(y_test, y_pred_test)

        # Populate the metrics dictionary.
        # Get metrics for the classifier.
        overfit = metrics_train['auroc'] - metrics_test['auroc']
        # Understand how the parameter grid worked... any params at the edge?
        if cv_pipeline.best_params_['select__k'] == min(
                param_grid['select__k']):
            n_comp_status = 'min'
        elif cv_pipeline.best_params_['select__k'] == max(
                param_grid['select__k']):
            n_comp_status = 'max'
        else:
            n_comp_status = 'OK'
        if cv_pipeline.best_params_['classify__alpha'] == min(
                param_grid['classify__alpha']):
            alpha_status = 'min'
        elif cv_pipeline.best_params_['classify__alpha'] == max(
                param_grid['classify__alpha']):
            alpha_status = 'max'
        else:
            alpha_status = 'OK'
        metrics = {
            'num_samples': num_samples,
            'num_positive': num_positives,
            'balance': num_positives / num_samples,
            'train_auroc': metrics_train['auroc'],
            'test_auroc': metrics_test['auroc'],
            'n_components': cv_pipeline.best_params_['select__k'],
            'alpha': cv_pipeline.best_params_['classify__alpha'],
            'overfit': overfit,
            'n_comp_status': n_comp_status,
            'alpha_status': alpha_status
        }
        # Add the metrics to the dictionary.
        metrics_dict[query[0] + str(query[2]['total'])] = metrics
    # Change the metrics dict into a formatted pandas dataframe.
    metrics_df = pd.DataFrame(metrics_dict)
    metrics_df = metrics_df.T
    metrics_df.sort_values(by='num_positive', ascending=True, inplace=True)
    metrics_df = metrics_df[[
        'num_samples', 'num_positive', 'balance', 'n_components',
        'n_comp_status', 'alpha', 'alpha_status', 'train_auroc', 'test_auroc',
        'overfit'
    ]]

    return metrics_df
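# Example call (a sketch; the query below is hypothetical). Each query is a tuple
# (gene, list_of_disease_acronyms, {'total': n_samples, 'positive': n_positives}),
# matching how query[0], query[1], and query[2] are used above. A `covariates`
# DataFrame with one indicator column per disease must exist in the enclosing scope.
example_queries = [('TP53', ['LUAD', 'LUSC'], {'total': 1000, 'positive': 550})]
example_metrics_df = evaluate_classifier(X, y,
                                         list_of_queries=example_queries,
                                         set_k_range=[500, 1000, 2000],
                                         k_function=None,  # ignored when set_k_range is given
                                         alpha_range=[1e-3, 1e-2, 1e-1],
                                         l1_ratio=[0.15])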
Example #7
# Cross-validated performance heatmap
cv_score_mat = pd.pivot_table(cv_results,
                              values='mean_test_score',
                              index='classify__l1_ratio',
                              columns='classify__alpha')
ax = sns.heatmap(cv_score_mat, annot=True, fmt='.1%')
ax.set_xlabel('Regularization strength multiplier (alpha)')
ax.set_ylabel('Elastic net mixing parameter (l1_ratio)')
plt.tight_layout()
plt.savefig(cv_heatmap_file, dpi=600, bbox_inches='tight')

# ## Generate Predictions

# In[14]:

y_predict_train = cv_pipeline.decision_function(x_train_df)
y_predict_test = cv_pipeline.decision_function(x_test_df)

y_predict_shuffled_train = shuffle_cv_pipeline.decision_function(
    x_train_shuffled_df)
y_predict_shuffled_test = shuffle_cv_pipeline.decision_function(x_test_df)

# In[15]:

y_test_meta_df = (y_test_df.assign(
    predicted=y_predict_test,
    predicted_shuffle=y_predict_shuffled_test).merge(meta_test_df,
                                                     left_index=True,
                                                     right_index=True))
y_train_meta_df = (y_train_df.assign(
    predicted=y_predict_train,