def _one_fit(X,
                 y,
                 group,
                 train_index,
                 test_index,
                 estimator,
                 scale_function=None):
        # Normalize
        scaler = scalingClass(scaling=scale_function)
        X_train = scaler.fit_transform(X[train_index])
        X_test = scaler.transform(X[test_index])

        # Train classifier
        estimator.fit(X_train, y[train_index])

        # Predict
        y_pred = estimator.predict(X_test)
        if hasattr(estimator, 'predict_proba'):
            probas = estimator.predict_proba(X_test)
        elif hasattr(estimator, 'decision_function'):
            probas = estimator.decision_function(X_test)
        elif hasattr(estimator,
                     'oob_decision_function') and estimator.oob_score:
            probas = estimator.oob_decision_function(X_test)
        else:
            probas = None
        score = accuracy_score(y[test_index], y_pred)
        score_maj = score_majority_vote(y[test_index],
                                        group[test_index],
                                        y_pred=y_pred,
                                        probas=probas,
                                        labels=estimator.classes_,
                                        vote_type=sum_rule)
        return score, score_maj
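
A minimal driver sketch for _one_fit, assuming the function and the tierpsytools helpers it relies on (scalingClass, accuracy_score, score_majority_vote) are importable, and that sum_rule is defined in the enclosing scope as in the snippet above; the data below are synthetic placeholders.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupKFold

rng = np.random.RandomState(0)
X = rng.rand(60, 5)                  # 60 samples, 5 features
y = rng.randint(0, 2, size=60)       # binary class labels
group = np.repeat(np.arange(12), 5)  # 12 groups of 5 replicates

sum_rule = 'counts'  # assumed vote type; 'counts' appears in a later example

splitter = GroupKFold(n_splits=4)
estimator = LogisticRegression(max_iter=500)

fold_scores = [
    _one_fit(X, y, group, train, test, estimator,
             scale_function='standardize')
    for train, test in splitter.split(X, y, group)
]
scores, scores_maj = zip(*fold_scores)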
Example #2
    def _one_fit(X, y, group, train_index, test_index, estimator,
                 sample_weight, scale_function=None):
        # Normalize
        scaler = scalingClass(scaling=scale_function)
        X_train = scaler.fit_transform(X[train_index])
        X_test = scaler.transform(X[test_index])

        # Train classifier
        if sample_weight is None:
            estimator.fit(X_train, y[train_index])
        else:
            estimator.fit(X_train, y[train_index], sample_weight=sample_weight[train_index])

        # Predict
        y_pred = estimator.predict(X_test)
        assert all(estimator.classes_ == np.unique(y)), \
            'Not all classes are represented in the folds.'
        if hasattr(estimator, 'predict_proba'):
            y_probas = estimator.predict_proba(X_test)
        elif hasattr(estimator, 'decision_function'):
            y_probas = estimator.decision_function(X_test)
        elif hasattr(estimator, 'oob_decision_function') and estimator.oob_score:
            y_probas = estimator.oob_decision_function(X_test)
        else:
            y_probas = None
        return test_index, y_pred, y_probas, estimator
def _majority_vote_CV(X, y, group, estimator, splitter, scale_function=None):

    ## Majority vote
    #---------------
    X = np.array(X)
    y = np.array(y)
    group = np.array(group)

    scores = []
    scores_maj = []
    for train_index, test_index in splitter.split(X, y, group):

        # Normalize
        scaler = scalingClass(scaling=scale_function)
        X_train = scaler.fit_transform(X[train_index])
        X_test = scaler.transform(X[test_index])

        # Train classifier
        estimator.fit(X_train, y[train_index])

        # Predict
        y_pred = estimator.predict(X_test)
        scores.append(accuracy_score(y[test_index], y_pred))
        scores_maj.append(score_majority_vote(y[test_index],
                                              group[test_index],
                                              y_pred=y_pred))

    return scores, scores_maj
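
A possible call, assuming _majority_vote_CV and its tierpsytools dependencies are in scope; the data and splitter are illustrative only.

import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import GroupKFold

rng = np.random.RandomState(42)
X = rng.rand(40, 8)
y = rng.randint(0, 2, size=40)
group = np.repeat(np.arange(8), 5)   # 8 groups of 5 replicates

scores, scores_maj = _majority_vote_CV(
    X, y, group, SVC(kernel='linear'), GroupKFold(n_splits=4),
    scale_function='minmax_scale')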
Example #4
from copy import deepcopy

def cv_predict_single(X,
                      y,
                      splitter,
                      estimator,
                      group=None,
                      scale_function=None,
                      sample_weight=None):

    labels = np.unique(y)
    X = np.array(X)
    y = np.array(y)

    if sample_weight is not None:
        sample_weight = np.array(sample_weight)

    pred = np.empty_like(y)
    probas = np.empty((X.shape[0], labels.shape[0]))

    test_folds = []
    trained_estimators = []
    for train_index, test_index in splitter.split(X, y, group):

        test_folds.append(test_index)

        # Normalize
        scaler = scalingClass(scaling=scale_function)
        X_train = scaler.fit_transform(X[train_index])
        X_test = scaler.transform(X[test_index])

        # Train classifier
        if sample_weight is not None:
            estimator.fit(X_train, y[train_index], sample_weight[train_index])
        else:
            estimator.fit(X_train, y[train_index])
        # keep an independent copy per fold; appending the estimator itself
        # would leave the list holding references to the final refit only
        trained_estimators.append(deepcopy(estimator))

        # Predict
        pred[test_index] = estimator.predict(X_test)
        assert all(estimator.classes_ == labels), \
            'Not all classes are represented in the folds.'

        if hasattr(estimator, 'predict_proba'):
            probas[test_index] = estimator.predict_proba(X_test)
        elif hasattr(estimator, 'decision_function'):
            probas[test_index] = estimator.decision_function(X_test)
        elif hasattr(estimator,
                     'oob_decision_function') and estimator.oob_score:
            probas[test_index] = estimator.oob_decision_function(X_test)
        else:
            probas = None

    return pred, probas, labels, test_folds, trained_estimators
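
A sketch of calling cv_predict_single on synthetic data; StratifiedKFold ignores the groups argument, so it stands in here for a group-aware splitter.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

rng = np.random.RandomState(0)
X = rng.rand(50, 6)
y = rng.randint(0, 3, size=50)       # three classes

pred, probas, labels, folds, fitted = cv_predict_single(
    X, y, StratifiedKFold(n_splits=5), LogisticRegression(max_iter=500),
    scale_function='standardize')
# fitted holds one independently trained estimator per fold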
    def scaler(self, scalingtype):

        if isinstance(scalingtype, str) or scalingtype is None:
            scalerinstance = scalingClass(scaling=scalingtype)
        elif hasattr(scalingtype, 'fit'):
            scalerinstance = scalingtype
        else:
            raise ValueError(
                'Scaling parameter type not recognised. Valid parameter types '
                'include the strings \'minmax_scale\', \'standardize\' and '
                '\'normalize\', and instances of scaling classes with a fit() '
                'method.')

        return scalerinstance
scorer = ['accuracy', ClassifScorer(scorer='f1', average='macro')]
scorenames = ['accuracy', 'f1']

# Feature set
root_classif = Path(ANALYSIS_DIR) / 'classification' / 'results'
path_to_set = root_classif / 'feature_selection_results' / 'best_feature_set.csv'
feat_set = pd.read_csv(path_to_set, header=None)[0].to_numpy()

# Base estimator pipeline
# (estimator used to classify tierpsy feature vectors to modes of action)
path_to_params = root_classif / 'optimize_hyperparameters_results' / 'best_params.p'
with open(path_to_params, 'rb') as fid:
    params = pickle.load(fid)

estimator = LogisticRegression(**params)

pipe = Pipeline([('scaler', scalingClass(scaling=scale)),
                 ('estimator', estimator)])

# Ensemble of binary estimators pipeline
# (estimators used to classify compounds in known-novel based on theta scores)
svc = SVC(C=1, kernel='linear', class_weight='balanced', gamma='auto')
binary_pipe = Pipeline([('scaler', StandardScaler()), ('svc', svc)])

saveto = Path().cwd() / 'results'
saveto.mkdir(parents=True, exist_ok=True)

#%% Read data
feat = pd.read_csv(data_file, usecols=feat_set)
y = pd.read_csv(meta_file, usecols=['MOA_group']).values.flatten()
group = pd.read_csv(meta_file, usecols=['drug_type']).values.flatten()
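
These pieces would typically feed a grouped cross-validation; a sketch under the assumption that a group-aware splitter cv is defined, such as the StratifiedGroupKFold used in a later example.

from sklearn.model_selection import cross_validate

res = cross_validate(pipe, feat, y, groups=group, cv=cv,
                     scoring='accuracy', return_estimator=True)
print(res['test_score'])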
Example #7
def k_significant_from_classifier(feat,
                                  y_class,
                                  estimator,
                                  k=5,
                                  scale=None,
                                  feat_names=None,
                                  k_to_plot=None,
                                  close_after_plotting=False,
                                  saveto=None,
                                  figsize=None,
                                  title=None,
                                  xlabel=None):
    """
    param:
        feat: array-like, shape=(n_samlples, n_features)
            The feature matrix
        y_class: array-like, shape=(n_samples)
            Vector with the class of each samples
        estimator: object
            A supervised learning estimator with a fit method that provides
            information about feature importance either through a coef_
            attribute or through a feature_importances_ attribute.
        k: integer or 'all', optional
            Number of fetures to select
            Default is 5.
        scale: None, str or function, optional
            If string 'standardize', 'minmax_scale', the
            tierpsytools.preprocessing.scaling_class.scalingClass is used
            to scale the features.
            Otherwise the used can input a function that scales features.
            Default is None (no scaling).
        feat_names: list shape=(n_features)
            The names of the features, when feat is an array and not a dataframe
            (will be used for plotting)
        plot: boolean
            If True, the boxplots of the chosen features will be plotted

    return:
        top_feat: list, shape=(k,)
            The names or indexes (if feature names are not given) of the top features,
            sorted by importance.
        scores: array-like, shape=(k,)
            The scores of the top features, sorted by importance.
        support: array of booleans, shape=(n_features,)
            True for the selected features, False for the rest
        plot
    """
    from tierpsytools.preprocessing.scaling_class import scalingClass

    if k_to_plot is None:
        plot = False
    else:
        plot = True

    if isinstance(feat, np.ndarray):
        feat = pd.DataFrame(feat, columns=feat_names)

    if isinstance(k, str):
        if k == 'all':
            k = feat.shape[1]
        else:
            raise ValueError("k must be an integer or the string 'all'.")

    if scale is not None:
        if isinstance(scale, str):
            scaler = scalingClass(scaling=scale)
            feat_scaled = scaler.fit_transform(feat)
        else:
            feat_scaled = scale(feat)
    else:
        feat_scaled = feat

    estimator.fit(feat_scaled, y_class)
    if hasattr(estimator, 'coef_'):
        scores = np.linalg.norm(estimator.coef_, axis=0, ord=1)
    elif hasattr(estimator, 'feature_importances_'):
        scores = estimator.feature_importances_
    else:
        raise ValueError(
            'The chosen estimator does not have a coef_ attribute' +
            ' or a feature_importances_ attribute.')

    top_ft_ids = np.flip(np.argsort(scores))[:k]
    support = np.zeros(feat.shape[1]).astype(bool)
    support[top_ft_ids] = True
    scores = scores[top_ft_ids]

    # Plot a boxplot for each feature, showing its distribution in each class
    if plot:
        plot_feature_boxplots(feat.iloc[:, top_ft_ids[:k_to_plot]],
                              y_class,
                              scores,
                              figsize=figsize,
                              saveto=saveto,
                              xlabel=xlabel,
                              close_after_plotting=close_after_plotting)

    top_feat = feat.columns[top_ft_ids].to_list()
    return top_feat, scores, support
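
A hedged usage sketch on random data; the estimator choice is illustrative, as any classifier exposing coef_ or feature_importances_ works.

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
feat = rng.rand(30, 10)
y_class = rng.randint(0, 2, size=30)

# Without feat_names, top_feat returns column indexes rather than names
top_feat, scores, support = k_significant_from_classifier(
    feat, y_class, LogisticRegression(max_iter=500),
    k=3, scale='standardize')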
Example #8
def k_significant_feat(feat,
                       y_class,
                       k=5,
                       score_func='f_classif',
                       scale=None,
                       feat_names=None,
                       plot=True,
                       k_to_plot=None,
                       close_after_plotting=False,
                       saveto=None,
                       figsize=None,
                       title=None,
                       xlabel=None):
    """
    Finds the k most significant features in the feature matrix, based on
    how well they separate the data in groups defined in y_class. It uses
    univariate statistical tests (the type of test is specified in the variable
    score_func).
    param:
        feat: array-like, shape=(n_samlples, n_features)
            The feature matrix
        y_class: array-like, shape=(n_samples)
            Vector with the class of each samples
        k: integer or 'all'
            Number of fetures to select
        score_func: str or function, optional
            If string 'f_classif', 'chi2', 'mutual_info_classif' then the
            function f_classif, chi2 or mutual_info_classif
            from sklearn.feature_selection will be used.
            Otherwise, the user needs to input a function that takes two
            arrays X and y, and returns a pair of arrays (scores, pvalues)
            or a single array with scores.
            Default is 'f_classif'.
        scale: None, str or function, optional
            If string 'standardize', 'minmax_scale', the
            tierpsytools.preprocessing.scaling_class.scalingClass is used
            to scale the features.
            Otherwise the used can input a function that scales features.
            Default is None (no scaling).
        feat_names: list shape=(n_features)
            The names of the features, when feat is an array and not a dataframe
            (will be used for plotting)

    return:
        support: array of booleans
            True for the selected features, False for the rest
        plot: boolean
            If True, the boxplots of the chosen features will be plotted
        plot
    """
    from sklearn.feature_selection import \
        SelectKBest, chi2, f_classif, mutual_info_classif

    if plot and k_to_plot is None:
        k_to_plot = k

    if isinstance(feat, np.ndarray):
        feat = pd.DataFrame(feat, columns=feat_names)
    feat = feat.loc[:, feat.std() != 0]

    if isinstance(k, str):
        if k == 'all':
            k = feat.shape[1]
        else:
            raise ValueError("k must be an integer or the string 'all'.")

    # Find most significant features
    if isinstance(score_func, str):
        if score_func == 'f_classif':
            score_func = f_classif
        elif score_func == 'chi2':
            score_func = chi2
        elif score_func == 'mutual_info_classif':
            score_func = mutual_info_classif

    if scale is not None:
        if isinstance(scale, str):
            scaler = scalingClass(scaling=scale)
            feat_scaled = scaler.fit_transform(feat)
        else:
            feat_scaled = scale(feat)
    else:
        feat_scaled = feat

    skb = SelectKBest(score_func=score_func, k=k)
    skb.fit(feat_scaled, y_class)

    support = skb.get_support()
    sorted_scores = np.sort(skb.scores_)
    ids_sorted_scores = np.argsort(skb.scores_)
    top_ft_ids = np.flip(ids_sorted_scores[~np.isnan(sorted_scores)])[:k]
    scores = skb.scores_[top_ft_ids]
    if hasattr(skb, 'pvalues_'):
        pvalues = skb.pvalues_[top_ft_ids]
    else:
        pvalues = None

    # Plot a boxplot for each feature, showing its distribution in each class
    if plot:
        plot_feature_boxplots(feat.iloc[:, top_ft_ids[:k_to_plot]],
                              y_class,
                              scores,
                              pvalues=pvalues,
                              figsize=figsize,
                              saveto=saveto,
                              xlabel=xlabel,
                              close_after_plotting=close_after_plotting)

    if pvalues is not None:
        return feat.columns[top_ft_ids].to_list(), (scores, pvalues), support
    else:
        return feat.columns[top_ft_ids].to_list(), scores, support
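
A usage sketch with the default f_classif test, which also returns pvalues; the data are synthetic placeholders.

import numpy as np

rng = np.random.RandomState(0)
feat = rng.rand(40, 12)
y_class = rng.randint(0, 3, size=40)

top_feat, (scores, pvalues), support = k_significant_feat(
    feat, y_class, k=4, score_func='f_classif', plot=False)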
Example #9
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from tierpsytools.analysis.cv_splitters import StratifiedGroupKFold
from tierpsytools.analysis.scorers import ClassifScorer
from tierpsytools.preprocessing.scaling_class import scalingClass
from feat_selection import main_feature_selection
from optimize_hyperparameters import main_optimize_hyperparameters
from predict_test_set import main_predict_test_set
from moaclassification import INPUT_DIR

#%%
# Input parameters
align_blue = True
balance_classes = True
n_average = 20
scale = 'rescale1'
scaler = scalingClass(scaling=scale)

# Estimator parameters
estimator = LogisticRegression(penalty='elasticnet',
                               l1_ratio=0.5,
                               C=1,
                               solver='saga',
                               multi_class='multinomial',
                               max_iter=500)

pipe = Pipeline([('scaler', scaler), ('estimator', estimator)])

# CV parameters
n_folds = 4
cv = StratifiedGroupKFold(n_splits=n_folds, random_seed=724)
vote_type = 'counts'
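
A sketch of running this pipeline through the CV splitter; X, y and group below are synthetic stand-ins (one class per group, mirroring the compound-to-MOA setting), and the splitter is assumed to expose split(X, y, group) as used in the snippets above.

import numpy as np

rng = np.random.RandomState(724)
X = rng.rand(80, 10)
group = np.repeat(np.arange(16), 5)            # 16 compounds, 5 replicates
y = np.repeat(rng.randint(0, 4, size=16), 5)   # one MOA class per compound

fold_acc = []
for train, test in cv.split(X, y, group):
    pipe.fit(X[train], y[train])
    fold_acc.append(pipe.score(X[test], y[test]))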
Example #10
feat, meta = filter_n_skeletons(feat,
                                meta,
                                min_nskel_per_video=2000,
                                min_nskel_sum=None)

# Filter rows based on percentage of nan values
feat = filter_nan_inf(feat, 0.2, 1)
meta = meta.loc[feat.index]

# Filter features based on percentage of nan values
feat = filter_nan_inf(feat, 0.05, 0)

#%% Preprocess data
# Impute the remaining nan values
feat = impute_nan_inf(feat, groupby=None)

# Scale the features (necessary if you want to do PCA)
scaler = scalingClass(scaling='standardize')
feat = scaler.fit_transform(feat)

#%% Check day-to-day variation
# Get the PCA decomposition
pca = PCA(n_components=2)
Y = pca.fit_transform(feat)

# Plot the samples of each day in the first two PCs
plt.figure()
for day in meta['date_yyyymmdd'].unique():
    plt.scatter(*Y[meta['date_yyyymmdd'] == day, :].T, label=day)
plt.legend()
Example #11
#%% Keep only the moas of interest
# mask = meta['drug_type'].isin(DRUGS)
# meta = meta[mask]
# feat = feat[mask]

# make mapper to get drug names
meta['name_'] = meta[['MOA_group', 'drug_name']].apply(
    lambda x: ' - '.join(map(str, x.values)), axis=1)
namemapper = dict(meta[['drug_type', 'name_']].values)

moamapper = dict(meta[['MOA_group', 'MOA_general']]
                 .drop_duplicates(subset=['MOA_group']).values)

#%% Scale features
scaler = scalingClass()

feat = scaler.fit_transform(feat)
# apply the scaling fitted on feat to sfeat (mirrors pca.transform below)
sfeat = scaler.transform(sfeat)

#%% PCA colored by moa
pca = PCA(n_components=3)
Y = pca.fit_transform(feat)
sY = pca.transform(sfeat)

fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(111, projection='3d')
ax.set_title('original data')
#    ax.set_aspect('equal')
for moa, c in zip(moas_to_plot, colors):
    mask = meta['MOA_group'].isin([moa]).values