Exemplo n.º 1
0
class Model:
    """Small wrapper that owns a ``KNeighborsTransformer`` and fits it on a matrix.

    NOTE(review): ``dtm`` is presumably a document-term matrix held in a
    pandas DataFrame (it must expose ``.index.tolist()``) -- confirm with
    the caller.
    """

    def __init__(self):
        # Five neighbours; n_jobs=-1 is forwarded unchanged to the transformer.
        self.knn = KNeighborsTransformer(n_jobs=-1, n_neighbors=5)

    def fit(self, dtm):
        """Fit the wrapped transformer on ``dtm``.

        The row index of ``dtm``, converted to a list, is handed to the
        transformer's ``fit`` as its second positional argument.
        """
        row_index = dtm.index.tolist()
        self.knn.fit(dtm, row_index)

    def predict(self):
        """Placeholder: not implemented yet, always returns ``None``."""
        return None
Exemplo n.º 2
0
def weighted_knn(train_adata,
                 valid_adata,
                 label_key,
                 n_neighbors=50,
                 threshold=0.5,
                 pred_unknown=True):
    """Annotates ``valid_adata`` cells with a trained weighted KNN classifier on ``train_adata``.

    For every cell in ``valid_adata`` the ``n_neighbors`` nearest training
    cells (Euclidean, brute force) are retrieved. Each neighbor distance
    ``d`` is turned into a weight ``exp(-d / b)`` with a per-cell bandwidth
    ``b = (2 / std(d))**2``, and the weights are normalised to sum to one.
    The weight mass of each label among the neighbors acts as its
    probability; the label with the largest mass is the prediction.

    Parameters
    ----------
    train_adata: :class:`~anndata.AnnData`
        Annotated dataset to be used to train KNN classifier with ``label_key`` as the target variable.
    valid_adata: :class:`~anndata.AnnData`
        Annotated dataset to be used to validate KNN classifier. It is
        modified in place (see below).
    label_key: str
        Name of the column to be used as target variable (e.g. cell_type) in ``train_adata`` and ``valid_adata``.
    n_neighbors: int
        Number of nearest neighbors in KNN classifier.
    threshold: float
        Minimum weighted probability the best label must reach to be
        accepted. Cells whose best label falls below this value are
        annotated as "Unknown". Only used when ``pred_unknown`` is ``True``.
    pred_unknown: bool
        ``True`` by default. Whether to annotate any cell as "Unknown" or not. If ``False``, ``threshold``
        is ignored and each cell is annotated with the label carrying the largest total weight among its
        ``n_neighbors`` nearest cells.

    Returns
    -------
    None
        Results are stored in ``valid_adata.obs``: ``'uncertainty'``
        (one minus the weighted probability of the cell's true label, or of
        the predicted label when the prediction matches the true label,
        clamped at 0), ``f'pred_{label_key}'`` and ``'evaluation'``
        (``'Correct'``, ``'InCorrect'`` or ``'Unknown'``). A short summary
        is printed.
    """
    print(
        f'Weighted KNN with n_neighbors = {n_neighbors} and threshold = {threshold} ... ',
        end='')
    k_neighbors_transformer = KNeighborsTransformer(n_neighbors=n_neighbors,
                                                    mode='distance',
                                                    algorithm='brute',
                                                    metric='euclidean',
                                                    n_jobs=-1)
    k_neighbors_transformer.fit(train_adata.X)

    y_train_labels = train_adata.obs[label_key].values
    y_valid_labels = valid_adata.obs[label_key].values

    top_k_distances, top_k_indices = k_neighbors_transformer.kneighbors(
        X=valid_adata.X)

    # Per-cell kernel bandwidth. A zero std (all neighbors equidistant) gives
    # an infinite bandwidth and therefore uniform weights, which is the
    # intended outcome -- only the expected divide-by-zero warning is muted.
    with np.errstate(divide='ignore'):
        bandwidths = (2. / np.std(top_k_distances, axis=1))**2
    bandwidths = bandwidths.reshape(-1, 1)

    # Shifting each row by its minimum leaves the normalised weights
    # mathematically unchanged but pins the closest neighbor's kernel value
    # to 1, so the row sum can never underflow to 0 (which would otherwise
    # produce NaN weights for distant cells with a small bandwidth).
    shifted_distances = top_k_distances - np.min(
        top_k_distances, axis=1, keepdims=True)
    top_k_distances_tilda = np.exp(
        -np.true_divide(shifted_distances, bandwidths))

    weights = top_k_distances_tilda / np.sum(
        top_k_distances_tilda, axis=1, keepdims=True)

    uncertainties = []
    pred_labels = []
    for i, neighbor_indices in enumerate(top_k_indices):
        # Look the neighbor labels up once per cell instead of per candidate.
        neighbor_labels = y_train_labels[neighbor_indices]
        row_weights = weights[i]

        # Strict '<' keeps the first label (in np.unique order) on ties.
        best_label, best_prob = None, 0.0
        for candidate_label in np.unique(neighbor_labels):
            candidate_prob = row_weights[neighbor_labels ==
                                         candidate_label].sum()
            if best_prob < candidate_prob:
                best_prob = candidate_prob
                best_label = candidate_label

        accept_best = (not pred_unknown) or best_prob >= threshold
        pred_label = best_label if accept_best else 'Unknown'

        if pred_label == y_valid_labels[i]:
            uncertainties.append(max(1 - best_prob, 0))
        else:
            # Wrong or rejected prediction: uncertainty is measured against
            # the weight mass of the true label among the neighbors.
            true_prob = row_weights[neighbor_labels ==
                                    y_valid_labels[i]].sum()
            uncertainties.append(max(1 - true_prob, 0))

        pred_labels.append(pred_label)

    pred_labels = np.array(pred_labels).reshape(-1, )
    uncertainties = np.array(uncertainties).reshape(-1, )

    is_unknown = np.asarray(pred_labels == 'Unknown', dtype=bool)
    is_match = np.asarray(pred_labels == y_valid_labels, dtype=bool)

    # 'Unknown' takes precedence in the evaluation column, so the three
    # counts are computed as a partition that agrees with that column.
    n_unknown = int(np.count_nonzero(is_unknown))
    n_correct = int(np.count_nonzero(is_match & ~is_unknown))
    n_incorrect = int(np.count_nonzero(~is_match & ~is_unknown))

    labels_eval = np.where(is_match, 'Correct', 'InCorrect').astype(object)
    labels_eval[is_unknown] = 'Unknown'

    valid_adata.obs['uncertainty'] = uncertainties
    valid_adata.obs[f'pred_{label_key}'] = pred_labels
    valid_adata.obs['evaluation'] = labels_eval

    print('finished!')
    print(f"Number of correctly classified samples: {n_correct}")
    print(f"Number of misclassified samples: {n_incorrect}")
    print(f"Number of samples classified as unknown: {n_unknown}")
Exemplo n.º 3
0
        """A function to take a feature and tokenize then return a tfidf df of that input
        """
        self.tokenizer.fit_on_texts(feature)
        a = self.tokenizer.texts_to_matrix(feature, mode='tfidf')
        config = self.tokenizer.get_config()
        feature_names = json_normalize(loads(
            config['word_index'])).columns.tolist()
        dtm = pd.DataFrame(a)
        return dtm


if __name__ == "__main__":
    # Smoke-test script: transform a single hand-written record and fit a
    # nearest-neighbors transformer on the result.
    tr = Transformer()
    negative = ['negative']
    ignore = []
    # Scalar fields ('name', 'race', 'description') are broadcast by pandas
    # across the two rows implied by the list-valued columns.
    user_transformed, y = tr.transform(
        pd.DataFrame({
            'name':
            "blue berry kush",
            'race':
            'sativa',
            'flavors': ['blueberry', 'sweet'],
            'negative': ['dry mouth', 'dry eyes'],
            'positive': ['creativity', 'stress'],
            'medical': ['ptsd', 'stress'],
            'description':
            "blueberry kush my dude blueberry_kush:10, whitewhidow:10 ",
        }), negative, ignore)
    model = KNeighborsTransformer()
    # fit() requires the training matrix; calling it with no arguments raised
    # TypeError. NOTE(review): ``user_transformed`` is assumed to be the
    # feature matrix returned by ``tr.transform`` -- confirm; ``y`` is not
    # needed by this unsupervised transformer.
    model.fit(user_transformed)
Exemplo n.º 4
0
def weighted_knn(train_adata,
                 valid_adata,
                 label_key,
                 n_neighbors=50,
                 threshold=0.5,
                 pred_unknown=True,
                 return_uncertainty=True):
    """Predict labels for ``valid_adata`` with a distance-weighted KNN fitted on ``train_adata``.

    Taken from scnet:
    https://github.com/theislab/scarches/blob/e84cfa5cf361bb22fd70865cb1f398af72248684/scnet/utils.py

    For every validation cell the ``n_neighbors`` nearest training cells
    (Euclidean, brute force) are retrieved. Each neighbor distance ``d`` is
    turned into a weight ``exp(-d / b)`` with a per-cell bandwidth
    ``b = (2 / std(d))**2`` and the weights are normalised to sum to one.
    The candidate label is the most frequent label among the neighbors
    (plain count, not weighted); its probability is the weight mass of the
    neighbors carrying that label.

    Parameters
    ----------
    train_adata
        Annotated dataset providing ``.X`` and ``.obs[label_key]`` used to
        fit the neighbor search.
    valid_adata
        Annotated dataset providing ``.X`` and ``.obs[label_key]`` for the
        cells to annotate.
    label_key: str
        Name of the label column in ``.obs`` of both datasets.
    n_neighbors: int
        Number of nearest neighbors to query.
    threshold: float
        Minimum probability the candidate label must reach to be accepted;
        below it the cell is labelled ``'Unknown'``. Only used when
        ``pred_unknown`` is ``True``.
    pred_unknown: bool
        If ``False``, the candidate label is always used.
    return_uncertainty: bool
        If ``True`` (default) also return the per-cell uncertainties.

    Returns
    -------
    pred_labels
        Array of shape ``(n_cells, 1)`` with the predicted labels.
    uncertainties
        Only when ``return_uncertainty`` is ``True``: array of shape
        ``(n_cells, 1)`` holding one minus the probability of the predicted
        label when it equals the true label, otherwise one minus the weight
        mass of the true label among the neighbors (clamped at 0).
    """
    print(
        f'Weighted KNN with n_neighbors = {n_neighbors} and threshold = {threshold} ... ',
        end='')
    k_neighbors_transformer = KNeighborsTransformer(n_neighbors=n_neighbors,
                                                    mode='distance',
                                                    algorithm='brute',
                                                    metric='euclidean',
                                                    n_jobs=-1)
    # NOTE(review): remove_sparsity is defined elsewhere; presumably it
    # converts a sparse ``.X`` to a dense array -- confirm.
    train_adata = remove_sparsity(train_adata)
    valid_adata = remove_sparsity(valid_adata)

    k_neighbors_transformer.fit(train_adata.X)

    y_train_labels = train_adata.obs[label_key].values
    y_valid_labels = valid_adata.obs[label_key].values

    top_k_distances, top_k_indices = k_neighbors_transformer.kneighbors(
        X=valid_adata.X)

    # Per-cell kernel bandwidth. A zero std (all neighbors equidistant) gives
    # an infinite bandwidth and therefore uniform weights, which is the
    # intended outcome -- only the expected divide-by-zero warning is muted.
    with np.errstate(divide='ignore'):
        bandwidths = (2. / np.std(top_k_distances, axis=1))**2
    bandwidths = bandwidths.reshape(-1, 1)

    # Shifting each row by its minimum leaves the normalised weights
    # mathematically unchanged but pins the closest neighbor's kernel value
    # to 1, so the row sum can never underflow to 0 (which would otherwise
    # produce NaN weights for distant cells with a small bandwidth).
    shifted_distances = top_k_distances - np.min(
        top_k_distances, axis=1, keepdims=True)
    top_k_distances_tilda = np.exp(
        -np.true_divide(shifted_distances, bandwidths))

    weights = top_k_distances_tilda / np.sum(
        top_k_distances_tilda, axis=1, keepdims=True)

    uncertainties = []
    pred_labels = []
    for i, neighbor_indices in enumerate(top_k_indices):
        # Look the neighbor labels up once per cell.
        neighbor_labels = y_train_labels[neighbor_indices]
        row_weights = weights[i]

        # Candidate = label with the highest neighbor count (unweighted).
        most_common_label, _ = Counter(neighbor_labels).most_common(n=1)[0]
        most_prob = row_weights[neighbor_labels == most_common_label].sum()

        accept_candidate = (not pred_unknown) or most_prob >= threshold
        pred_label = most_common_label if accept_candidate else 'Unknown'

        if pred_label == y_valid_labels[i]:
            label_prob = most_prob
        else:
            # Wrong or rejected prediction: measure against the weight mass
            # of the true label among the neighbors.
            label_prob = row_weights[neighbor_labels ==
                                     y_valid_labels[i]].sum()
        # Clamp so floating-point rounding cannot yield a negative value.
        uncertainties.append(max(1 - label_prob, 0))

        pred_labels.append(pred_label)

    pred_labels = np.array(pred_labels).reshape(-1, 1)
    uncertainties = np.array(uncertainties).reshape(-1, 1)

    print('finished!')
    if return_uncertainty:
        return pred_labels, uncertainties
    else:
        return pred_labels