import numpy as np
import matplotlib.pyplot as plt
from openTSNE import TSNE


def activations_tsne_plot(activations, labels, ds):
    """Compute a t-SNE embedding for each layer's activations and plot them."""
    tsne = TSNE(
        perplexity=30,
        metric="euclidean",
        n_jobs=8,
        random_state=42,
        verbose=False,
    )
    fig, axes = plt.subplots(nrows=1, ncols=len(activations), figsize=(25, 5))

    embs = []
    for idx, acts in enumerate(activations):
        print(f"Learning embeddings for layer {idx}...")
        embeddings = tsne.fit(acts)

        # Plot each class in its own color on this layer's axes.
        for i, actual_label in enumerate(ds.classes):
            indices = np.squeeze(np.argwhere(labels == i))
            axes[idx].scatter(embeddings[indices, 0], embeddings[indices, 1],
                              label=actual_label, s=2)
        axes[idx].legend()
        axes[idx].set_title(f"Activations in layer {idx}")

        embs.append(embeddings)

    fig.tight_layout()
    return embs
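# Minimal usage sketch (assumption, not from the source): three layers of
# synthetic activations for a two-class dataset stand-in.
class _FakeDataset:
    classes = ["class_a", "class_b"]

rng = np.random.default_rng(0)
fake_acts = [rng.normal(size=(200, 64)) for _ in range(3)]
fake_labels = rng.integers(0, 2, size=200)
embs = activations_tsne_plot(fake_acts, fake_labels, _FakeDataset())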
Example #2
File: TSNEO.py Project: ivarvb/LSP
def execute(self):
    X = np.array(self.X)
    X2 = TSNE(n_components=self.p, random_state=7, perplexity=33).fit(X)
    return X2.tolist()
Example #3
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
from openTSNE import TSNE


def plot_tsne(source_data, source_name, target_data, target_name,
              plot_directory):
    fig, ax = plt.subplots()
    perplexities = [100]
    for perplexity in perplexities:
        tsne = TSNE(n_components=2,
                    initialization='pca',
                    random_state=0,
                    perplexity=perplexity,
                    n_iter=1000,
                    neighbors='approx')
        # Fit on the source domain, then project the target domain into the
        # same embedding space; two independent fits would produce
        # incomparable coordinate systems.
        x_source_transformed = tsne.fit(source_data)
        x_target_transformed = x_source_transformed.transform(target_data)
        ax.set_title('Perplexity=%d' % perplexity)
        ax.scatter(x_source_transformed[:, 0],
                   x_source_transformed[:, 1],
                   c='r',
                   label='source')
        ax.scatter(x_target_transformed[:, 0],
                   x_target_transformed[:, 1],
                   c='b',
                   label='target')
        ax.xaxis.set_major_formatter(NullFormatter())
        ax.yaxis.set_major_formatter(NullFormatter())
        ax.axis('tight')
        ax.legend()
        plt.savefig(f'{plot_directory}tsne_source{source_name}'
                    f'_target{target_name}.png',
                    dpi=500)
Example #4
def hc_tsne(
    X,
    initialization,
    tree,
    alpha=1e-3,
    weights=(0.5, 0.5, 0.0),
    margin=0.5,
    loss_logger=None,
    **tsne_kwargs,
):
    """Run openTSNE with custom `negative_gradient_method`, in which the
    hierarchical constraints are encoded in a regularization term.

    Args:
        X: ndarray (N, D)
        initialization: initialization embedding in 2D, (N, 2)
        tree: hierarchical constraints represented in tree form (using anytree lib)
        alpha: contribution of regularization term in the new objective function
        weights: weights of different elements in the regularization
        margin: margin in the triplet loss.
            The real margin m is calculated as `margin * dist(anchor, negative)`
        loss_logger: logger object (containing a dict) to store loss at each iter.
        **tsne_kwargs: openTSNE params

    Returns:
        Z: new embedding model, can be used as (N, 2) array,
            or tsne object for embedding new datapoints.
    """
    # from the tree-like constraints, create a regularization term by
    #   using the defined hierarchical triplet loss.
    tree_regularizer = partial(
        hierarchical_triplet_loss, tree=tree, margin=margin, weights=weights
    )

    # run openTSNE with custom negative gradient function
    tsne = TSNE(
        initialization=initialization,
        callbacks=ErrorLogger(),  # use this to evaluate kl_loss at every 10 iterations
        negative_gradient_method=partial(
            my_kl_divergence_bh,
            list_regularizers=[(alpha, tree_regularizer)],
            logger=loss_logger,
        ),
        **tsne_kwargs,
    )

    Z = tsne.fit(X)

    # now clear the regularizers from tsne object so we will not use them for embedding
    # new samples (of test set)
    Z.gradient_descent_params["negative_gradient_method"] = "bh"
    return Z
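# Hedged usage sketch (assumption, not from the source): hc_tsne expects an
# anytree hierarchy and a 2D initialization, and `my_kl_divergence_bh` must
# be defined in scope. Per the docstring, the returned Z acts as an (N, 2)
# array and can embed held-out points:
#
#     from anytree import Node
#     from openTSNE import initialization
#
#     root = Node("root")
#     Node("animals", parent=root); Node("vehicles", parent=root)
#     init = initialization.pca(X_train, random_state=42)
#     Z = hc_tsne(X_train, initialization=init, tree=root, alpha=1e-3)
#     Z_test = Z.transform(X_test)  # standard openTSNE embedding API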
Example #5
def compute_tsne(A):
    adata = A.copy()

    tsne = TSNE(perplexity=30,
                metric="euclidean",
                callbacks=None,
                n_jobs=10,
                random_state=42,
                n_iter=750)
    adata.varm['TSNE10'] = tsne.fit(adata.varm['TSVD'])

    return adata
Example #6
def tsne(x, n=100000):
    from openTSNE import TSNE
    from openTSNE.callbacks import ErrorLogger

    x_in = x[:n, :]
    tsne = TSNE(
        perplexity=500,
        metric="euclidean",
        callbacks=ErrorLogger(),
        n_iter=2000,
        n_jobs=4,
    )
    x_embedded = tsne.fit(x_in)
    return x_embedded
Example #7
def fetch_algorithm(self):
    if self.algorithm == "umap":
        return umap.UMAP(random_state=42)
    return TSNE(random_state=42)
Example #8
    def setproyection(self, proyection_type="TSNE", **kwargs):
        r"""
        Compute a projection of the data.

        Parameters
        ----------
        proyection_type: str
            Projection type; there are three options: TSNE, implemented with
            openTSNE; skTSNE, implemented by sklearn; and PCA, implemented by
            sklearn.
        kwargs: dict
            Arguments for the projection (perplexity, etc.)
        """
        if self.emb.shape[1] == 2:
            X_proyected = self.emb.values
        elif proyection_type == "PCA":
            X_proyected = PCA(n_components=2, **kwargs).fit_transform(self.emb)
        elif proyection_type == "skTSNE":
            X_proyected = skTSNE(n_components=2,
                                 **kwargs).fit_transform(self.emb)
        elif proyection_type == "TSNE":
            X_proyected = pd.DataFrame(TSNE(n_components=2, n_jobs=8,
                                            **kwargs).fit(self.emb.values),
                                       index=self.emb.index,
                                       columns=["xdim", "ydim"])
        else:
            raise ValueError(f"Unknown proyection_type: {proyection_type}")

        self.ids = self.emb.index
        self.proyected = pd.DataFrame(X_proyected,
                                      columns=["xdim", "ydim"],
                                      index=self.ids)
Example #9
    def get_embedding_code(self, widgets):
        params = self.get_current_params(widgets)
        params['n_iter'] = int(widgets['_iteration'].value)
        tsne = TSNE(**params, min_grad_norm=0)

        # Since TSNE is a subclass of sklearn's BaseEstimator, repr(tsne)
        # provides the code to reproduce the resulting embedding. We remove
        # whitespace and linebreaks so we can later break the text as we like.
        expression = repr(tsne)
        expression = re.sub('\n', '', expression)
        expression = re.sub(' +', ' ', expression)

        prefix = 'tsne = '
        assignment_line = prefix + expression

        chars_until_params = len(prefix) + len('TSNE(')
        tw = TextWrapper(subsequent_indent=' ' * chars_until_params, width=80)
        assignment_line = tw.fill(assignment_line)

        code = ('# pip install openTSNE\n'
                'from openTSNE import TSNE\n'
                f'{assignment_line}\n'
                'tsne.fit(X)')

        # return as IPython.display.Code
        # -> repr will print the code in fixed width font
        # -> str will print the actual string containing \n
        return Code(code)
Example #10
class OpenTsne(Transformer):
    """
    This transformer transforms all vectors in an [EmbeddingSet][whatlies.embeddingset.EmbeddingSet]
    by means of t-SNE. This implementation uses
    [open-tsne](https://opentsne.readthedocs.io/en/latest/tsne_algorithm.html).

    Important:
        OpenTSNE is a faster variant of TSNE but it only allows for at most 2 components.
        You may also notice that it is relatively slow. This is, unfortunately, a fact of TSNE.

        This embedding transformation might require you to manually install extra dependencies
        unless you installed via either:

        ```
        pip install whatlies[opentsne]
        pip install whatlies[all]
        ```

    Arguments:
        n_components: the number of components to create/add
        kwargs: keyword arguments passed to the OpenTsne implementation, includes things like `perplexity` [link](https://opentsne.readthedocs.io/en/latest/api/index.html)

    Usage:

    ```python
    from whatlies.language import SpacyLanguage
    from whatlies.transformers import OpenTsne

    words = ["prince", "princess", "nurse", "doctor", "banker", "man", "woman",
             "cousin", "neice", "king", "queen", "dude", "guy", "gal", "fire",
             "dog", "cat", "mouse", "red", "blue", "green", "yellow", "water",
             "person", "family", "brother", "sister"]

    lang = SpacyLanguage("en_core_web_md")
    emb = lang[words]

    emb.transform(OpenTsne(2)).plot_interactive_matrix('tsne_0', 'tsne_1')
    ```
    """

    def __init__(self, n_components=2, **kwargs):
        super().__init__()
        self.n_components = n_components
        self.kwargs = kwargs
        self.tfm = TSNE(n_components=n_components, **kwargs)

    def fit(self, embset):
        names, X = embset.to_names_X()
        self.emb = self.tfm.fit(X)
        self.is_fitted = True
        return self

    def transform(self, embset):
        names, X = embset.to_names_X()
        new_vecs = np.array(self.emb.transform(X))
        names_out = names + [f"tsne_{i}" for i in range(self.n_components)]
        vectors_out = np.concatenate([new_vecs, np.eye(self.n_components)])
        new_dict = new_embedding_dict(names_out, vectors_out, embset)
        return EmbeddingSet(new_dict, name=f"{embset.name}.tsne_{self.n_components}()")
Example #11
def __init__(self, n_components=None, random_state=None,
             initialization="pca", perplexity=30, n_jobs=6):
    self.n_components = n_components
    self.random_state = random_state
    self.tsne = OpenTSNE(n_components=self.n_components,
                         random_state=self.random_state,
                         initialization=initialization,
                         perplexity=perplexity,
                         n_jobs=n_jobs)
Example #12
from sklearn.decomposition import PCA
from openTSNE import TSNE as OTSNE
from openTSNE.callbacks import ErrorLogger


def reduce_dimension(embeddings, reduction='pca'):
    if reduction == 'pca':
        pca = PCA(n_components=2)
        embeddings = pca.fit_transform(embeddings)
    elif reduction == 'tsne':
        otsne = OTSNE(initialization='pca',
                      n_jobs=8,
                      callbacks=ErrorLogger(),
                      negative_gradient_method='bh')
        embeddings = otsne.fit(embeddings)
    elif reduction == 'none':
        pass
    else:
        raise ValueError(f"unknown reduction: {reduction}")
    return embeddings
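# Minimal usage sketch (assumption, not from the source): reduce synthetic
# 50-D embeddings to 2-D with either backend.
import numpy as np

emb = np.random.rand(200, 50)
emb_pca = reduce_dimension(emb, reduction='pca')
emb_tsne = reduce_dimension(emb, reduction='tsne')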
Example #13
from openTSNE import TSNE


class TSNEWrapper:
    def __init__(self, params, random_seed):
        self.tsneer = TSNE(n_components=params['embed_dim'],
                           random_state=random_seed)

    def fit(self, data):
        self.embedding = self.tsneer.fit(data)

    def transform(self, data):
        return self.embedding.transform(data)
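# Usage sketch (assumption, not from the source): fit on training data, then
# map held-out points into the same embedding via openTSNE's
# TSNEEmbedding.transform().
import numpy as np

wrapper = TSNEWrapper({'embed_dim': 2}, random_seed=42)
wrapper.fit(np.random.rand(300, 20))
test_2d = wrapper.transform(np.random.rand(10, 20))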
Example #14
    def run_transformation(self, X, y, transformation_params, callback):
        class CallbackAdapter:
            def __init__(self, callback, early_exaggeration_iter):
                self.callback = callback
                self.exaggeration_phase = early_exaggeration_iter > 0
                self.early_exaggeration_iter = early_exaggeration_iter

            def __call__(self, iteration, error, embedding):
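                # openTSNE restarts its iteration counter after the early
                # exaggeration phase; this adapter offsets post-phase counts
                # so consumers see one monotonically increasing iteration.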
                if not self.exaggeration_phase:
                    iteration += self.early_exaggeration_iter
                if self.exaggeration_phase and iteration == self.early_exaggeration_iter:
                    self.exaggeration_phase = False

                self.callback(
                    'embedding', iteration,
                    dict(embedding=embedding.view(np.ndarray),
                         error_metrics=dict(kl_divergence=error)))

        callback_adapter = CallbackAdapter(
            callback, transformation_params['early_exaggeration_iter'])

        tsne = TSNE(
            **transformation_params,
            min_grad_norm=0,  # never stop
            n_iter=10000000,  # TODO
            callbacks=callback_adapter,
            callbacks_every_iters=1)

        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=NumbaWarning)
            callback(
                'start', 0,
                dict(error_metrics=[
                    dict(name='kl_divergence', label='KL divergence:')
                ]))
            callback('status', 0, dict(message='Initializing TSNE'))
            tsne.fit(X)
Example #15
        def calculate_dim_red():
            self.embedding_train = None
            sc.pp.highly_variable_genes(self.data, n_top_genes=500)
            sc.pp.pca(self.data, n_comps=self.n_comps, zero_center=True)
            X_pca = self.data.obsm['X_pca']
            tSNE_init = X_pca[:, :2]
            print('feature selection and PCA compression finished ')

            if self.UMAP:
                import umap
                reducer = umap.UMAP(n_components=n_components)
                X_embedded = reducer.fit_transform(X_pca)
                self.results['UMAP1'] = X_embedded[:, 0].tolist()
                if n_components == 2:
                    self.results['UMAP2'] = X_embedded[:, 1].tolist()
                print('UMAP finished')

            if self.tSNE:
                from openTSNE import TSNE
                from openTSNE.callbacks import ErrorLogger

                tsne = TSNE(perplexity=30,
                            callbacks=ErrorLogger(),
                            initialization='pca',
                            random_state=42,
                            early_exaggeration_iter=50,
                            n_components=2)

                embedding_train = tsne.fit(X_pca)
                self.embedding_train = embedding_train

                self.results['tSNE1'] = embedding_train.T[0].tolist()
                self.results['tSNE2'] = embedding_train.T[1].tolist()
                print('tSNE finished')
            return self.data, self.results
Example #16
class TSNEm:
    def __init__(self, n_components=None, random_state=None,
                 initialization="pca", perplexity=30, n_jobs=6):
        self.n_components = n_components
        self.random_state = random_state
        self.tsne = OpenTSNE(n_components=self.n_components,
                             random_state=self.random_state,
                             initialization=initialization,
                             perplexity=perplexity,
                             n_jobs=n_jobs)

    def fit_transform(self, X):
        embeddings = self.tsne.fit(X)
        self.embeddings = embeddings
        return embeddings

    def transform(self, x):
        return self.embeddings.transform(x)
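# Usage sketch (assumption): OpenTSNE here refers to openTSNE's TSNE class,
# e.g. imported as below.
import numpy as np
from openTSNE import TSNE as OpenTSNE

tsnem = TSNEm(n_components=2, random_state=0)
train_2d = tsnem.fit_transform(np.random.rand(250, 16))
test_2d = tsnem.transform(np.random.rand(5, 16))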
Example #17
def dimension_reduction(data):
    #get true labels
    m, n = data.obs.shape
    if n > 0:
        labels = pd.unique(data.obs.iloc[:, n - 1])
    else:
        labels = pd.unique(data.obs.index)
    if len(labels) != 0:
        num_cluster = len(labels)
    else:
        num_cluster = m
    #map colors
    cmap = plt.get_cmap('Spectral')
    colors = [cmap(i) for i in np.linspace(0, 1, num_cluster)]
    color_list = []
    for i in range(m):
        if n > 0:
            color_list.append(
                colors[np.where(data.obs.iloc[i, n - 1] == labels)[0][0]])
        else:
            color_list.append(colors[i])

    print("Preprocessing: Executing Dimension Reduction...")
    #TSNE
    from openTSNE import TSNE
    tsne_embedded = TSNE().fit(data.X)
    fig = plt.figure(figsize=(16, 7))
    warnings.filterwarnings("ignore", module="matplotlib")
    plt.scatter(tsne_embedded[:, 0], tsne_embedded[:, 1], c=color_list, s=1.5)
    plt.title('t-SNE visualization')

    #UMAP
    import umap
    umap_embedded = umap.UMAP(n_neighbors=5,
                              min_dist=0.3,
                              metric='correlation').fit_transform(data.X)
    fig = plt.figure(figsize=(16, 7))
    plt.scatter(umap_embedded[:, 0], umap_embedded[:, 1], c=color_list, s=1.5)
    plt.title('UMAP visualization')
    return tsne_embedded, umap_embedded
Example #18
File: views.py Project: i2v2y/BME498
def dimension_reduction(data):
    #get true labels
    m, n = data.obs.shape
    if n > 0:
        labels = pd.unique(data.obs.iloc[:, n - 1])
    else:
        labels = pd.unique(data.obs.index)
    if len(labels) != 0:
        num_cluster = len(labels)
    else:
        num_cluster = m

    #TSNE
    from openTSNE import TSNE
    tsne_embedded = TSNE().fit(data.X)

    #UMAP
    import umap
    umap_embedded = umap.UMAP(n_neighbors=5,
                              min_dist=0.3,
                              metric='correlation').fit_transform(data.X)

    return tsne_embedded, umap_embedded
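# Usage sketch (assumption, not from the source): any AnnData with a numeric
# X matrix works; labels are read from the last .obs column when present.
# Assumes the usual pandas/numpy imports are already in scope.
import anndata
import numpy as np

adata = anndata.AnnData(np.random.rand(100, 40))
tsne_emb, umap_emb = dimension_reduction(adata)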
Example #19
    # seed = i + 41
    seed = seed_lst[i]
    # seed = 42

    start1 = time.time()
    reducer = umap.UMAP(metric='precomputed', n_neighbors=k, random_state=seed)
    embedding_hub = reducer.fit_transform(X)
    elapsed_time1 = time.time() - start1

    start2 = time.time()
    reducer = umap.UMAP(n_neighbors=k, random_state=seed)
    embedding_org = reducer.fit_transform(X)
    elapsed_time2 = time.time() - start2

    start3 = time.time()
    embedding_TSNE = TSNE().fit(X)
    elapsed_time3 = time.time() - start3

    emb_org_list.append(embedding_org)
    emb_hub_list.append(embedding_hub)

    time_org_list.append(elapsed_time2)
    time_hub_list.append(elapsed_time1)
    print('org: ', elapsed_time2)
    print('hub: ', elapsed_time1)
    print('TSNE:', elapsed_time3)

time_org = np.array(time_org_list)
time_hub = np.array(time_hub_list)

mean_time_org = np.mean(time_org)
Example #20
from sklearn.decomposition import PCA
from openTSNE import TSNE


def _tsne_projection(data, num_tsne_components=2, num_pca_components=50):
    pca = PCA(n_components=num_pca_components)  # PCA first to speed up the t-SNE
    pca_data = pca.fit_transform(data)
    tsne = TSNE(n_components=num_tsne_components)
    data_embedded = tsne.fit(pca_data)
    return data_embedded
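# Usage sketch (assumption, not from the source): the input needs at least
# `num_pca_components` feature columns.
import numpy as np

points_2d = _tsne_projection(np.random.rand(300, 100))
print(points_2d.shape)  # (300, 2)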
Example #21
def get_results(data_path, model_path, different_layer, look_embedding, tsne,
                distance, random_mode, multihead):
    results = []
    tokenizer = tokenizer_class.from_pretrained(model_path)
    if different_layer:
        if multihead:
            configer = BertConfig.from_pretrained(model_path)
            configer.__setattr__('get_multihead', True)
            configer.__setattr__('output_hidden_states', True)
            model = model_class.from_pretrained(model_path, config=configer)
        else:
            model = model_class.from_pretrained(model_path,
                                                output_hidden_states=True)
        # Build the output filename from the model variant, the distance
        # metric, and whether the random baseline is used.
        if 'squad_bert_base' in model_path:
            model_tag = 'squad_finetune_'
        elif 'nq_bert_base' in model_path:
            model_tag = 'nq_finetune_'
        else:  # e.g. 'bert-base-uncased'
            model_tag = ''
        random_tag = 'random_' if random_mode else ''
        dist_tag = 'cos_2' if distance == 'cos' else 'euc'
        output_path = (data_path.split('.')[0] +
                       f'diff_{random_tag}res_{model_tag}{dist_tag}.txt')
    else:
        model = model_class.from_pretrained(model_path)
        output_path = data_path.split('.')[0] + '_res_squad_finetune_euc.txt'
    with open(data_path, 'r', encoding='utf-8') as f:
        data = json.load(f)['data']
        print(len(data))
    count = 0
    #two_embeddings = []
    for qp_pair in tqdm(data):
        question, query_idx, paragraph, para_idx = get_query_para(qp_pair)
        count += 1
        if random_mode:
            if 'WSC' in data_path or 'coreference' in data_path:
                random_index_1, random_index_2 = get_random_para_WSC(
                    paragraph, tokenizer)
                result = bert(question, random_index_1, paragraph,
                              random_index_2, model, tokenizer,
                              different_layer, look_embedding, tsne, distance,
                              multihead)
            elif 'sentence_ranking' in data_path:
                random_para, random_index = get_random_para(
                    data, question, tokenizer)
                result = bert(question, query_idx, paragraph, para_idx, model,
                              tokenizer, different_layer, look_embedding, tsne,
                              distance, multihead, random_para, random_index)
            else:
                random_para, random_index = get_random_para(
                    data, question, tokenizer)
                result = bert(question, query_idx, paragraph, para_idx, model,
                              tokenizer, different_layer, look_embedding, tsne,
                              distance, multihead, random_para, random_index)
        else:
            result = bert(question, query_idx, paragraph, para_idx, model,
                          tokenizer, different_layer, look_embedding, tsne,
                          distance, multihead)
        if not result:
            print(count)
            continue
        else:
            results.append(result[0])
            #two_embeddings.append(result[1])
    # import pickle
    # pickle.dump(two_embeddings,open('/data/caijie/analyse_bert/data/probing/bigram_shift/two_embedding.pickle','wb'))

    if tsne:
        from openTSNE import TSNE
        import pickle
        pickle.dump(
            np.array(tsne_arrays),
            open(
                '/data/home/t-jicai/caijie/analyse_bert/embedding_vector_tsne.pickle',
                'wb'))
        res = TSNE().fit(np.array(tsne_arrays))
        with open(
                '/data/home/t-jicai/caijie/analyse_bert/embedding_vector_tsne.txt',
                'w',
                encoding='utf-8') as fout:
            for r in res:
                fout.write(str(r))
                fout.write('\n')
    with open(output_path, 'w', encoding='utf-8') as fout:
        for res in results:
            fout.write(str(res))
            fout.write('\n')
Example #22
from sklearn.datasets import load_digits
from openTSNE import TSNE
from matplotlib import pyplot as plt

digits = load_digits()
X, y = digits["data"], digits["target"]

embedding = TSNE().fit(X)
print(embedding[:5])  # peek at the first few 2-D coordinates

target_ids = range(len(digits.target_names))

plt.figure(figsize=(6, 5))
colors = 'r', 'g', 'b', 'c', 'm', 'y', 'k', 'w', 'orange', 'purple'
for i, c, label in zip(target_ids, colors, digits.target_names):
    plt.scatter(embedding[y == i, 0], embedding[y == i, 1], c=c, label=label)
plt.legend()
plt.show()
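# A short extension (assumption, not in the original example): an openTSNE
# embedding can also place unseen samples into the existing map.
held_out = X[:10]                    # pretend these are new samples
new_points = embedding.transform(held_out)
print(new_points.shape)              # (10, 2)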
Example #23
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from openTSNE import TSNE
from openTSNE.callbacks import ErrorLogger
from openTSNE import utils

df = pd.read_csv("train.csv")
df = df[:100]
label = df.label
df.drop("label", axis=1, inplace=True)
standardized_data = StandardScaler().fit_transform(df)
print(standardized_data.shape)

tsne = TSNE(
    perplexity=30,
    metric="euclidean",
    callbacks=ErrorLogger(),
    n_jobs=8,
    random_state=42,
)

embedding_train = tsne.fit(standardized_data)
utils.plot(embedding_train, label, colors=utils.MACOSKO_COLORS)
Example #24
def make_data_faster(dataset_shortname):
    k_folder = '/home/single_cell_analysis/kallisto_out_single_bustools_dev/kallisto_' + dataset_shortname
    if dataset_shortname in ["pbmc_1k_v3", "pbmc_10k_v3", "neuron_10k_v3"]:
        dataset_shortname = dataset_shortname.split(
            "_")[0] + dataset_shortname.split(
                "_")[1] + "_" + dataset_shortname.split("_")[2]
    c_folder = '/home/single_cell_analysis/cellranger_out/cellranger3_' + dataset_shortname + '_out/outs/filtered_feature_bc_matrix'
    c_raw_folder = '/home/single_cell_analysis/cellranger_out/cellranger3_' + dataset_shortname + '_out/outs/raw_feature_bc_matrix'

    c_raw = anndata.AnnData(
        scipy.io.mmread(os.path.join(c_raw_folder, 'matrix.mtx.gz')).tocsr().T)
    c_barcodes = pd.read_csv(os.path.join(c_raw_folder, 'barcodes.tsv.gz'),
                             index_col=0,
                             header=None,
                             names=['barcode'])
    c_barcodes.index = c_barcodes.index.str.slice(0, 16, 1)
    c_raw.obs = c_barcodes
    c_raw.var = pd.read_csv(os.path.join(c_raw_folder, 'features.tsv.gz'),
                            header=None,
                            index_col=0,
                            names=['ensembl_id', 'gene_name', 'kind'],
                            sep='\t')
    print('Loaded c raw mtx:', c_raw.X.shape)

    del c_barcodes

    # load c filtered matrix
    c = anndata.AnnData(
        scipy.io.mmread(os.path.join(c_folder, 'matrix.mtx.gz')).tocsr().T)
    c_barcodes = pd.read_csv(os.path.join(c_folder, 'barcodes.tsv.gz'),
                             index_col=0,
                             header=None,
                             names=['barcode'])
    c_barcodes.index = c_barcodes.index.str.slice(0, 16, 1)
    c.obs = c_barcodes
    c.var = pd.read_csv(os.path.join(c_folder, 'features.tsv.gz'),
                        header=None,
                        index_col=0,
                        names=['ensembl_id', 'gene_name', 'kind'],
                        sep='\t')
    print('Loaded c filtered mtx:', c.X.shape)

    del c_barcodes

    ## load kallisto raw matrix
    k_raw = anndata.AnnData(
        scipy.io.mmread(os.path.join(k_folder, 'genes.mtx')).tocsr())
    k_raw.obs = pd.read_csv(os.path.join(k_folder, 'genes.barcodes.txt'),
                            index_col=0,
                            header=None,
                            names=['barcode'])
    k_raw.var = pd.read_csv(os.path.join(k_folder, 'genes.genes.txt'),
                            header=None,
                            index_col=0,
                            names=['ensembl_id'],
                            sep='\t')
    print('Loaded k raw mtx:', k_raw.X.shape)

    # truncates the Ensembl version number off the kallisto labels
    k_raw.var['full_emsembl_id'] = k_raw.var.index
    k_raw.var.index = k_raw.var['full_emsembl_id'].str.slice(0, 18)

    if dataset_shortname in ['hgmm1k_v2', 'hgmm1k_v3', 'hgmm10k_v3']:
        k_raw.var.index = k_raw.var['full_emsembl_id']

    # do this as late as possible
    k = k_raw[c.obs.index.values]
    print('Loaded k filtered mtx:', k.X.shape)

    c_raw.obs['counts'] = c_raw.X.sum(1)
    c_raw.obs['ngenes'] = np.array((c_raw.X > 0).sum(1))
    c_raw = c_raw[c_raw.obs['counts'] > 0]
    c_raw.layers['log1p'] = np.log1p(c_raw.X)
    c_raw.obs['log10counts'] = np.log10(c_raw.obs['counts'])
    print('Cell Ranger raw:', c_raw.shape)

    # count UMIs, genes, log transform raw kallisto barcodes
    # first remove kallisto barcodes with 0 gene counts

    k_raw.obs['counts'] = k_raw.X.sum(1)
    k_raw.obs['ngenes'] = np.array((k_raw.X > 0).sum(1))
    k_raw = k_raw[k_raw.obs['counts'] > 0]
    k_raw.layers['log1p'] = np.log1p(k_raw.X)
    k_raw.obs['log10counts'] = np.log10(k_raw.obs['counts'])
    print('kallisto raw:', k_raw.shape)

    c.obs['counts'] = c.X.sum(1)
    c.obs['ngenes'] = np.array((c.X > 0).sum(1))
    c = c[c.obs['counts'] > 0]
    c.layers['log1p'] = np.log1p(c.X)
    c.obs['log10counts'] = np.log10(c.obs['counts'])
    print('Cell Ranger filtered:', c.shape)

    # count UMIs, genes, log transform filtered kallisto barcodes
    # first remove kallisto barcodes with 0 gene counts

    k.obs['counts'] = k.X.sum(1)
    k.obs['ngenes'] = np.array((k.X > 0).sum(1))
    k = k[k.obs['counts'] > 0]
    k.layers['log1p'] = np.log1p(k.X)
    k.obs['log10counts'] = np.log10(k.obs['counts'])
    print('kallisto filtered:', k.shape)

    joint_obs = k_raw.obs.join(c_raw.obs,
                               how='outer',
                               lsuffix='-kallisto',
                               rsuffix='-tenx')
    joint_obs = joint_obs.fillna(0)
    print('Total barcodes seen')
    print(len(joint_obs))

    # barcodes seen by both
    common_obs = k_raw.obs.join(c_raw.obs,
                                how='inner',
                                lsuffix='-kallisto',
                                rsuffix='-tenx')
    print('Barcodes seen by both')
    print(len(common_obs))

    kobs = k_raw.obs.join(c_raw.obs,
                          how='left',
                          lsuffix='-kallisto',
                          rsuffix='-tenx')
    kobs = kobs.sort_values(by=['counts-kallisto'], ascending=False)
    print('Barcodes seen by kallisto missed by Cell Ranger')
    print(len(joint_obs) - len(kobs))

    # just Cell Ranger observations
    tobs = c_raw.obs.copy()
    tobs = tobs.sort_values('counts', ascending=False)
    print('Barcodes seen by Cell Ranger missed by kallisto')
    print(len(joint_obs) - len(tobs))

    # ## Compute correlations between kallisto and Cell Ranger
    # handy and fast function for computing correlation on sparse matrices
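    # Both helpers rely on the unnormalized identity
    #   n * sum(x^2) - (sum(x))^2 = n^2 * var(x),
    # so standard deviations and covariances come out scaled by n, and the
    # scale cancels in the final correlation ratio.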
    def sparse_M_std(X):
        n = X.shape[1]
        return np.sqrt(n * X.multiply(X).sum(1) -
                       np.multiply(X.sum(1), X.sum(1)))

    def sparse_M_corr(X, Y):
        X_std = sparse_M_std(X)
        Y_std = sparse_M_std(Y)
        XY_std = np.multiply(X_std, Y_std)

        n = X.shape[1]
        XY_cov = n * X.multiply(Y).sum(1) - np.multiply(X.sum(1), Y.sum(1))
        R = np.divide(XY_cov, XY_std)
        return np.squeeze(np.asarray(R))

    raw_counts_correlation = sparse_M_corr(
        k_raw[common_obs.index].layers['log1p'],
        c_raw[common_obs.index].layers['log1p'])
    filtered_counts_correlation = sparse_M_corr(
        k_raw[c.obs.index].layers['log1p'], c_raw[c.obs.index].layers['log1p'])
    print('Correlations computed!')

    tsvd = TruncatedSVD(n_components=10)
    TSVD = tsvd.fit_transform(k.layers['log1p'])
    k.obsm['TSVD'] = TSVD
    print('TSVD variance ratios:\n', list(tsvd.explained_variance_ratio_))
    print(datetime.datetime.now())

    tsvd = TruncatedSVD(n_components=10)
    TSVD = tsvd.fit_transform(c.layers['log1p'])
    c.obsm['TSVD'] = TSVD
    print('TSVD variance ratios:\n', list(tsvd.explained_variance_ratio_))
    print(datetime.datetime.now())

    print('Calculating L1 distances...')

    # taking manhattan distance between matrices
    dnck = manhattan_distances(c.layers['log1p'], k.layers['log1p'])
    dnkk = manhattan_distances(k.layers['log1p'], k.layers['log1p'])
    print(datetime.datetime.now())

    # nck are the kallisto-Cell Ranger distances
    nck = np.diagonal(dnck)

    # nkk are the nearest-neighbor kallisto-kallisto distances
    nkk = []
    for row in dnkk:
        val = np.partition(row, 1)[1]
        nkk.append(val)
    print('L1 distances done!')
    print(datetime.datetime.now())

    print('Doing t-SNE')
    print(datetime.datetime.now())
    tsne = TSNE(perplexity=30,
                metric="euclidean",
                callbacks=openTSNE.callbacks.ErrorLogger(),
                n_jobs=8,
                random_state=42,
                n_iter=750)
    k.obsm['TSNE10'] = tsne.fit(k.obsm['TSVD'])
    print('kallisto TSNE-10 done.')
    print(datetime.datetime.now())

    # Perform TSNE on top 10 truncated SVD components of Cell Ranger filtered matrix

    print('Doing t-SNE on top 10 PC for Cell Ranger')
    #
    print(datetime.datetime.now())
    tsne = TSNE(perplexity=30,
                metric="euclidean",
                callbacks=openTSNE.callbacks.ErrorLogger(),
                n_jobs=8,
                random_state=42,
                n_iter=750)
    c.obsm['TSNE10'] = tsne.fit(c.obsm['TSVD'])
    print('Cell Ranger TSNE-10 done.')
    print(datetime.datetime.now())

    c_raw.write(
        os.path.join("./write_data/" + dataset_shortname + '_tenx_raw.h5ad'))
    k_raw.write(
        os.path.join("./write_data/" + dataset_shortname +
                     '_kallisto_raw.h5ad'))
    k.write(
        os.path.join("./write_data/" + dataset_shortname + '_kallisto.h5ad'))
    c.write(os.path.join("./write_data/" + dataset_shortname + '_tenx.h5ad'))

    with open(os.path.join("./write_data/" + dataset_shortname + '_kobs.pkl'),
              'wb') as handle:
        pickle.dump(kobs, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(os.path.join("./write_data/" + dataset_shortname + '_tobs.pkl'),
              'wb') as handle:
        pickle.dump(tobs, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(
            os.path.join("./write_data/" + dataset_shortname +
                         '_common_obs.pkl'), 'wb') as handle:
        pickle.dump(common_obs, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(
            os.path.join("./write_data/" + dataset_shortname +
                         '_joint_obs.pkl'), 'wb') as handle:
        pickle.dump(joint_obs, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(os.path.join("./write_data/" + dataset_shortname + '_nkk.pkl'),
              'wb') as handle:
        pickle.dump(nkk, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(os.path.join("./write_data/" + dataset_shortname + '_nck.pkl'),
              'wb') as handle:
        pickle.dump(nck, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(
            os.path.join("./write_data/" + dataset_shortname +
                         '_raw_counts_correlation.pkl'), 'wb') as handle:
        pickle.dump(raw_counts_correlation,
                    handle,
                    protocol=pickle.HIGHEST_PROTOCOL)

    with open(
            os.path.join("./write_data/" + dataset_shortname +
                         '_filtered_counts_correlation.pkl'), 'wb') as handle:
        pickle.dump(filtered_counts_correlation,
                    handle,
                    protocol=pickle.HIGHEST_PROTOCOL)
Example #25
def __init__(self, params, random_seed):
    self.tsneer = TSNE(n_components=params['embed_dim'],
                       random_state=random_seed)
Example #26
File: plot.py Project: charfole/RECLE-Code
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.layouts import row
from bokeh.layouts import gridplot
from bokeh.models import BoxSelectTool, LassoSelectTool
from bokeh.plotting import figure, curdoc
TOOLS = "pan,wheel_zoom,box_select,lasso_select,reset"
output_notebook()

train_data = pd.read_csv(data_dict + 'train.csv')

# Read learned embedding
embed_data = pd.read_csv('xxx.csv')

# Dimension reduction
embedding = TSNE(perplexity=24, random_state=10).fit(embed_data.iloc[:, 1:])

data_group = {
    'vote==0': (train_data.iloc[:, 1] >= -1) & (train_data.iloc[:, 1] <= 0),
    '1<=vote<=2': (train_data.iloc[:, 1] >= 1) & (train_data.iloc[:, 1] <= 2),
    '3<=vote<=4': (train_data.iloc[:, 1] >= 3) & (train_data.iloc[:, 1] <= 4),
    'vote==5': (train_data.iloc[:, 1] >= 5) & (train_data.iloc[:, 1] <= 99),
}

p_1 = figure(tools=TOOLS,
             plot_width=400,
             plot_height=400,
             min_border=10,
             min_border_left=50,
             toolbar_location="above",
             title="1 <= votes <= 4")
Example #27
                    negative_gradient_method='bh')
embedding_umap = reducer.fit_transform(data_no_label)
t1 = time()
print('UMAP running time is: ' + str(t1 - t0) + ' s')

fig, ax = plt.subplots()
scatter = ax.scatter(
    embedding_umap[:, 0],
    embedding_umap[:, 1],
    c=[sns.color_palette(n_colors=20)[x] for x in label_group])
plt.axis('off')

# tSNE
t0 = time()
embedding_tsne = TSNE(n_components=2,
                      random_state=42,
                      n_jobs=-1,
                      negative_gradient_method='bh').fit(data_no_label)
t1 = time()
print('t-SNE running time is: ' + str(t1 - t0) + ' s')

fig, ax = plt.subplots()
scatter = ax.scatter(
    embedding_tsne[:, 0],
    embedding_tsne[:, 1],
    c=[sns.color_palette(n_colors=20)[x] for x in label_group])
plt.axis('off')

# MDS
t0 = time()
embedding_mds = MDS(n_components=2, n_jobs=-1).fit_transform(data_no_label)
t1 = time()
Example #28
            15.24742297, 23.48066375, 37.34107189, 58.27652395, 87.24048423,
            137.33961493, 211.00561713, 374.36120544, 576.90813121,
            983.37544116
        ])
        perplexity = np.arange(5, 55, 5)

        for i in range(len(learning_rate)):
            for j in range(len(perplexity)):

                # read data
                x, label = get_data(args.data)

                # run TSNE
                y = TSNE(n_components=args.dim,
                         perplexity=perplexity[j],
                         learning_rate=learning_rate[i],
                         n_jobs=-1,
                         verbose=True).fit(x)

                # save as csv
                path = os.path.join(os.getcwd(), "visualization", "public",
                                    "results", args.data)
                save_csv(path,
                         alg_name=f"tsne_{perplexity[j]}_{learning_rate[i]}",
                         data=y,
                         label=label)
    else:
        y = TSNE(n_components=args.dim,
                 perplexity=perplexity[j],
                 learning_rate=learning_rate[i],
                 n_jobs=-1,
Example #29
            legend_kwargs_.update(legend_kwargs)
        ax.legend(handles=legend_handles, **legend_kwargs_)

    matplotlib.pyplot.show()


if __name__ == '__main__':
    data_dir = "D:\\2020BUAA\dataset\JNU"
    pic_data = os.path.join(data_dir, "JNU_data_0-1.pk")

    with open(pic_data, 'rb') as file_1:
        txt_all_data = pickle.load(file_1)

    source_train_X, source_train_y = txt_all_data[0]
    source_val_X, source_val_y = txt_all_data[1]
    target_train_X, target_train_y = txt_all_data[2]
    target_val_X, target_val_y = txt_all_data[3]

    x, y = source_val_X, source_val_y

    tsne = TSNE(
        perplexity=30,
        n_iter=100,
        metric="euclidean",
        callbacks=ErrorLogger(),
        n_jobs=8,
        random_state=42,
    )
    embedding = tsne.fit(x)
    viz_plot(embedding, y, colors=MOUSE_10X_COLORS, draw_centers=False)
Example #30
redux = UMAP(n_components=10)

df.index = df['name']
df.drop(columns='name', inplace=True)

d = {}
for name, v in df.iterrows():
    v_ = [float(i) for i in v]
    vn = normalize(v_)
    d[name] = vn

# redux = UMAP(n_components=n_components)
# projection = redux.fit_transform(list(d.values()))

redux = TSNE(n_components=n_components)
projection = redux.fit(np.array(list(d.values())))

# dfmeta[dfmeta.id == '19142e05-7365-4b55-abcc-9ba0dec235d2'].country__autocolor.item()

with open('embedding.csv', 'w+') as out:
    header = [f'c{str(i+1)}' for i in range(n_components)]
    out.write('name,country,study,' + ','.join(header) + '\n')  # header
    for leaf, v in zip(d.keys(), projection):
        study = dfmeta[dfmeta.id == leaf].study__autocolor.item()
        country = dfmeta[dfmeta.id == leaf].country__autocolor.item()
        out.write(
            f'{leaf},{country},{study},{",".join([str(i) for i in v])}\n')

with open('embedding_raw.csv', 'w+') as out:
    header = [f'c{str(i+1)}' for i in range(n_components)]