Example #1
    def tsne_reduce(self, data):
        """
		Use Barnes-Hut approximation of t-SNE to reduce dimensionality
		of features from N-dim to 2D for plotting.
		"""
        map2d = bhtsne.run_bh_tsne(data,
                                   no_dims=self.opts.n_dims,
                                   max_iter=self.opts.n_iters,
                                   use_pca=False,
                                   verbose=True)

        return map2d
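
A minimal usage sketch for the method above, assuming a hypothetical host class whose opts attribute carries n_dims and n_iters (PlotOpts and FeatureEmbedder are illustrative names, not part of the original snippet):

import numpy as np
import bhtsne

class PlotOpts:
    # hypothetical options holder standing in for self.opts
    n_dims = 2      # reduce to 2D for plotting
    n_iters = 1000  # t-SNE gradient-descent iterations

class FeatureEmbedder:
    def __init__(self):
        self.opts = PlotOpts()

    def tsne_reduce(self, data):
        # same call as the method above
        return bhtsne.run_bh_tsne(data,
                                  no_dims=self.opts.n_dims,
                                  max_iter=self.opts.n_iters,
                                  use_pca=False,
                                  verbose=True)

features = np.random.rand(200, 64)               # 200 samples, 64-dim features
map2d = FeatureEmbedder().tsne_reduce(features)  # -> (200, 2) array for plotting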
Example #2
def run_tsne(features_file, colors_file, output_prefix,
             filter_sample=[],
             filter_cluster=[],
             lst=[],
             draw_per=1.0,
             iter=1000,
             perplexity=50):
    # read data
    data_df = pd.read_table(features_file, header=None)
    cluster_colors = pd.read_table(colors_file, header=None)
    print(data_df.head())

    # make dataframe pretty
    cluster_colors = cluster_colors.rename(columns={1:'color'})
    cluster_colors["color"] = [int(extract_num.findall(str(x))[0]) for x in cluster_colors["color"].tolist()]
    print(cluster_colors.head())
    #cluster_colors = cluster_colors.rename(columns={0:0})

    # filter by samples
    if len(filter_sample) > 0:
        filter1 = []
        for x in cluster_colors[0].tolist():
            for it in filter_sample:
                st = "sample" + it + "-"
                if x.startswith(st):
                    filter1.append(x)
        cluster_colors = cluster_colors[cluster_colors[0].isin(filter1)]

    # filter by percent
    if draw_per < 1:
        clusters = divide_by_cluster(cluster_colors[0].tolist(), cluster_colors["color"].tolist())
        filter2 = take_first_per(clusters, lst)
        s = set(filter2)
        lst_new = []
        for n in lst:
            for x in cluster_colors[0].tolist():
                if x.startswith(n):
                    print(x)
                    lst_new.append(x)
                    if x not in s:
                        filter2.append(x)
        lst = lst_new
        cluster_colors = cluster_colors[cluster_colors[0].isin(filter2)]


    # merge data
    mapped = pd.merge(cluster_colors, data_df, on=0)

    # filter by length
    mapped["length"] = [int(x.split("_")[3]) for x in mapped[0].tolist()]
    mapped = mapped[mapped["length"] > 2000]
    print(mapped)

    # normalize like in CONCOCT: add a 1/length pseudo-count, then scale
    # each feature column and each contig row to sum to one, and take the log
    data = mapped[mapped.columns[2:-1]].to_numpy()

    v = (1.0 / mapped["length"]).to_numpy()[:, np.newaxis]
    data = data + v
    along_Y = np.apply_along_axis(sum, 0, data)
    data = data / along_Y[None, :]   # normalize feature columns
    along_X = np.apply_along_axis(sum, 1, data)
    data = data / along_X[:, None]   # normalize contig rows
    data = np.log(data)
    #print(data)

    embedding_array = bhtsne.run_bh_tsne(data, initial_dims=data.shape[1], perplexity=perplexity, max_iter=iter)
    mapped["x"] = embedding_array[:, 0]
    mapped["y"] = embedding_array[:, 1]

    # draw result of TSNE on scatter plot

    pp = PdfPages(output_prefix)


    # filter clusters to show
    fc = filter_cluster
    if len(fc) > 0:
        filtered = mapped[mapped["color"].isin(fc)]
        #mapped = filtered
    else:
        filtered = mapped

    fig = pyplot.figure()

    # draw scatter plot
    color = mapped["color"].tolist()
    mx_color = max(color)
    # cm.spectral was removed from matplotlib; nipy_spectral is its successor
    pyplot.scatter(mapped["x"].tolist(), mapped["y"].tolist(), c=[cm.nipy_spectral(float(i) / mx_color) for i in color])

    # make a legend for specific clusters
    # find cluster centers
    x = filtered["x"].tolist()
    y = filtered["y"].tolist()
    mp = divide_by_color(x, y, filtered["color"].tolist())
    points, names = find_cluster_centers(mp)
    patches = []
    dcolors = list(set(color))
    for c in dcolors:
        if c in fc and len(fc) < 5:
            patches.append(mpatches.Patch(color=cm.nipy_spectral(float(c) / mx_color), label='C-' + str(c)))
    pyplot.legend(handles=patches)
    draw_points(points, names, fig)

    # mark specific points
    filtered = mapped[mapped[0].isin(lst)]
    pyplot.scatter(filtered["x"].tolist(), filtered["y"].tolist(), marker="p", edgecolors='black', c=[cm.nipy_spectral(float(i) / mx_color) for i in filtered["color"].tolist()])


    pyplot.title('Perp = '+ str(perplexity)+ ' Iter = ' + str(iter))
    pp.savefig()

    pp.close()
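
A hedged call sketch for run_tsne (file names are illustrative): both input files are tab-separated tables whose first column holds contig names such as sample1-NODE_1_length_12345_cov_7, which is what the sample filter and the length extraction above rely on.

run_tsne("features.tsv", "cluster_colors.tsv", "tsne_plot.pdf",
         filter_sample=["1", "2"],  # keep contigs named sample1-* / sample2-*
         draw_per=1.0,              # draw all points (no per-cluster subsampling)
         iter=1000,
         perplexity=50)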
Example #3
if maxima:
    print("Maxima! All the matrix to bhtsne...")
    Xmax_to_bhtsne = X_TFIDF.toarray()
ndms = 2
tht = 0.5
rndsd = 42
vrbs = True
prplxt_range = [30, 50]
for prplxt in prplxt_range:
    print('bhtsne params: perplexity %d and n_dims %d' % (prplxt, ndms))
    if reducto:
        X_embedded = np.asarray(
            list(
                bhtsne.run_bh_tsne(Xred_to_bhtsne,
                                   no_dims=ndms,
                                   perplexity=prplxt,
                                   theta=tht,
                                   randseed=rndsd,
                                   verbose=vrbs)))
        print("Plotting Reducto...")
        if kmers:
            file_fig = str(
                kmersize
            ) + "-kmersize_kmers_Reducto_Reg_True_length_" + str(
                prplxt) + "-per.png"
        else:
            file_fig = str(
                kmersize
            ) + "-kmersize_newtoken_Reducto_Reg_True_length_" + str(
                prplxt) + "-per.png"
        file_fig = os.path.join("./", file_fig)
        y_c = []
Example #4
import numpy as np
import pylab
import sys
from datetime import datetime

sys.path.append('./wrapper')
import bhtsne

print('Loading data...')
data = np.loadtxt("./data/mnist2500_X.txt")
labels = np.loadtxt("./data/mnist2500_labels.txt")
figfile = './results/bhtsne_demo.png'

print('Processing...')
start = datetime.now()
embedding_array = bhtsne.run_bh_tsne(data, initial_dims=data.shape[1])
end = datetime.now()
elapsedTime = (end - start).total_seconds()
print('Process time ', elapsedTime, 's')

pylab.scatter(embedding_array[:, 0], embedding_array[:, 1], 20, labels)
pylab.savefig(figfile)
pylab.show()
Example #5
print(pc_data.shape)

# DROPOUT
## sums = np.sum(pc_data**2, axis=1)
## print(min(sums))

## pc_data = pc_data[sums > .1, :]

## print(pc_data.shape)

# Y_file = 'bh_' + infile + '_out.txt'
# Y_samples = np.loadtxt(Y_file)

# DEBUG PRESENCE OF INITIAL SAMPLES
# weights = [1.0, 2.0, 3.0]
for i, p in enumerate(perps):
    embedded, betas, orig_densities, emb_densities = bhtsne.run_bh_tsne(
        pc_data,
        initial_dims=pc_data.shape[1],
        theta=0.3,
        verbose=True,
        perplexity=p,
        max_iter=1000,
        use_pca=False)

    print(embedded.shape, betas.shape)
    np.savetxt(file_root.format(infile, 'out', p), embedded)
    np.savetxt(file_root.format(infile, 'betas', p), betas)
    np.savetxt(file_root.format(infile, 'marg_origD', p), orig_densities)
    np.savetxt(file_root.format(infile, 'marg_embD', p), emb_densities)
Example #6
print("Clusterring with t-SNE")
'''Perform Barnes-hut t-SNE approximation on the encoded inputs
    first input is an NxD array, where N is the number of samples
    no_dims => Number of dimensions to reduce the data to.
    initial_dims => number of principle components to extract with PCA
    perplexity => 2^(shannon entropy). A fair dice with k sides has a perplexity of k.

    This is much faster and has a similar accuracy to the standard t-SNE
    PCA can be used to reduce the dimensionality before performing the clustering
        This speeds up the computation of pairwise distances and supresses some of the noise
'''
#Call the python wrapper for the c++ implementation
clusters, positions = run_bh_tsne(X,
                                  no_dims=2,
                                  perplexity=perplexity,
                                  verbose=True,
                                  initial_dims=50,
                                  use_pca=False,
                                  max_iter=max_iter)

print("position shape", positions[0].shape)
X_iter = np.dstack([position.reshape(-1, 2) for position in positions])

f, ax, sc, txts = scatter(X_iter[..., -1], y)


def make_frame_mpl(t):
    i = int(t * 40)
    x = X_iter[..., i]
    sc.set_offsets(x)
    for j, txt in zip(range(10), txts):
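
A quick numeric check of the perplexity definition quoted in the docstring above (a standalone sketch, not part of the original snippet): for a fair k-sided die the Shannon entropy is log2(k) bits, so the perplexity 2^H recovers k.

import numpy as np

k = 6
p = np.full(k, 1.0 / k)         # a fair k-sided die
H = -np.sum(p * np.log2(p))     # Shannon entropy in bits, equals log2(k)
assert np.isclose(2.0 ** H, k)  # perplexity = 2^H, so a fair die has perplexity k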
Example #7
"""

# query = """
# select vals.cartodb_id,geom.the_geom,
# geom.the_geom_webmercator,
# vals.total_pop,
# vals.bachelors_degree,
# vals.associates_degree,
# black_pop,
# white_pop,
# asian_pop,
# median_income
# from obs_fcd4e4f5610f6764973ef8c0c215b2e80bec8963 as geom,
# obs_b393b5b88c6adda634b2071a8005b03c551b609a as vals
# where geom.geoid=vals.geoid
# limit 100
# """

api_key = '893a45cc8505dfffe26d94b3c160a6fc1b1da459'
user = '******'

data = app.getCartoData(query, user, api_key)
print(data)
print(data.shape)
# as_matrix() was removed in pandas 1.0; to_numpy() is the replacement
result = bhtsne.run_bh_tsne(data.to_numpy(),
                            initial_dims=data.shape[1],
                            verbose=True,
                            perplexity=25)
# result = app.calcTSNE(data)
data.assign(x=result[:, 0], y=result[:, 1]).to_csv('temp.csv', index=False)
# -- END

Example #8
#check whether output file already exists
user_input = 'n'
if os.path.isfile(out_file):
    print('Output file already exists.')
    user_input = input('Do you want to replace file (y/n): ')
else:
    user_input = 'y'

if user_input == 'y':
    #load input data
    f = h5py.File(inputfile, 'r')  #r - read only
    modedata = np.array(f['stackedmodes'])
    f.close()
    print('Size of data: ' + str(modedata.shape))

    #perform tsne algorithm
    #modedata=modedata.astype('float64')
    space = bhtsne.run_bh_tsne(modedata,
                               verbose=True,
                               perplexity=perplexity,
                               initial_dims=5 * 10,
                               max_iter=5000)

    #Save the result
    f = h5py.File(out_file, 'w')
    tsne_s = f.create_dataset('space', (space.shape))
    tsne_s[...] = space
    f.close()
Example #9
                (x_train, x_test), (y_train, y_test) = \
                    mnist.load_fashion_mnist_data(
                        False,
                        len_sample=70000,
                        train_test_split=argp.freeze_index)

            # now, embed the train data:
            # load default initial gaussian embedding
            #_initial_embedding = get_initial_embedding(data_name=x_train, method_name="gaussian", i=1)
            if exact:
                """
                pass an additional theta=0.0 if running exact tSNE
                """
                # use initial embedding
                bh_tsne_dict = bhtsne.run_bh_tsne(x_train,
                                                  verbose=True,
                                                  initial_solution=None,
                                                  theta=0.0)

                # save results
                # timestamp
                timestamp = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
                bhtsne.write_bh_tsne_result(
                    bh_tsne_dict,
                    os.path.join(RESULT_DIR, algorithm_dir,
                                 FREEZE_INITIAL_DIR), "-", timestamp)
            else:
                # use initial embedding
                bh_tsne_dict = bhtsne.run_bh_tsne(x_train,
                                                  verbose=True,
                                                  initial_solution=None)
Example #10
numDims = 2  # assumed: numDims is used below but not defined in this excerpt; the defaults comment shows no_dims=2
initialDims = 21
perplexity = 30
theta = .5
alg = 'svd'

def load_data(data_set_name, data_set_type):
    dataBasePath = '/home/dev/data/numer.ai/' + data_set_name + '/'
    return np.load(dataBasePath + data_set_type + '-' + data_set_name + '.npy')


data_set_name = '2016-09-08'

X = load_data(data_set_name, 'features')
Y = load_data(data_set_name, 'labels')

X = np.reshape(X, (-1, 21))
Y = np.reshape(Y, (-1))
print(np.shape(X))
print(np.shape(Y))

# no_dims=2, perplexity=50, theta=0.5, randseed=-1, verbose=False,
# initial_dims=50, use_pca=True, max_iter=1000,
map = run_bh_tsne(X[0:10000], no_dims=numDims, initial_dims=initialDims,
                  verbose=True, perplexity=perplexity, theta=theta,
                  usefile=False, array=X[0:10000])
# gscatter(map(:,1), map(:,2), Y);

# color the points by label, and save before show() so the written figure is not blank
plt.scatter(map[:, 0], map[:, 1], 20, c=Y[0:10000])
plt.savefig('bhtsne.png')
plt.show()
Example #11
    sparse_data = sparse.csr_matrix((pc_data, pc_indices, pc_indptr), pc_shape)

    print(sparse_data.shape)
    
    tsvd = TruncatedSVD(n_components = 20)

    transformed = tsvd.fit_transform(sparse_data)
else:
    data = np.log(1+pc_data_npz['X'])

    print(pc_data_npz['genes'].shape, data.shape)
    tsvd = TruncatedSVD(n_components = 20)

    transformed = tsvd.fit_transform(data)

    
N,D = transformed.shape
if sub:
    sub_sz = int(subsample*N)
    indices = np.random.choice(N, sub_sz, replace=False)
    transformed = transformed[indices,:]

np.savetxt(pcafile + '.txt', transformed)

embedded, betas = bhtsne.run_bh_tsne(transformed, initial_dims=transformed.shape[1], theta=0.3,
                                     verbose=True, perplexity=50, max_iter=1000, use_pca=False)

print(embedded.shape, betas.shape)
np.savetxt(outfile , embedded)
np.savetxt(betafile, betas)
Example #12
def cluster_fTSNE(dataset,
                  low_filter=0.3,
                  no_dims=2,
                  perplexity=50,
                  use_pca=True,
                  initial_dims=50,
                  max_iter=500,
                  theta=0.5,
                  randseed=-1,
                  verbose=False):
    import numpy as np
    import sys
    import os
    import gc
    if type(dataset) == str and dataset == "help":
        print(
            "This function is used to do single-nonsingle patterns clustering using TSNE and kmeans"
        )
        print("    -> Input: dataset (numpy.ndarray, shape=(Nd,Nx,Ny)")
        print(
            "      option: low_filter (float 0~1, the percent of area at the frequency center that is used for clustering, default=0.3)"
        )
        print(
            "      option (TSNE): no_dims (+int, dimensions after decomposition, default=2)"
        )
        print(
            "      option (TSNE): perplexity (+int, perplexity value used to evaluate P(i|j) in TSNE, default=50)"
        )
        print(
            "      option (TSNE): use_pca (bool, whether to use PCA to generate the initial features, default=True)"
        )
        print(
            "      option (TSNE): initial_dims (+int, output dimensions of the initial PCA, ignored if use_pca=False, default=50)"
        )
        print(
            "      option (TSNE): max_iter (+int, max iterations, default=500, suggested >500)"
        )
        print(
            "      option (TSNE): theta (0~1 float, the speed vs accuracy trade-off parameter, theta=1 means highest speed, default=0.5)"
        )
        print(
            "      option (TSNE): randseed (int, >=0 to use 'randseed' as the seed for the initial values, <0 to use the current time as the random seed, default=-1)"
        )
        print("      option (TSNE): verbose (default=False)")
        print(
            "    -> Return: list, [data_after_decomposition, predicted_labels]"
        )
        print(
            "[Notice] The input dataset is not recommended to contain more than 5k patterns, but it is also necessary to have more than 500.\
You can split the original dataset into several parts and use multiple processors to deal with them."
        )
        print("Help End. Exit.")
        return
    sys.path.append(__file__.split("/image/classify.py")[0] + '/analyse')
    sys.path.append(os.path.join(os.path.dirname(__file__), 'bhtsne_source'))
    import saxs
    import radp
    no_dims = int(no_dims)
    initial_dims = int(initial_dims)
    max_iter = int(max_iter)
    theta = min(np.abs(theta), 1)
    rcenter = [
        int(dataset.shape[1] * low_filter / 2.0),
        int(dataset.shape[2] * low_filter / 2.0)
    ]
    # fft
    print("\nStart FFT analysis ...")
    dataset[np.where(dataset < 0)] = 0
    dataset[np.isnan(dataset)] = 0
    dataset[np.isinf(dataset)] = 0
    fdataset = np.zeros(dataset.shape)
    for ind, data in enumerate(dataset):
        fdataset[ind] = np.abs(np.fft.fftshift(np.fft.fft2(data)))
        sys.stdout.write("Processing " + str(ind) + "/" + str(len(dataset)) +
                         " ...\r")
        sys.stdout.flush()
    print("\nDone.")
    # normalization
    print("\nStart normalization ...")
    center_data = (fdataset.shape[1] // 2, fdataset.shape[2] // 2)  # integer division so the slice bounds below are ints
    fdataset = fdataset[:, center_data[0] - rcenter[0]:center_data[0] +
                        rcenter[0], center_data[1] -
                        rcenter[1]:center_data[1] + rcenter[1]]
    center_data = (fdataset.shape[1] / 2.0, fdataset.shape[2] / 2.0)
    saxs_data = saxs.cal_saxs(fdataset)
    saxs_intens = radp.radial_profile_2d(saxs_data, center_data)
    dataset_norm = np.zeros(fdataset.shape)
    for ind, pat in enumerate(fdataset):
        pat_normed = radp.radp_norm_2d(saxs_intens, pat, center_data)
        dataset_norm[ind] = pat_normed
        sys.stdout.write("Processing " + str(ind) + "/" + str(len(fdataset)) +
                         " ...\r")
        sys.stdout.flush()
    print("\nDone.")
    # decomposition
    print("\nStart decomposition using TSNE ...")
    dataset_norm.shape = (dataset_norm.shape[0],
                          dataset_norm.shape[1] * dataset_norm.shape[2])
    log_data_norm = np.log(1 + np.abs(dataset_norm))
    del dataset_norm
    del fdataset
    del saxs_data
    gc.collect()
    import bhtsne
    embedding_array = bhtsne.run_bh_tsne(log_data_norm,
                                         no_dims=no_dims,
                                         perplexity=perplexity,
                                         use_pca=use_pca,
                                         initial_dims=initial_dims,
                                         max_iter=max_iter,
                                         theta=theta,
                                         randseed=randseed,
                                         verbose=verbose)
    # clustering
    print("\nStart clustering ...")
    from sklearn import cluster
    centroid, label, inertia = cluster.k_means(embedding_array, 2)
    return embedding_array, label
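
A minimal call sketch for cluster_fTSNE on synthetic input (it assumes the module-local saxs and radp imports used inside the function resolve). The 3-D input shape is (Nd, Nx, Ny) as the help text documents, with Nd inside the recommended 500..5000 range:

import numpy as np

patterns = np.abs(np.random.randn(600, 64, 64))  # 600 synthetic 64x64 patterns
embedding, labels = cluster_fTSNE(patterns,
                                  low_filter=0.3,
                                  perplexity=50,
                                  max_iter=500,
                                  verbose=True)
# embedding: (600, 2) t-SNE coordinates; labels: k-means cluster index (0 or 1)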
Example #13
import numpy as np
import bhtsne
from argparse import ArgumentParser, FileType
from os.path import abspath, dirname, isfile, join as path_join
from shutil import rmtree
from struct import calcsize, pack, unpack
from subprocess import Popen
from sys import stderr, stdin, stdout
from tempfile import mkdtemp
from platform import system
from os import devnull
import os, sys
import io

mnist_path = '/Users/jiadao/PycharmProjects/Py3/data visulization/tsne/bhtsne-master/mnist2500_X.txt'
data = np.loadtxt(mnist_path, skiprows=1)

print('FINISHED LOADING')
embedding_array = bhtsne.run_bh_tsne(data)
Example #14
    for i in range(len(h5filelist)):
        f=h5py.File(inputfolder + '/' + h5filelist[i],'r') #r - read only
        loadedthetas=np.array(f['thetas'])
        f.close()
        #matrix with models x 16 entries
        #reshape attaches [0,0,:],[0,1,:],...
        usethetas=np.reshape(loadedthetas[:,1:5,:],
                             (loadedthetas.shape[0],16,1))[:,:,0]
        entrynumber[i]=usethetas.shape[0]; #number of linear models in file
        if i==0:
            allthetas=np.copy(usethetas)
        else: #stitches all together
            allthetas=np.vstack((allthetas, usethetas))
    print('Size of data: '+str(allthetas.shape))
    
    #perform tsne algorithm
    allthetas=allthetas.astype('float64')
    pca_dim=14
    space=bhtsne.run_bh_tsne(allthetas,verbose=True,perplexity=perplexity,
                             initial_dims=pca_dim,max_iter=5000)

    #Save the result
    f=h5py.File(out_file,'w')
    entrynumbers=f.create_dataset('entrynumber',(entrynumber.shape))
    entrynumbers[...]=entrynumber
    alltheta=f.create_dataset('allthetas',(allthetas.shape))
    alltheta[...]=allthetas
    tsne_s=f.create_dataset('space',(len(allthetas),2),maxshape=(None,None))
    tsne_s.resize(space.shape)
    tsne_s[...]=space
    f.close()
Example #15
    trainer.close()

    # Draw the pic
    # Normalize
    weights /= np.sqrt(np.sum(weights**2, axis=1, keepdims=True))
    embeddings_val /= np.sqrt(np.sum(embeddings_val**2, axis=1, keepdims=True))

    # We only get the weights we need
    index2center = OrderedDict()
    for i in range(num_samples):
        if labels_val[i] not in index2center:
            index2center[labels_val[i]] = weights[labels_val[i], :]

    weights_new = []
    weights_index = []
    for index in index2center:
        weights_index.append(index)
        weights_new.append(index2center[index])
    weights_new = np.stack(weights_new, axis=0)
    num_weights = len(weights_index)

    # tSNE
    combined = np.concatenate([weights_new, embeddings_val], axis=0)
    Y = run_bh_tsne(combined, no_dims=2, initial_dims=50)
    Y_weights = Y[:num_weights, :]
    Y_embeddings = Y[num_weights:, :]
    plt.figure(1)
    plt.scatter(Y_embeddings[:, 0], Y_embeddings[:, 1], c=labels_val)
    plt.scatter(Y_weights[:, 0], Y_weights[:, 1], marker="x")
    plt.savefig(args.embedding_pic)
Example #16
def tsne_workflow(parameter_name,
                  value_list,
                  data,
                  result_base_dir,
                  data_result_subdirectory,
                  initial_embedding_method=None,
                  **kwargs):
    """

    :param parameter_name:
    :param value_list:
    :param data:
    :param result_base_dir:
    :param data_result_subdirectory:
    :param initial_embedding_method:
    :return:

    """

    for value in value_list:
        print("###########################################")
        print("##              Start t-SNE              ##")
        print("###########################################")

        print("Using Dataset: {}".format(data_result_subdirectory))

        print("Tuning parameter: " + parameter_name + ", value: " + str(value))
        # 5 times to validate for random methods, once for specified inputs

        max_round = 6 if initial_embedding_method in ['gaussian', 'random'] else 2

        for i in range(1, max_round):
            print("###", "### Round:" + str(i), "###")
            # create directory if non-existent
            result_dir = os.path.join(result_base_dir, str(value),
                                      data_result_subdirectory, str(i))
            try:
                os.makedirs(result_dir)
            except FileExistsError:
                # directory already exists
                pass

            # load the initial embedding if specified
            _initial_embedding = None
            if initial_embedding_method is not None:
                _initial_embedding = get_initial_embedding(
                    data_name=data_result_subdirectory,
                    method_name=initial_embedding_method,
                    i=i)
                filename = "initial_solution_" + data_result_subdirectory + "_" + initial_embedding_method  \
                           + "{}" + ".pickle"
                filename = filename.format("_" +
                                           str(i) if initial_embedding_method
                                           in ['random', 'gaussian'] else "")

                print("Using initial embedding file: {}".format(filename))

            # run t-SNE
            # perform PCA to 50 dims beforehand
            # use initial embedding
            bh_tsne_dict = bhtsne.run_bh_tsne(
                data,
                verbose=True,
                initial_solution=_initial_embedding,
                **{parameter_name: value},
                **kwargs)

            # save results
            # timestamp
            timestamp = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
            bhtsne.write_bh_tsne_result(bh_tsne_dict, result_dir, "-",
                                        timestamp)
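
A sketch of how this workflow might be invoked (the paths, dataset name, and the 50-dim input are assumptions, not taken from the snippet; it also presumes get_initial_embedding can serve the named dataset):

import numpy as np

data = np.random.rand(1000, 50)  # e.g. PCA-reduced features
tsne_workflow(parameter_name="perplexity",
              value_list=[10, 30, 50],
              data=data,
              result_base_dir="results/perplexity_sweep",  # hypothetical path
              data_result_subdirectory="mnist",            # hypothetical name
              initial_embedding_method="gaussian",
              max_iter=1000)                               # forwarded to run_bh_tsne via **kwargs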
Example #17
import numpy as np
import bhtsne

np.random.seed(1137)
X = np.random.rand(500, 10)
proj, betas, cpp, cpi = bhtsne.run_bh_tsne(X,
                                           verbose=False,
                                           randseed=1137,
                                           return_betas=True,
                                           return_cost_per_point=True,
                                           return_cost_per_iter=True)

assert betas.shape[0] == 500
assert cpp.shape[0] == 500
assert cpi.shape[0] == 1000
assert proj[0, 0] == -7.235696435669544
Example #18
import numpy as np
import bhtsne
from sklearn.decomposition import PCA

# data = np.loadtxt('../example_data/pollen.txt',delimiter=',').T

# data = np.log(1+data)

# data = data - np.mean(data, axis=1, keepdims=True)
# data = data/(np.sum(data**2, axis=1, keepdims=True))**.5


# pca = PCA(n_components=50)

# pc_data = pca.fit_transform(data)

pc_data = np.loadtxt('gaussian_density_overlap.txt').T

print(pc_data.shape)

embedded = bhtsne.run_bh_tsne(pc_data, initial_dims=pc_data.shape[1], theta=0., verbose=True,
                              perplexity=30)

np.savetxt('orig_overlap_out.txt', embedded)
Example #19
def multi_run_wrapper(args):
    projectionsAllLoc, betasL, cppL, cpiL = bhtsne.run_bh_tsne(*args)
    return projectionsAllLoc, betasL, cppL, cpiL
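
Pool.map passes a single argument to its worker, which is why this wrapper takes one tuple and unpacks it; a usage sketch, assuming the same bhtsne fork as Example #17 (one that returns projections, betas, and per-point/per-iteration costs):

import numpy as np
from multiprocessing import Pool

if __name__ == "__main__":
    data = np.random.rand(1000, 50)
    # one tuple of positional arguments per run; the tuple layout must match
    # the positional signature of bhtsne.run_bh_tsne in this fork
    arg_tuples = [(data,) for _ in range(4)]
    with Pool(processes=4) as pool:
        results = pool.map(multi_run_wrapper, arg_tuples)
    # each result: (projections, betas, cost_per_point, cost_per_iter)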
Example #20
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k for tsne_data')
plt.savefig("figures/elbowmethodt.png")
#plt.show()

## Cluster dimension-reduced data with KMeans, with n_cluster equal to k

K = 2
km = KMeans(n_clusters=K, init='k-means++', n_init=100)
km.fit(train)
x = km.fit_predict(train)

embedding_array = bhtsne.run_bh_tsne(train,
                                     no_dims=2,
                                     perplexity=4,
                                     initial_dims=train.shape[1],
                                     verbose=True)
tsne_data = pd.DataFrame(embedding_array)

# plot tsne_data
tsne_data["cluster"] = x
tsne_data = tsne_data.sort_values('cluster')
print("tsne-data")
print(tsne_data)
color_list = []
for cluster in tsne_data['cluster']:
    if cluster == 0:
        color_list.append('red')
    elif cluster == 1:
        color_list.append('green')