Example #1
File: python2_tsne.py Project: amsqr/hd
def extract_tsne_gather_feat(stage):
    """
    Extract tsne gather features.
    Note: python2 only.    
    Better than func:extract_tsne_feat in cv, but worst in submission.
    """  
    df_w2vlem_join = pd.read_csv('tmp2/df_w2vlem_join.csv', index_col=0)
        
    if stage <= 1:        
        df_feat = pd.DataFrame(index=df_w2vlem_join.index.values)
        tfidf = TfidfVectorizer(ngram_range=(2,4), stop_words='english', min_df=2)
        
        df_w2vlem_join['t_w2v'].to_csv('tmp2/t_w2v', index=False)
        df_w2vlem_join['q_w2v'].to_csv('tmp2/q_w2v', index=False)
        df_w2vlem_join['d_w2v'].to_csv('tmp2/d_w2v', index=False)
        
        tfidf.set_params(input='filename')        
        tfidf.fit(['tmp2/t_w2v','tmp2/q_w2v','tmp2/d_w2v'])
        tfidf.set_params(input='content')
        
        cPickle.dump(tfidf, open('tmp2/tfidf_obj','wb'))
    
    tfidf = cPickle.load(open('tmp2/tfidf_obj','rb'))
    X_t = tfidf.transform(df_w2vlem_join['t_w2v'].tolist())
    if stage <= 2:
        # df_feat only exists if stage 1 ran in this process; rebuild it when resuming here
        df_feat = pd.DataFrame(index=df_w2vlem_join.index.values)
        svd = TruncatedSVD(n_components=100, random_state=2016)     
        X_svd = svd.fit_transform(X_t)
        X_scaled = StandardScaler().fit_transform(X_svd)
        X_tsne = bh_sne(X_scaled)
        df_feat['tsne_t_1'] = X_tsne[:len(df_w2vlem_join), 0]
        df_feat['tsne_t_2'] = X_tsne[:len(df_w2vlem_join), 1]
        df_feat.to_csv('tmp2/tsne_t', index=False)
    
    df_feat = pd.read_csv('tmp2/tsne_t')    
    if stage <= 3:
        print(df_feat)
        X_q = tfidf.transform(df_w2vlem_join['q_w2v'].tolist())
        X_tq = sp.hstack([X_t, X_q]).tocsr()
        svd = TruncatedSVD(n_components=50, random_state=2016)
        X_svd = svd.fit_transform(X_tq)
        X_scaled = StandardScaler().fit_transform(X_svd)
        X_tsne = bh_sne(X_scaled)
        df_feat['tsne_qt_1'] = X_tsne[:len(df_w2vlem_join), 0]
        df_feat['tsne_qt_2'] = X_tsne[:len(df_w2vlem_join), 1]
        df_feat.to_csv('tmp2/tsne_qt', index=False)
    
    df_feat = pd.read_csv('tmp2/tsne_qt')    
    if stage <= 4:
        print(df_feat)    
        X_d = tfidf.transform(df_w2vlem_join['d_w2v'].tolist())
        svd = TruncatedSVD(n_components=100, random_state=2016)
        X_svd = svd.fit_transform(X_d)
        X_scaled = StandardScaler().fit_transform(X_svd)
        X_tsne = bh_sne(X_scaled)
        df_feat['tsne_desc_1'] = X_tsne[:len(df_w2vlem_join), 0]
        df_feat['tsne_desc_2'] = X_tsne[:len(df_w2vlem_join), 1]
        
        df_tsne_feats = df_feat
        df_tsne_feats.to_csv('tmp2/df_tsne_gather_feats.csv')
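A minimal driver sketch for the staged flow above (hypothetical; it assumes the tmp2/ inputs produced earlier in the project already exist):

# Hypothetical driver: stage 1 rebuilds everything, including the TF-IDF fit;
# higher stages resume from the intermediate files written under tmp2/.
if __name__ == '__main__':
    extract_tsne_gather_feat(stage=1)
    # extract_tsne_gather_feat(stage=3)  # resume once tmp2/tsne_t exists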
Example #2
def test_seed():
    from tsne import bh_sne
    from sklearn.datasets import load_iris
    import numpy as np

    iris = load_iris()

    X = iris.data
    y = iris.target

    t1 = bh_sne(X, random_state=np.random.RandomState(0), copy_data=True)
    t2 = bh_sne(X, random_state=np.random.RandomState(0), copy_data=True)
    assert np.all(t1 == t2)
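The same seeding pattern works outside the test: bh_sne consumes the RandomState it is given, so pass a freshly constructed one per call. A minimal sketch, assuming the danielfrg/tsne package:

import numpy as np
from tsne import bh_sne

data = np.random.RandomState(42).rand(200, 16)  # float64, as bh_sne requires
emb_a = bh_sne(data, random_state=np.random.RandomState(0), copy_data=True)
emb_b = bh_sne(data, random_state=np.random.RandomState(0), copy_data=True)
assert np.all(emb_a == emb_b)  # same seed, same layout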
Example #3
def meta_pca_sne(exID, experiment_folder):  # put exID back

    plot_subfolder = experiment_folder + "/meta_pca"
    plot_data_directory = check_create_directory(plot_subfolder)
    filename = "{}/META".format(plot_data_directory)

    # mongo stuff
    dbClient = DatabaseClient()

    filteredResults = dbClient.query(exID)

    if filteredResults is None:
        print "No results"
        return

    filteredId = filteredResults[0]['_id']
    experiment = dbClient.get(filteredId)

    list_of_coords = experiment['DATA']['TSNE_DATA']

    np_list = np.asarray(list_of_coords)
    print "META shape: ", np_list.shape

    epochs = experiment['DATA']['EPOCH']
    layers = experiment['DATA']['LAYER']

    labels = []
    no_samples = len(epochs)
    for i in range(no_samples):
        labels.append(epochs[i] + (layers[i] * 0.1))
        # labels.append(epochs[i])

    labels = np.asarray(labels)
    labels = labels[:500]

    np_list = np_list[:, :500]

    # print "LIST", np_list
    # print "list size:", np_list.shape
    perp = 10.0
    no_data_shape = np_list.shape[0]
    # bh_sne requires (n_samples - 1) >= 3 * perplexity; clamp perplexity down if needed
    if (no_data_shape - 1) < (perp * 3.0):
        perp = ((no_data_shape - 1) / 3.0) - 1.0
    sne_co = bh_sne(np_list, perplexity=perp, theta=0.5)

    print "sne", sne_co.shape
    print "labels", labels

    plt.scatter(sne_co[:, 0], sne_co[:, 1], c=labels)
    plt.savefig(filename, dpi=120)
    plt.close()
    # plt.show()

    print "show"
    flat_coords = np.reshape(sne_co, (1, -1))
    flat_coords = flat_coords.tolist()[0]

    experiment['DATA']['META'] = flat_coords

    updatedObject = dbClient.update(filteredId, experiment)
Example #4
def getTsne(modelFile, outDir, NBOW2=True):
    pp = numpy.load(modelFile) 
    wv = pp['Wemb'].copy()

    sklearn_pca = PCA(n_components=50)
    Y_sklearn = sklearn_pca.fit_transform(wv)
    Y_sklearn = numpy.asfarray( Y_sklearn, dtype='float' )

    print "PCA transformation done ..."
    print "Waitig for t-SNE computation ..."
    
    reduced_vecs = bh_sne(Y_sklearn)

    with open(outDir + "/tsne", "w") as out:
        for i in range(len(reduced_vecs)):
            out.write(str(reduced_vecs[i,0]) + " " + str(reduced_vecs[i,1]) + "\n")

    print "t-SNE written to file ..."
    
    if NBOW2:
        av = pp['AVs'].astype('float64').T[0]
        wts =[]
        for i in range(len(wv)):
            wt = sigmoid(numpy.dot(wv[i],av))
            wts.append(wt)
        with open(outDir + "/wts", "w") as out:
            for i in range(len(wts)):
                out.write(str(wts[i]) + "\n")
Example #5
    def fit_transform(self, X):
        """Perform both a fit and a transform on the input data

        Fit the data to the reduction algorithm, and transform the data to
        the reduced space.

        Parameters
        ----------
        X : pandas.DataFrame
            A (n_samples, n_features) dataframe to both fit and transform

        Returns
        -------
        embedded : pandas.DataFrame
            A (n_samples, 2) dataframe of the t-SNE embedding, indexed like X

        Raises
        ------
        ValueError
            If the input is not a pandas DataFrame, will not perform the fit
            and transform

        """
        from tsne import bh_sne

        self._check_dataframe(X)
        return pd.DataFrame(bh_sne(X), index=X.index)
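A minimal sketch of the validation hook this method relies on (hypothetical; the real DataFrameReducerBase may check more):

import pandas as pd

class DataFrameReducerBase(object):
    # Hypothetical sketch of the check used by fit_transform above.
    def _check_dataframe(self, X):
        if not isinstance(X, pd.DataFrame):
            raise ValueError('fit_transform expects a pandas DataFrame, '
                             'got %r' % type(X))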
Example #6
def visualize(x_data, y_data, y_name):
    # convert image data to float64 matrix. float64 is needed for bh_sne
    x_data = np.asarray(x_data).astype('float64')
    y_data = np.asarray(y_data).astype('int')
    y_name = np.asarray(y_name)
    x_data = x_data.reshape((x_data.shape[0], -1))

    # perform t-SNE embedding
    vis_data = bh_sne(x_data)

    # plot the result
    vis_x = vis_data[:, 0]
    vis_y = vis_data[:, 1]

    fig, ax = plt.subplots()

    almost_black = '#262626'
    # set2 = brewer2mpl.get_map('Set3', 'qualitative', 10).mpl_colors
    set2 = plt.cm.Set3(np.linspace(0, 1, 10))

    for class_i in range(10):
        idx = np.where(y_data == class_i)[0]
        # print(idx)
        color = set2[class_i]
        # print('label=%s' % y_name[y])
        plt.scatter(vis_x[idx], vis_y[idx], label=y_name[class_i], alpha=0.9, edgecolor=almost_black, linewidth=0.15, facecolor=color)#s=0.5, cmap=plt.cm.get_cmap("jet", 10))
    # plt.colorbar(ticks=range(10))
    ax.legend(loc=1)
    ax.grid(True)

    plt.clim(-0.5, 9.5)
    plt.show()
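A usage sketch for visualize on toy data (matplotlib and the tsne package assumed installed):

import numpy as np

x = np.random.rand(500, 8, 8)                   # 500 fake 8x8 "images"
y = np.random.randint(0, 10, size=500)          # 10 classes
names = np.array(['class %d' % i for i in range(10)])
visualize(x, y, names)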
Example #7
def getTsne(modelFile, outDir, NBOW2=True):
    pp = numpy.load(modelFile)
    wv = pp['Wemb'].copy()

    sklearn_pca = PCA(n_components=50)
    Y_sklearn = sklearn_pca.fit_transform(wv)
    Y_sklearn = numpy.asfarray(Y_sklearn, dtype='float')

    print "PCA transformation done ..."
    print "Waitig for t-SNE computation ..."

    reduced_vecs = bh_sne(Y_sklearn)

    with open(outDir + "/tsne", "w") as out:
        for i in range(len(reduced_vecs)):
            out.write(
                str(reduced_vecs[i, 0]) + " " + str(reduced_vecs[i, 1]) + "\n")

    print "t-SNE written to file ..."

    if NBOW2:
        av = pp['AVs'].astype('float64').T[0]
        wts = []
        for i in range(len(wv)):
            wt = sigmoid(numpy.dot(wv[i], av))
            wts.append(wt)
        with open(outDir + "/wts", "w") as out:
            for i in range(len(wts)):
                out.write(str(wts[i]) + "\n")
Example #8
    def api_function(self, *params):
        if params[0]:
            self.t = params[0]

        self.t_root = os.path.join(self.data_root, self.t, "imgs")
        self.t_label = os.path.join(self.data_root, self.t, "label.txt")
        t_set = DigitImage(self.t_root, self.t_label, transform=transforms.Compose([
            transforms.Resize((28, 28)),
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ]))
        self.train_loader = torch.utils.data.DataLoader(t_set, batch_size=self.batch_size, shuffle=True, **self.kwargs)

        self.generate_feature()

        output = np.load('train/output.npy').astype(np.float64)
        data = np.load('train/data.npy')
        target = np.load('train/target.npy')
        print('data shape: ', data.shape)
        print('target shape: ', target.shape)
        print('output shape: ', output.shape)

        output_2d = bh_sne(output)
        np.save('train/output_2d.npy', output_2d, allow_pickle=False)

        plt.rcParams['figure.figsize'] = 20, 20
        plt.scatter(output_2d[:, 0], output_2d[:, 1], c=target * 10)
        plt.savefig(os.path.join(self.result_root, self.t, "tsne.png"), bbox_inches='tight')

        return dict(
            data=Image.open(os.path.join(self.result_root, self.t, "tsne.png"))
        )
Example #9
def meta_pca_sne(exID, experiment_folder): # put exID back
    
    plot_subfolder = experiment_folder + "/meta_pca"
    plot_data_directory = check_create_directory(plot_subfolder)
    filename = "{}/META".format(plot_data_directory)

    # mongo stuff
    dbClient = DatabaseClient()

    filteredResults = dbClient.query(exID)

    if filteredResults is None:
      print "No results"
      return

    filteredId = filteredResults[0]['_id']
    experiment = dbClient.get(filteredId)

    list_of_coords = experiment['DATA']['TSNE_DATA']

    np_list = np.asarray(list_of_coords)
    print "META shape: ", np_list.shape
    
    epochs = experiment['DATA']['EPOCH']
    layers = experiment['DATA']['LAYER']

    labels = []
    no_samples = len(epochs)
    for i in range(no_samples):
      labels.append(epochs[i] + (layers[i]*0.1))
      # labels.append(epochs[i])
    
    labels  = np.asarray(labels)
    labels = labels[:500]

    np_list = np_list[:,:500]

    # print "LIST", np_list
    # print "list size:", np_list.shape
    perp = 10.0
    no_data_shape = np_list.shape[0]
    # bh_sne requires (n_samples - 1) >= 3 * perplexity; clamp perplexity down if needed
    if (no_data_shape - 1) < (perp * 3.0):
      perp = ((no_data_shape - 1) / 3.0) - 1.0
    sne_co = bh_sne(np_list, perplexity=perp, theta=0.5)

    print "sne", sne_co.shape
    print "labels", labels

    plt.scatter(sne_co[:,0], sne_co[:,1], c=labels)
    plt.savefig(filename, dpi=120)
    plt.close()
    # plt.show()

    print "show"
    flat_coords = np.reshape(sne_co, (1,-1))
    flat_coords = flat_coords.tolist()[0]

    experiment['DATA']['META'] = flat_coords

    updatedObject = dbClient.update(filteredId, experiment)
Example #10
def t_sne(obj):

	p = parser()
	data_categories = {}
	label_categories = {}

	for d in obj:
		for c in p.categories_item(d):
			if c not in data_categories:
				data_categories[c] = []
				label_categories[c] = []
			
			data_categories[c].append(d[1:])
			label_categories[c].append('g' if d[0] == 1 else 'r')			
	
	print len(data_categories)
	for c in data_categories:
		print '------------------------'
		print '%s (%d)' % (c, len(data_categories[c]))
		print '------------------------'
		if len(data_categories[c]) > 100:
			# embed and plot this category; the original recursive t_sne(...)
			# call passed two arguments to a one-argument function
			arr = np.array(data_categories[c], dtype=np.float64)
			x2 = bh_sne(arr)
			plt.scatter(x2[:, 0], x2[:, 1], c=label_categories[c])
			plt.show()
		else:
			print 'too few samples for t-SNE'
Example #11
def t_sne_vis(name, base_model, x_processed_images, random_state, labels):
    """
    :param name: the name of the cnn model used to build features
    :param base_model: the model obj
    :param x_processed_images: the input images for our model
    :param random_state: for fixing the results
    :param labels: 0/1 classification labels
    :return:
    the graph of image distribution based on features extracted from the model and the t-sne features
    """
    # convert data to images
    print('Converting data points to composite image')
    X_train = x_processed_images
    print('we got %d different images of shape %dx%d ' %
          (len(X_train), X_train.shape[1], X_train.shape[2]))
    print('build useful features from the selected model')
    features = base_model.predict(X_train)
    x_data1 = np.asarray(features).astype('float64')
    x_data1 = x_data1.reshape((x_data1.shape[0], -1))
    # perform t-SNE embedding
    print('performing t-sne reduction')
    vis_data = bh_sne(x_data1, random_state=random_state)
    # plot the result
    fig = plt.figure(figsize=(15, 15))
    vis_x = vis_data[:, 0]
    vis_y = vis_data[:, 1]
    plt.scatter(vis_x, vis_y, c=labels, cmap=plt.cm.get_cmap("winter", 2))
    plt.colorbar(ticks=range(2))
    plt.clim(0, 1)
    plt.title(name)
    plt.grid()
    plt.show()
    fig.savefig('tsne_vis_' + name + '.png')
    return vis_data
Example #12
def project_vectors(X_in, model='tsne', perp=10, n_components=2):
    if model == 'tsne':
        from tsne import bh_sne
        X_in = X_in.reshape((X_in.shape[0], -1)).astype('float64')
        if perp is not None:
            X_out = bh_sne(X_in, perplexity=perp)
        else:
            X_out = bh_sne(X_in)
    elif model == 'pca':
        pca = PCA(n_components=n_components, whiten=True)
        pca.fit(X_in)
        X_out = pca.transform(X_in)
    else:
        raise NotImplementedError

    return X_out
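A usage sketch contrasting the two projections on toy data (the 'pca' branch assumes PCA is imported from sklearn.decomposition in the same module):

import numpy as np
from sklearn.decomposition import PCA

feats = np.random.rand(300, 64)
emb_tsne = project_vectors(feats, model='tsne', perp=10)
emb_pca = project_vectors(feats, model='pca', n_components=2)
print(emb_tsne.shape, emb_pca.shape)  # (300, 2) (300, 2)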
Example #13
def do_tsne():
    cell_accum, cell_status = files_to_cells(files)
    feats_accum = []
    for i, (x, status) in enumerate(zip(cell_accum, cell_status)):
        feats_accum.append(features(x))

    feats_accum = np.asarray(feats_accum)
    cell_status = np.asarray(cell_status)

    from tsne import bh_sne
    points = bh_sne(feats_accum.astype("float64"))

    for c, p in zip(category10, plts):
        mask = np.zeros_like(points[:, 0])
        print p
        for pi in p:
            print pi, dd[pi]
            mask[cell_status == dd[pi]] = 1
        print np.sum(mask)
        mask = (mask == 1)
        plt.plot(points[:, 0][mask],
                 points[:, 1][mask],
                 "o",
                 c=c,
                 lw=0,
                 alpha=0.5)

    #plt.legend([" ".join(p) for p in plts])
    plt.legend(["Healthy", "Septic", "Non-Septic"])
    plt.show()
Example #14
def tsne_fit_transform(data, perplexity=50.0, nsvd=30):
    if nsvd > 0:
        svd = TruncatedSVD(n_components=nsvd)
        data = svd.fit_transform(data)
    data = StandardScaler().fit_transform(data)
    data = bh_sne(data, perplexity=perplexity)
    return data
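This helper chains TruncatedSVD, StandardScaler and bh_sne, the same recipe as Examples #1 and #25. A toy call on sparse input (TruncatedSVD is typically used on sparse matrices):

import numpy as np
import scipy.sparse as sp

X = sp.random(500, 1000, density=0.01, format='csr', random_state=0)
emb = tsne_fit_transform(X, perplexity=30.0, nsvd=30)
print(emb.shape)  # (500, 2)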
Example #15
    def fit_transform(self, X):
        """Perform both a fit and a transform on the input data

        Fit the data to the reduction algorithm, and transform the data to
        the reduced space.

        Parameters
        ----------
        X : pandas.DataFrame
            A (n_samples, n_features) dataframe to both fit and transform

        Returns
        -------
        embedded : pandas.DataFrame
            A (n_samples, 2) dataframe of the t-SNE embedding, indexed like X

        Raises
        ------
        ValueError
            If the input is not a pandas DataFrame, will not perform the fit
            and transform

        """
        from tsne import bh_sne

        self._check_dataframe(X)
        return pd.DataFrame(bh_sne(X), index=X.index)
Example #16
def test_seed():
    import numpy as np
    from sklearn.datasets import load_iris

    from tsne import bh_sne

    iris = load_iris()

    X = iris.data
    # y = iris.target

    t1 = bh_sne(X, random_state=np.random.RandomState(0), copy_data=True)
    t2 = bh_sne(X, random_state=np.random.RandomState(0), copy_data=True)

    assert t1.shape[0] == 150
    assert t1.shape[1] == 2
    assert np.all(t1 == t2)
Example #17
def do_bhsne():
    return bh_sne(
        data=X,
        d=embed_dimensions,
        perplexity=perplexity,
        random_state=random_state,
        **method_kwargs
    )
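The closure above presumably captures X, embed_dimensions, perplexity, random_state and method_kwargs from an enclosing scope; a hypothetical wrapper showing one way those names could be bound:

import numpy as np
from tsne import bh_sne

def embed(X, embed_dimensions=2, perplexity=30.0, random_state=None,
          **method_kwargs):
    """Hypothetical enclosing function for the do_bhsne closure above."""
    X = np.asarray(X, dtype='float64')

    def do_bhsne():
        return bh_sne(
            data=X,
            d=embed_dimensions,
            perplexity=perplexity,
            random_state=random_state,
            **method_kwargs
        )

    return do_bhsne()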
Example #18
def tSNE(x_data):

    vis_data = bh_sne(x_data)  # tsne embedding
    vis_x = vis_data[:, 0]
    vis_y = vis_data[:, 1]

    plt.scatter(vis_x, vis_y, c='black')
    plt.show()
Example #19
def visualize(vecs):
    print "Got the vectors, now doing dimension reduction..."
    reduced = bh_sne(vecs)
    print "Reduction done, now plotting: "

    for i in range(len(reduced)):
        plt.plot(reduced[i, 0], reduced[i, 1], marker='o', markersize=8)  # plot the reduced coords, not the inputs

    plt.show()
Example #20
File: snippet.py Project: szabo92/gistable
def image_scatter(features, images, img_res, res=4000, cval=1.):
    """
    Embeds images via tsne into a scatter plot.

    Parameters
    ---------
    features: numpy array
        Features to visualize

    images: list or numpy array
        Corresponding images to features. Expects float images from (0,1).

    img_res: float or int
        Resolution to embed images at

    res: float or int
        Size of embedding image in pixels

    cval: float or numpy array
        Background color value

    Returns
    ------
    canvas: numpy array
        Image of visualization
    """
    features = np.copy(features).astype('float64')
    images = [gray_to_color(image) for image in images]
    images = [min_resize(image, img_res) for image in images]
    max_width = max([image.shape[0] for image in images])
    max_height = max([image.shape[1] for image in images])

    f2d = bh_sne(features)

    xx = f2d[:, 0]
    yy = f2d[:, 1]
    x_min, x_max = xx.min(), xx.max()
    y_min, y_max = yy.min(), yy.max()
    # Fix the ratios
    sx = (x_max - x_min)
    sy = (y_max - y_min)
    if sx > sy:
        res_x = sx / float(sy) * res
        res_y = res
    else:
        res_x = res
        res_y = sy / float(sx) * res

    canvas = np.ones((int(res_x) + max_width, int(res_y) + max_height, 3)) * cval
    x_coords = np.linspace(x_min, x_max, int(res_x))
    y_coords = np.linspace(y_min, y_max, int(res_y))
    for x, y, image in zip(xx, yy, images):
        w, h = image.shape[:2]
        x_idx = np.argmin((x - x_coords)**2)
        y_idx = np.argmin((y - y_coords)**2)
        canvas[x_idx:x_idx + w, y_idx:y_idx + h] = image
    return canvas
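A usage sketch on random data (gray_to_color and min_resize are helpers from the same gist, not shown here, and are assumed available):

import numpy as np

feats = np.random.rand(200, 128)                     # one feature row per image
imgs = [np.random.rand(32, 32) for _ in range(200)]  # float images in (0, 1)
canvas = image_scatter(feats, imgs, img_res=32, res=1000)
# plt.imshow(canvas); plt.show()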
Example #21
def reduce_features_dim(word2vec_model):
    """
    Reduces feature dimensionality using t-SNE.
    """
    word_vectors = word2vec_model.syn0
    word_vectors = word_vectors.astype('float64')

    return bh_sne(word_vectors)
Example #22
def perform_tsne_transformation(X):
	######### There is a bug in scikit-learn, hence can't do tsne with it. ##############
	# tsne_model = TSNE(n_components=2,random_state=0)
	# X_new = tsne_model.fit_transform(X)

	X = np.asarray(X).astype('float64')
	X = X.reshape((X.shape[0],-1))
	X_new = bh_sne(X,perplexity=5)
	return X_new
Example #23
def tsne(embedding, word_2_id, sample_size = 1000):
    embedding_2d = bh_sne(embedding.astype(np.float64))
    keys = random.sample(word_2_id.keys(), sample_size)

    fig, ax = plt.subplots()
    for k in keys:
        id = word_2_id[k]
        ax.annotate(k, (embedding_2d[id, 0], embedding_2d[id, 1]))
    plt.show()
Example #24
def perform_tsne_transformation(X):
    ######### There is a bug in scikit-learn, hence can't do tsne with it. ##############
    # tsne_model = TSNE(n_components=2,random_state=0)
    # X_new = tsne_model.fit_transform(X)

    X = np.asarray(X).astype('float64')
    X = X.reshape((X.shape[0], -1))
    X_new = bh_sne(X, perplexity=5)
    return X_new
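The commented-out scikit-learn call works on current releases; for comparison, a sketch of the equivalent using sklearn's TSNE:

import numpy as np
from sklearn.manifold import TSNE

X = np.random.rand(100, 20)
X_new = TSNE(n_components=2, perplexity=5, random_state=0).fit_transform(X)
print(X_new.shape)  # (100, 2)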
Example #25
File: python2_tsne.py Project: amsqr/hd
def extract_tsne_feat():
    """
    Extract tsne features.
    Note: python2 only.    
    """  
    df_w2vlem_join = pd.read_csv('tmp2/df_w2vlem_join.csv', index_col=0)
         
    df_feat = pd.DataFrame(index=df_w2vlem_join.index.values)
    tfidf = TfidfVectorizer(ngram_range=(1,4), stop_words='english', min_df=2) 
    X_t = tfidf.fit_transform(df_w2vlem_join['t_w2v'].tolist())    
     
    svd = TruncatedSVD(n_components=100, random_state=2016)     
    X_svd = svd.fit_transform(X_t)
    X_scaled = StandardScaler().fit_transform(X_svd)
    X_tsne = bh_sne(X_scaled)
    df_feat['tsne_t_1'] = X_tsne[:len(df_w2vlem_join), 0]
    df_feat['tsne_t_2'] = X_tsne[:len(df_w2vlem_join), 1]
    df_feat.to_csv('tmp2/tsne_t', index=False)

    print(df_feat)
    tfidf = TfidfVectorizer(ngram_range=(1,4), stop_words='english', min_df=2) 
    X_q = tfidf.fit_transform(df_w2vlem_join['q_w2v'].tolist())
    X_tq = sp.hstack([X_t, X_q]).tocsr()
    svd = TruncatedSVD(n_components=100, random_state=2016)
    X_svd = svd.fit_transform(X_tq)
    X_scaled = StandardScaler().fit_transform(X_svd)
    X_tsne = bh_sne(X_scaled)
    df_feat['tsne_qt_1'] = X_tsne[:len(df_w2vlem_join), 0]
    df_feat['tsne_qt_2'] = X_tsne[:len(df_w2vlem_join), 1]
    df_feat.to_csv('tmp2/tsne_qt', index=False)

    df_feat = pd.read_csv('tmp2/tsne_qt')
    print(df_feat)    
    tfidf = TfidfVectorizer(ngram_range=(1,3), stop_words='english', min_df=2) 
    X_d = tfidf.fit_transform(df_w2vlem_join['d_w2v'].tolist())
    svd = TruncatedSVD(n_components=70, random_state=2016)
    X_svd = svd.fit_transform(X_d)
    X_scaled = StandardScaler().fit_transform(X_svd)
    X_tsne = bh_sne(X_scaled)
    df_feat['tsne_desc_1'] = X_tsne[:len(df_w2vlem_join), 0]
    df_feat['tsne_desc_2'] = X_tsne[:len(df_w2vlem_join), 1]
    
    df_tsne_feats = df_feat
    df_tsne_feats.to_csv('tmp2/df_tsne_feats.csv')
Example #26
def plot_latent_space(X, y, file_name):
    """
    This function employs TSNE to convert the latent space to 2D space and plots the result.
    """
    X = tf.cast(X, dtype='float64').numpy()  # bh_sne needs a float64 NumPy array, not a tf.Tensor (assumes TF2 eager mode)
    X = bh_sne(X)
    walking = X[y == 1]
    walking_up = X[y == 2]
    walking_down = X[y == 3]
    sitting = X[y == 4]
    standing = X[y == 5]
    laying = X[y == 6]

    colors = ['r', 'c', 'k', 'y', 'm', 'g']
    plt.figure(figsize=(12, 10))
    WALKING = plt.scatter(walking[:, 0],
                          walking[:, 1],
                          marker='x',
                          color=colors[0],
                          alpha=0.3)
    WALKING_UPSTAIRS = plt.scatter(walking_up[:, 0],
                                   walking_up[:, 1],
                                   marker='+',
                                   color=colors[1],
                                   alpha=0.3)
    WALKING_DOWNSTAIRS = plt.scatter(walking_down[:, 0],
                                     walking_down[:, 1],
                                     marker='^',
                                     color=colors[2],
                                     alpha=0.3)
    SITTING = plt.scatter(sitting[:, 0],
                          sitting[:, 1],
                          marker='o',
                          color=colors[3],
                          alpha=0.3)
    STANDING = plt.scatter(standing[:, 0],
                           standing[:, 1],
                           marker='o',
                           color=colors[4],
                           alpha=0.3)
    LAYING = plt.scatter(laying[:, 0],
                         laying[:, 1],
                         marker='o',
                         color=colors[5],
                         alpha=0.3)

    plt.legend((WALKING, WALKING_UPSTAIRS, WALKING_DOWNSTAIRS, SITTING,
                STANDING, LAYING),
               ('WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS', 'SITTING',
                'STANDING', 'LAYING'),
               scatterpoints=1,
               loc='lower left',
               ncol=3,
               fontsize=8)
    plt.savefig(file_name + '.png')
    plt.show()
Example #27
def test_iris():
    from tsne import bh_sne
    from sklearn.datasets import load_iris

    iris = load_iris()

    X = iris.data
    y = iris.target

    X_2d = bh_sne(X)
Example #28
def tSNE(x_data, display_plot=False):

    vis_data = bh_sne(x_data)  # tsne embedding
    vis_x = vis_data[:, 0]
    vis_y = vis_data[:, 1]

    if display_plot:
        plt.scatter(vis_x, vis_y, c='black')
        plt.show()
Example #29
def collect_and_save_plot_information(main_df, reduced_dim=2, perplexity=30, \
                                        n_kmeans_clusters=10):
    """
    Definition
        Reduces Dimensionality of Voltage Traces via t-SNE. Then applies
        K-means clustering. Also creates information strings for each datapoint
        containing its metadata. Finally saves it all in a pickle, for the
        script "display_interactive_plot.py" to use.
    """

    # dataframe for saving plotting values later on
    plot_df = pd.DataFrame(columns=["Value1", "Value2", "Color", "Label"])

    # create a numpy array for voltage trace values to apply dim reduction
    # and clustering
    n_channels = len(main_df['Conc_Trace'])
    n_trace_values = len(main_df['Conc_Trace'][0])
    data_array = np.zeros((n_channels, n_trace_values))

    # copy values into numpy array from dataframe
    for row in range(n_channels):
        data_array[row] = main_df['Conc_Trace'][row]
    """
    "we normalized each column by Z-scoring: we substracted its mean and
    then divided by its standard deviation"
    """
    for column in range(n_trace_values):
        data_array[:,column] = (data_array[:,column] - \
                                np.mean(data_array[:,column])) / \
                                np.std(data_array[:,column])

    # apply t-SNE on data
    data2d = bh_sne(data_array, d=reduced_dim, perplexity=perplexity)
    plot_df["Value1"] = data2d[:, 0]  # ... and save in the dataframe
    plot_df["Value2"] = data2d[:, 1]  # ... and save in the dataframe

    # apply kmeans and save its labels for colorization
    kmeans = KMeans(n_clusters=n_kmeans_clusters)
    kmeans.fit(data2d)
    labels = kmeans.labels_
    # for assigning different colors to different clusters
    nr_to_color = ["b", "g", "r", "c", "y", "m", "k", "fuchsia",        \
                    "gray", "navy", "coral"]
    # convert clusters to colors
    colors = [nr_to_color[cluster] for cluster in labels]
    plot_df["Color"] = colors  # ... and save in the dataframe

    # create list of labels, which will be displayed when clicking
    # on a datapoint
    text_list = [create_label_for_matplotlib(main_df.loc[counter])      \
    for counter in range(n_channels)]
    plot_df["Label"] = text_list  # ... and save in the dataframe

    # now that we got all we need, save in pickle
    plot_df.to_pickle("Interactive_Plot_Values.pickle")
Example #30
def main(datafile, normalize, ndims, copula, clusteroutput, subsample):
    X, features = read_sah_h5(datafile)
    I, all_features = read_sah_h5(datafile, just_good=False)
    if 'id' in all_features:
        ids = X[:, all_features.index('id')]
    else:
        ids = np.arange(len(X)).astype(int)

    Xorig = X
    if normalize:
        mean = np.average(X, axis=0)
        std = np.std(X, axis=0)
        std[np.nonzero(std == 0.0)] = 1.0  # Avoid NaNs
        X = (X - mean) / std

    idx = np.random.randint(len(X), size=subsample)

    X = X[idx]
    ids = ids[idx]

    if copula:
        X = np.column_stack([copula_transform(x) for x in X.T])

    # I added this for the time/freq clustering
    # to emphasize the frequency feature
    # X[:, 1] *= 1e-3

    Y = bh_sne(X, d=ndims)

    dbscan = DBSCAN(eps=1.75, min_samples=5)
    C = dbscan.fit_predict(Y)

    tree = ExtraTreesClassifier(n_estimators=100)
    tree.fit(X, C)
    for f, i in zip(features, tree.feature_importances_):
        print '%s: %f' % (f, i)

    with open(clusteroutput, 'w+') as f:
        for c, i in zip(C, ids):
            f.write('%d,%d\n' % (i, c))

    pl.scatter(Y[:, 0],
               Y[:, 1],
               color=pl.cm.spectral(C.astype(float) / np.max(C)))

    for c in np.unique(C):
        pl.bar(0,
               0,
               lw=0,
               ec='none',
               fc=pl.cm.spectral(float(c) / np.max(C)),
               label='Cluster %d' % c)
    pl.legend()

    pl.show()
Example #31
def _tsne(X, dir_str="*.wav", perplexity=3, plotting=False):
	"""
	Utility function to compute tsne
	"""
	flist = sorted(glob.glob(dir_str))
	Z = bh_sne(X, perplexity=perplexity)
	if plotting:
		figure()
		plot(Z[:,0], Z[:,1],'r.')
		[[text(p[0],p[1],'%s'%flist[i],fontsize=12) for i,p in enumerate(Z)]]
	return Z
Example #32
def create_clusters_dbscan():
    np.random.seed(seed=12509234)

    df = c.import_data('../data/planets.csv')

    # Extract columns
    cols_phys = [
        'pl_orbper', 'pl_orbsmax', 'pl_orbeccen', 'pl_orbincl', 'pl_bmassj',
        'pl_radj', 'pl_dens', 'st_dist', 'st_optmag', 'st_teff', 'st_mass',
        'st_rad', 'st_logg', 'st_dens', 'st_lum', 'pl_rvamp', 'pl_eqt',
        'st_plx', 'st_age', 'st_vsini', 'st_acts'
    ]
    df_p = c.get_physical_columns(df, cols_phys)

    logcols = [
        'pl_bmassj', 'pl_dens', 'pl_orbper', 'pl_orbsmax', 'pl_radj',
        'st_dist', 'st_rad', 'st_teff', 'st_dens', 'pl_rvamp', 'st_plx',
        'st_vsini', 'st_acts'
    ]

    # Pre-process the data, apply lof to all columns
    km_labels, df_imputed = cl.kmeans_centroid_fill(df_p, 3, 10)

    # Create TSNE embedding
    vis_data_transit = bh_sne(df_imputed, perplexity=40)
    vis_x_transit = vis_data_transit[:, 0]
    vis_y_transit = vis_data_transit[:, 1]

    # Create a background plot of TSNE embedding
    fig = plt.figure(figsize=(12, 8))
    plt.scatter(vis_y_transit,
                vis_x_transit,
                c=['blue'],
                cmap=plt.cm.get_cmap("jet", 10),
                alpha=0.2)
    plt.savefig("../data/QC010_TSNE_background.png")

    # DBSCAN clustering
    X = np.array([vis_x_transit, vis_y_transit]).T
    dbs = DBSCAN(eps=2.1, min_samples=12)
    dbs.fit(X)

    # Generate clustering plot from TSNE
    n_clusters = len(np.unique(dbs.labels_))
    fig = plt.figure(figsize=(15, 12))
    plt.scatter(vis_y_transit,
                vis_x_transit,
                c=dbs.labels_,
                cmap=plt.cm.get_cmap("jet", n_clusters),
                alpha=1.0,
                s=10 * dbs.labels_ + 1)
    plt.colorbar(ticks=range(n_clusters))
    plt.clim(-0.5, n_clusters - 0.5)
    plt.savefig("../data/QC011_TSNE_clustering_w_sizes.png")
Example #33
def image_scatter(features, images, img_res, res=4000, cval=1):
    """
    Embeds images via tsne into a scatter plot.
    Parameters
    ---------
    features: numpy array
        Features to visualize
    images: list or numpy array
        Corresponding images to features. Expects float images from (0,1).
    img_res: float or int
        Resolution to embed images at
    res: float or int
        Size of embedding image in pixels
    cval: float or numpy array
        Background color value
    Returns
    ------
    canvas: numpy array
        Image of visualization
    """
    features = np.copy(features).astype('float64')  # bh_sne needs float64
    images = [gray_to_color(image)
              for image in images]  # ensure every image has three color channels
    #images = [min_resize(image, img_res) for image in images]
    max_width = max([image.shape[0] for image in images])
    #max_height = max([image.shape[1] for image in images])

    f2d = bh_sne(features)  # docs: https://github.com/danielfrg/tsne
    # alternative: http://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE

    xx = f2d[:, 0]
    yy = f2d[:, 1]
    x_min, x_max = xx.min(), xx.max()
    y_min, y_max = yy.min(), yy.max()
    # Fix the ratios
    sx = (x_max - x_min)
    sy = (y_max - y_min)
    if sx > sy:
        res_x = sx / float(sy) * res
        res_y = res
    else:
        res_x = res
        res_y = sy / float(sx) * res
    canvas = np.ones(
        (int(res_x + max_width), int(res_y + max_width), 3)) * cval
    x_coords = np.linspace(x_min, x_max, int(res_x))
    y_coords = np.linspace(y_min, y_max, int(res_y))
    for x, y, image in zip(xx, yy, images):
        #w, h = img_res
        x_idx = np.argmin((x - x_coords)**2)
        y_idx = np.argmin((y - y_coords)**2)
        canvas[x_idx:x_idx + 70, y_idx:y_idx + 70] = image
    return canvas
Example #34
def make_multiple_cl_tsne(mat,
                          cmap_left=None,
                          cmap_right=None,
                          skl_version=True,
                          random_state=0,
                          learning_rate=40):
    from matplotlib import pyplot as plt
    import numpy as np

    # the matrix needs to be transposed in order to cluster the numbers
    x_data = mat.transpose()

    # convert image data to float64 matrix. float64 is needed for bh_sne
    x_data = np.asarray(x_data).astype('float64')

    if skl_version == False:
        from tsne import bh_sne
        # perform t-SNE embedding, lowered perplexity
        vis_data = bh_sne(x_data, perplexity=7)
        vis_x = vis_data[:, 0]
        vis_y = vis_data[:, 1]

    else:
        from sklearn import manifold
        # run tsne from sklearn
        ###########################
        tsne = manifold.TSNE(perplexity=7,
                             n_iter=100000,
                             random_state=random_state,
                             method='exact',
                             metric='correlation',
                             learning_rate=learning_rate,
                             verbose=0,
                             n_iter_without_progress=1000,
                             init='random',
                             early_exaggeration=4)

        Y = tsne.fit_transform(x_data)
        vis_x = Y[:, 0]
        vis_y = Y[:, 1]

    fig, axarr = plt.subplots(ncols=2, figsize=(10, 5))

    marker_size = 150

    # always require cmap
    axarr[0].scatter(vis_x, vis_y, c=cmap_left, \
        cmap=plt.cm.get_cmap('prism',len(cmap_left)), s=marker_size)

    axarr[1].scatter(vis_x, vis_y, c=cmap_right, \
        cmap=plt.cm.get_cmap('jet',len(cmap_right)), s=marker_size)

    plt.show()
Example #35
def plot_tsne(X_sample, y_sample):
    from tsne import bh_sne

    vis_data = bh_sne(X_sample)

    vis_x = vis_data[:, 0]
    vis_y = vis_data[:, 1]

    plt.scatter(vis_x, vis_y, c=np.argmax(y_sample, 1), cmap=plt.cm.get_cmap("jet", 10))
    plt.colorbar(ticks=range(10))
    plt.clim(-0.5, 9.5)
    plt.show()
Example #36
    def _fit_transform(self, x_in):
        """ fit to data, and return the transform
        Args:
            x_in (numpy.array): Input numpy array

        Returns:
            x (numpy.array): Transformed array
        """

        x_in = x_in.astype(float)
        res = _tsne.bh_sne(x_in, perplexity=self.perplexity, theta=self.theta)
        return res
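A minimal sketch of a host class for this method (hypothetical; it assumes the _tsne name used above is an alias for the tsne module):

import tsne as _tsne  # assumption: the snippet's _tsne alias is the tsne module

class BHSNEReducer(object):
    """Hypothetical wrapper class around the _fit_transform method above."""

    def __init__(self, perplexity=30.0, theta=0.5):
        self.perplexity = perplexity
        self.theta = theta

    def fit_transform(self, x_in):
        x_in = x_in.astype(float)
        return _tsne.bh_sne(x_in, perplexity=self.perplexity, theta=self.theta)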
Example #37
def run_tsne(transformed_pca_matrix,
             name='TSNE',
             key='TSNE',
             tsne_dims=None,
             input_pcs=None,
             perplexity=None,
             theta=None,
             max_iter=None,
             stop_lying_iter=None,
             mom_switch_iter=None,
             copy_data=False,
             random_state=None):

    if tsne_dims is None:
        tsne_dims = analysis_constants.TSNE_N_COMPONENTS

    if perplexity is None:
        perplexity = analysis_constants.TSNE_DEFAULT_PERPLEXITY

    if theta is None:
        theta = analysis_constants.TSNE_THETA

    if random_state is None:
        random_state = analysis_constants.RANDOM_STATE

    if max_iter is None:
        max_iter = analysis_constants.TSNE_MAX_ITER

    if stop_lying_iter is None:
        stop_lying_iter = analysis_constants.TSNE_STOP_LYING_ITER

    if mom_switch_iter is None:
        mom_switch_iter = analysis_constants.TSNE_MOM_SWITCH_ITER

    if input_pcs is not None:
        transformed_pca_matrix = transformed_pca_matrix[:, :input_pcs]

    # Make sure perplexity satisfies 'tsne' requirements
    N = transformed_pca_matrix.shape[0]
    perplexity = min(perplexity, max(1, -1 + float((N - 1)) / 3))

    transformed_tsne_matrix = tsne_bh.bh_sne(
        transformed_pca_matrix,
        d=tsne_dims,
        theta=theta,
        perplexity=perplexity,
        max_iter=max_iter,
        stop_lying_iter=stop_lying_iter,
        mom_switch_iter=mom_switch_iter,
        copy_data=copy_data,
        random_state=np.random.RandomState(random_state))

    return TSNE(transformed_tsne_matrix, name=name, key=key)
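The clamp just before the bh_sne call enforces the library's constraint that perplexity is at most (N - 1) / 3; a quick standalone check of the formula:

# Standalone check of the perplexity clamp used above.
N = 20  # e.g. only 20 cells survive filtering
perplexity = min(30.0, max(1, -1 + float(N - 1) / 3))
assert 3 * perplexity <= N - 1
print(perplexity)  # 5.33..., lowered from the default 30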
Example #38
def run_BH_tSNE(table, do_pca=True):

	pca_dimensions = 50
	perplexity = 30.0

	logger.info("run_BH_tSNE: Running k-mer based binning...")
	# Note - currently doesn't handle cases where PCA dimensions and perplexity set too high

	# We make a submatrix, consisting of the contigs in the table
	k_mer_counts_submatrix = list()
	for i,row in table.iterrows():
		contig = row['contig']
		k_mer_counts_submatrix.append(k_mer_dict[contig])

	normalized_k_mer_submatrix = normalizeKmers(k_mer_counts_submatrix)

	# PCA

	if (len(normalized_k_mer_submatrix) > pca_dimensions) and (do_pca == True):
		logger.info('run_BH_tSNE: Principal component analysis')
		pca = decomposition.PCA(n_components=pca_dimensions)
		pca_matrix = pca.fit_transform(normalized_k_mer_submatrix)
	else:
		logger.info('run_BH_tSNE: Principal component analysis step skipped')

	# BH-tSNE
	logger.info('run_BH_tSNE: BH-tSNE')

	# Adjust perplexity according to the number of data points
	# Took logic from tsne source code
	if (len(normalized_k_mer_submatrix) - 1) < (3 * perplexity)  :
		perplexity = (float(len(normalized_k_mer_submatrix) - 1) / 3) - 1

	logger.info(str(len(normalized_k_mer_submatrix)) + ' data points')
	logger.info(str(len(normalized_k_mer_submatrix[0])) + ' dimensions')

	if (len(normalized_k_mer_submatrix) > pca_dimensions) and (do_pca == True):
		X = np.array(pca_matrix)
	else:
		X = np.array(normalized_k_mer_submatrix)
	bh_tsne_matrix = bh_sne(X, d=2, perplexity=perplexity, theta=0.5)

	# We will add bh_tsne_x and bh_tsne_y columns to the contig table

	bh_tsne_x = list()
	bh_tsne_y = list()

	for i in range(0, len(bh_tsne_matrix)):
		bh_tsne_x.append(bh_tsne_matrix[i][0])
		bh_tsne_y.append(bh_tsne_matrix[i][1])

	table['bh_tsne_x'] = pd.Series(bh_tsne_x, index = table.index)
	table['bh_tsne_y'] = pd.Series(bh_tsne_y, index = table.index)
Example #39
def main():
    # parse arguments
    parser = argparse.ArgumentParser(
        description='Preprocess data using t-SNE.')
    parser.add_argument('input_data', type=str, help='Path to numpy data file')
    parser.add_argument('input_images_list',
                        type=str,
                        help='Path to text file listing images')
    parser.add_argument('--input_images_dir',
                        type=str,
                        default='public/data/',
                        help='Path to images folder')
    parser.add_argument('--output_file',
                        type=str,
                        default='public/data.csv',
                        help='Path to output CSV file')
    parser.add_argument('--max_num_points',
                        type=int,
                        default=1000,
                        help='Max number of points')
    args = parser.parse_args()

    # load data
    data_load = np.load(args.input_data)
    with open(args.input_images_list, 'r') as f:
        image_names_load = [l.strip() for l in f]

    # shuffle and reduce number of data points to run faster
    # this also results in cleaner visualization
    assert len(image_names_load) > 0
    assert data_load.shape[0] == len(image_names_load)
    indices = list(range(data_load.shape[0]))
    random.shuffle(indices)

    data = np.zeros((args.max_num_points, data_load.shape[1]))
    image_names = []
    for i, rand_index in enumerate(indices):
        if i >= args.max_num_points:
            break
        data[i, :] = data_load[rand_index, :]
        image_names.append(image_names_load[rand_index])

    assert data.shape[0] == len(image_names), '{0} and {1}'.format(
        data.shape[0], len(image_names))

    # run dimensionality reduction with t-SNE
    data_tsne = bh_sne(data)
    xs = data_tsne[:, 0]
    ys = data_tsne[:, 1]

    # save to csv file
    save_to_csv(args.output_file, image_names, xs, ys)
Example #40
def dim_reduction(main_df, reduced_dim=2, perplexity=30, n_kmeans_clusters=10):

    # create a numpy array for voltage trace values
    n_channels = len(main_df['Conc_Trace'])
    n_trace_values = len(main_df['Conc_Trace'][0])
    data_array = np.zeros((n_channels, n_trace_values))

    # copy values
    for row in range(n_channels):
        data_array[row] = main_df['Conc_Trace'][row]
    """
    "we normalized each column by Z-scoring: we substracted its mean and
    then divided by its standard deviation"
    """
    for column in range(n_trace_values):
        data_array[:,column] = (data_array[:,column] - \
                                np.mean(data_array[:,column])) / \
                                np.std(data_array[:,column])

    # apply t-SNE on data
    data2d = bh_sne(data_array, d=reduced_dim, perplexity=perplexity)

    # apply kmeans and save its labels for colorization
    kmeans = KMeans(n_clusters=n_kmeans_clusters)
    kmeans.fit(data2d)
    labels = kmeans.labels_

    # create list of labels, which will be displayed when clicking
    # on a datapoint
    text_list = [create_label_for_matplotlib(main_df.loc[counter])      \
    for counter in range(n_channels)]

    # plot
    plt.figure("Interactive Plot", figsize=(20, 10))
    plt.title("Interactive Plot of Channels, click on Datapoints for Info")

    # for assigning different colors to different clusters
    nr_to_color = ["b", "g", "r", "c", "y", "m", "k", "fuchsia",        \
                    "gray", "navy", "coral"]

    # plot every datapoint individually with its corresponding text ("label"),
    # rather than in one vectorized call, so each point keeps its own label
    for dp in range(n_channels):
        plt.scatter(data2d[dp,0], data2d[dp,1], linewidths=0.1,         \
        c=nr_to_color[labels[dp]], label=text_list[dp])

    # add datacursor, basically enabling "clickability" of datapoints
    datacursor(formatter='{label}'.format, bbox=dict(fc='white'),       \
                 arrowprops=dict(arrowstyle='simple', fc='black', alpha=0.5))

    plt.show()
    return 0
Example #41
def visualize_tsne():
	"""
	play around with tsne to visualize image space
	"""
	import matplotlib.pyplot as plt
	from tsne import bh_sne
	tracker_df = pd.read_pickle('./tracker.pkl')

	dfs = []
	for category in listdir('/Volumes/micro/recommend-a-graham/imgs/'):
		for user in listdir('/Volumes/micro/recommend-a-graham/imgs/'+category):
			img_ids = listdir('/Volumes/micro/recommend-a-graham/imgs/{}/{}/'.format(category, user))

			sub_df = tracker_df[tracker_df.img_id.apply(lambda x: x in img_ids)]

			# user_df = pd.read_pickle('../fc8_pkls/fc8_{}.pkl'.format(user))
			user_df = pd.read_pickle('../fc7_pkls/fc7_{}.pkl'.format(user))
			user_df = user_df[user_df.shortcode.apply(lambda x: x in sub_df.shortcode.values)]
			dfs.append(pd.merge(sub_df, user_df, on='shortcode'))

	dfs = pd.concat(dfs, axis=0)
	dfs.reset_index(inplace=True)
	# dfs.fc8 = dfs.fc8.apply(lambda x: x.reshape(1, x.shape[0]))
	dfs.fc7 = dfs.fc7.apply(lambda x: x.reshape(1, x.shape[0]))

	# vectors = dfs.fc8.values
	vectors = dfs.fc7.values

	x_data = vectors[0]
	for vector in vectors[1:]:
		x_data = np.concatenate((x_data, vector), axis=0)
	print x_data.shape

	y_dict = {k:i for i,k in enumerate(dfs.username.unique())}
	# y_dict = {k:i for i,k in enumerate(['cats', 'dogs', 'foodies',
	# 									'models','most_popular',
	# 									'photographers', 'travel'])}
	y_data = dfs.username.apply(lambda x: y_dict[x]).values

	vis_data = bh_sne(x_data)
	vis_x = vis_data[:,0]
	vis_y = vis_data[:,1]

	plt.scatter(vis_x, vis_y, c=y_data, cmap=plt.cm.get_cmap("jet", 28))
	cbar = plt.colorbar()
	cbar.set_ticks([i*29./28 + 29./56 for i in range(28)])
	# cbar.set_ticklabels(y_dict.keys())
	cbar.set_ticklabels(zip(dfs.username.unique(), [user_cat_dict[i] for i in dfs.username.unique()]))
	plt.clim(0, 29)
	plt.title('tsne, fc7, 100img_per_user, 4user_per_categ')
	plt.show()
Example #42
    def run(self):
        config = Config.get()
        # Create the embedding.
        featureDict = Utils.read_features(config.getSample("ExternalFiles",
                                                           "vecs_with_id"),
                                          id_set=getSampleIds())
        keys = list(featureDict.keys())
        vectors = np.array([featureDict[vID]["vector"] for vID in keys])
        out = bh_sne(vectors,
                     pca_d=None,
                     theta=config.getfloat("PreprocessingConstants", "tsne_theta"))
        X, Y = list(out[:, 0]), list(out[:, 1])
        Utils.write_tsv(config.getSample("ExternalFiles", "article_embedding"),
                        ("index", "x", "y"), keys, X, Y)
Example #43
File: python2_tsne.py Project: amsqr/hd
def extract_w2v_tsne_feat():
    """
    Extract w2v tsne features.
    Note: python2 only. Worst in cv, so do not use this.   
    """  
    df_w2v_feats = pd.read_csv('tmp2/df_w2v_feats.csv', index_col=0)
    X = df_w2v_feats.values
         
    df_feat = pd.DataFrame(index=df_w2v_feats.index.values)
    
    X_scaled = StandardScaler().fit_transform(X)
    X_tsne = bh_sne(X_scaled)
    df_feat['tsne_t_1'] = X_tsne[:len(df_w2v_feats), 0]
    df_feat['tsne_t_2'] = X_tsne[:len(df_w2v_feats), 1]
    df_feat.to_csv('tmp2/df_tsne_w2v_feats.csv')
Example #44
def make_sample_df(labels, np, labeled_data, limit, algorithm_name, dims, cores):
  used_labels = np.unique(labels)[0:3]
  label_dfs = []
  for label in used_labels:
    
      subset = labeled_data[labeled_data[:,0] == label,1:]   # select all those elements with this label
      # sub-sample the stratified subset
      num_samples = min(limit,subset.shape[0])
      indices = np.arange(subset.shape[0])
      np.random.shuffle(indices)
      sampled_pts = subset[indices[:num_samples],:]        
      data_2d = bh_sne(sampled_pts)
      num_records = data_2d.shape[0]
      label_dfs.append(pd.DataFrame({
          "X": data_2d[:, 0],
          "Y": data_2d[:, 1],
          "dimension": [dims for i in range(num_records)],
          "label": [label_dict[label] for i in range(num_records)],
          "algorithm": [algorithm_name for i in range(num_records)]}))
  return label_dfs
Example #45
def main(datafile, normalize, ndims, copula, clusteroutput, subsample):
    X, features = read_sah_h5(datafile)
    I, all_features = read_sah_h5(datafile, just_good=False)
    if 'id' in all_features:
        ids = X[:, all_features.index('id')]
    else:
        ids = np.arange(len(X)).astype(int)

    Xorig = X
    if normalize:
        mean = np.average(X, axis=0)
        std = np.std(X, axis=0)
        std[np.nonzero(std == 0.0)] = 1.0 # Avoid NaNs
        X = (X - mean) / std

    idx = np.random.randint(len(X), size=subsample)

    X = X[idx]
    ids = ids[idx]

    if copula:
        X = np.column_stack([copula_transform(x) for x in X.T])

    # I added this for the time/freq clustering
    # to emphasize the frequency feature
    # X[:, 1] *= 1e-3

    Y = bh_sne(X, d=ndims)

    dbscan = DBSCAN(eps=1.75, min_samples=5)
    C = dbscan.fit_predict(Y)

    tree = ExtraTreesClassifier(n_estimators=100)
    tree.fit(X, C)
    for f, i in zip(features, tree.feature_importances_):
        print '%s: %f' % (f, i)

    with open(clusteroutput, 'w+') as f:
        for c, i in zip(C, ids):
            f.write('%d,%d\n' % (i, c))

    pl.scatter(Y[:, 0], Y[:, 1], color=pl.cm.spectral(C.astype(float) / np.max(C)))

    for c in np.unique(C):
        pl.bar(0, 0, lw=0, ec='none', fc=pl.cm.spectral(float(c) / np.max(C)), label='Cluster %d' % c)
    pl.legend()

    pl.show()
Example #46
    def _fit_transform(self, x_in):
        """ fit to data, and return the transform
        Args:
            x_in (numpy.array): Input numpy array

        Returns:
            x (numpy.array): Transformed array
        """

        x_in = x_in.astype(float)
        res = _tsne.bh_sne(
            x_in,
            perplexity=self.perplexity,
            theta=self.theta
            )
        return res
Example #47
def read_sne_video():
    my_data = np.genfromtxt('./test_data.csv', delimiter=',') # test data is 
    labels = np.genfromtxt('./test_labels.csv', delimiter=',')

    print "data incoming shape", my_data.shape
    # getting X, y and labels - also trims the NaNs

    # labels = my_labels[:,0]

    # keeping the data in 2D format
    # should trim the third column
    data = my_data[:,:-1]

    X_2d = bh_sne(data, perplexity=19.0, theta=0.5)

    makeVideo(X_2d, labels)
Example #48
def tsne_pca():

    # mongo stuff
    dbClient = DatabaseClient()

    filteredResults = dbClient.query()

    if filteredResults is None:
      print "No results"
      return

    filteredId = filteredResults[4]['_id']
    experiment = dbClient.get(filteredId)

    list_of_coords = experiment['DATA']['TSNE_DATA']

    pca_list = []
    for coords in list_of_coords:
        np_val = np.asarray(coords)
        coords_array = np.reshape(np_val, (-1,2))

        cast = castPCA2(coords_array)
        print "cast: ", cast.shape

        cast_veri = varimax(cast)
        print "cast_veri", cast_veri.shape
        pca_list.append(cast_veri)

    np_pca = np.asarray(pca_list)

    print "pca: ", np_pca.shape
    np_pca = np.reshape(np_pca, (6,-1))
    print "pca: ", np_pca.shape

    labels = np.asarray([1,2,3,4,5,6])

    print "SNEPCA BH"
    sne_pca = bh_sne(np_pca, perplexity=1.0, theta=0.5)
    plt.scatter(sne_pca[:,0], sne_pca[:,1], c=labels)
    plt.show()

    flat_coords = np.reshape(sne_pca, (1,-1))
    flat_coords = flat_coords.tolist()[0]

    experiment['DATA']['PCA'] = flat_coords

    updatedObject = dbClient.update(filteredId, experiment)
Example #49
def process_files(in_file, out_file):
  """
  Read data from in_file, and output to out_file
  """

  sys.stderr.write('# in_file = %s, out_file = %s\n' % (in_file, out_file))
  # input
  sys.stderr.write('# Input from %s.\n' % (in_file))
  inf = codecs.open(in_file, 'r', 'utf-8')

  # output
  sys.stderr.write('Output to %s\n' % out_file)
  check_dir(out_file)
  ouf = codecs.open(out_file, 'w', 'utf-8')

  line_id = 0
  words = []
  embs = []
  num_dim = -1
  all_lines = inf.readlines()
  num_words = len(all_lines)
  sys.stderr.write('# Processing file %s ...\n' % (in_file))
  sys.stderr.write('# num words = %d\n' % (num_words))
  for line in all_lines:
    line = clean_line(line)
    tokens = re.split('\s+', line)
    word = tokens[0]
    if line_id==0:
      num_dim = len(tokens)-1
      sys.stderr.write('# num dims = %d\n' % (num_dim))
      X = np.zeros((num_words, num_dim))
    emb = np.array(tokens[1:], dtype=np.float64)  # parse directly as floats; '|S4' truncated values to 4 chars
    X[line_id, :] = emb

    line_id = line_id + 1
    if (line_id % 10000 == 0):
      sys.stderr.write(' (%d) ' % line_id)

  sys.stderr.write('Done! Num lines = %d\n' % line_id)

  X_2d = bh_sne(X)
  for ii in xrange(num_words):
    ouf.write('%f %f\n' % (X_2d[ii, 0], X_2d[ii, 1]))
  inf.close()
  ouf.close()
Example #50
def pca_tsne():

  my_data = np.genfromtxt('../data/CSV/appended_data.csv', delimiter=',')
  labels = np.genfromtxt('../data/CSV/appended_data_labels.csv', delimiter=',')

  print "data incoming shape", my_data.shape
  print "labels", labels
  # getting X, y and labels - also trims the NaNs

  print "shape lab", labels.shape
  labels = labels[0:60]

  # labels = my_labels[:,0]

  # keeping the data in 2D format
  # should trim the third column
  # X_2d = my_data[:,:-1]
  X_2d = my_data

  new = X_2d[0:10000]
  print "new ", new.shape
  pca = PCA(n_components = new.shape[1])
  new = pca.fit_transform(new)
  new = np.reshape(new, (1,-1))
  print "XR: ", new.shape

  for t in range(60):
    if (t != 0):
      X_ = X_2d[t*10000:t*10000+10000]
      print "X_: ", X_.shape
      pca = PCA(n_components = X_.shape[1])
      X_r = pca.fit_transform(X_)
      print "XR: ", X_r.shape
      X_r = np.reshape(X_r, (1,-1))
      new = np.append(new, X_r, axis=0)
      print "new: ", new.shape
    
  print new.shape

  X_bn = bh_sne(new[:1000], perplexity=5.0, theta=0.5)

  data0 = X_bn.shape[0]
  data1 = X_bn.shape[1]

  # # plot & save
  plot_save(X_bn, labels, data0, data1, "bn_sne")
Example #51
def get_tsne_mapping(materials_list=None):
    if materials_list is None:
        # Doesn't call get_materials_list() when module is loaded
        materials_list = get_materials_list()
    try:
        _log.info('Trying data cache for t-SNE mapping')
        with open('tsne_points.pickle') as f:
            _log.info('Using pickled t-SNE points')
            return pickle.load(f)
    except IOError:
        X = vectorize_random(4)(materials_list)
        X_2d = bh_sne(X)
        _log.info('t-SNE plot at {}'.format(plot_tsne(X_2d)))
        point_map = [{'pt': pt, 'material': m} for pt, m in
                     zip(X_2d, materials_list)]
        with open('tsne_points.pickle', 'w') as f:
            pickle.dump(point_map, f)
        return point_map
Example #52
File: mds.py Project: kdinkla/ProtoMPDA
def mds(dataSet):
    # Load all feature columns.
    rows = featureColumns(dataSet).transpose()

    sampledRows = rows[np.random.randint(len(rows), size=sampleSize)]

    print sampledRows

    print "Begin TSNE."
    projection = bh_sne(sampledRows, perplexity=5) #perplexity=math.sqrt(len(sampledRows)))
    print "End TSNE."

    #model = TSNE(n_components=2, random_state=0)
    #projection = model.fit_transform(sampledRows)

    # Save intermediate MDS.
    np.save(valuesFile, sampledRows)
    np.save(projectionFile, projection)
Example #53
def make_multiple_cl_tsne(mat, cmap_left=None, cmap_right=None,
                           skl_version=True, random_state=0,
                           learning_rate=40):
    from matplotlib import pyplot as plt
    import numpy as np

    # the matrix needs to be transposed in order to cluster the numbers
    x_data = mat.transpose()

    # convert image data to float64 matrix. float64 is needed for bh_sne
    x_data = np.asarray(x_data).astype('float64')

    if skl_version == False:
        from tsne import bh_sne
        # perform t-SNE embedding, lowered perplexity
        vis_data = bh_sne(x_data, perplexity=7)
        vis_x = vis_data[:, 0]
        vis_y = vis_data[:, 1]

    else:
        from sklearn import manifold
        # run tsne from sklearn
        ###########################
        tsne = manifold.TSNE(perplexity=7, n_iter=100000,
            random_state = random_state, method='exact', metric='correlation',
            learning_rate=learning_rate, verbose=0,
            n_iter_without_progress=1000, init='random', early_exaggeration=4)

        Y = tsne.fit_transform(x_data)
        vis_x = Y[:, 0]
        vis_y = Y[:, 1]

    fig, axarr = plt.subplots(ncols=2, figsize=(10,5))

    marker_size = 150

    # always require cmap
    axarr[0].scatter(vis_x, vis_y, c=cmap_left, \
        cmap=plt.cm.get_cmap('prism',len(cmap_left)), s=marker_size)

    axarr[1].scatter(vis_x, vis_y, c=cmap_right, \
        cmap=plt.cm.get_cmap('jet',len(cmap_right)), s=marker_size)

    plt.show()
Example #54
def plot_bn_sne(data, labels, size):

  print "data[0]: ", data.shape
  print "labels[0]: ", labels.shape

  # trim the data & labels down to reasonable size
  data = data[0:size]
  labels = labels[0:size]

  # sizes
  data0 = data.shape[0]
  data1 = data.shape[1]

  # dimensionality reduction with bn_sne
  X_2d = bh_sne(data, perplexity=19.0, theta=0.5)
  print "plot shape: ", X_2d.shape

  # plot & save
  plot_save(X_2d, labels, data0, data1, "bn-SNE")
Example #55
def meta_pca_sne(): # put exID back
    # mongo stuff
    dbClient = DatabaseClient()

    filteredResults = dbClient.query()

    if filteredResults is None:
      print "No results"
      return

    filteredId = filteredResults[0]['_id']
    experiment = dbClient.get(filteredId)

    list_of_coords = experiment['DATA']['TSNE_DATA']

    np_list = np.asarray(list_of_coords)
    N, X = np_list.shape
    print "NX", N, X


    print "L0T", type(np_list[0])
    print "L0TI", type(np_list[0][0])
    print "L0S", np_list[0].shape
    print "L0", np_list[0]
    print "L1", np_list[1]

    # labels = np.asarray([1,2,3,4,5,6])
    np_list = np_list[:,:500]

    # print "LIST", np_list
    # print "list size:", np_list.shape

    print "META BH"
    sne_co = bh_sne(np_list, perplexity=1.0, theta=0.5)
    # plt.scatter(sne_co[:,0], sne_co[:,1], c=labels)
    # plt.show()

    flat_coords = np.reshape(sne_co, (1,-1))
    flat_coords = flat_coords.tolist()[0]

    experiment['DATA']['META'] = flat_coords

    updatedObject = dbClient.update(filteredId, experiment)
Example #56
def tsne_visualize(L, labels, outfile='figs/tsne.jpg', perplexity=1):
        """Visualize L using t-sne, which is a little complicated to setup on your system"""
        if not tsne_installed:
                print 'Sorry, tsne is not installed'
                return

        # Use t-sne algorithm to come up with points on 2d plane
        points = bh_sne(L, perplexity=perplexity)
        points = np.array(points)

        data_x, data_y = points[:,0], points[:,1]
                
        fig = plt.figure()
        fig.suptitle('Word vectors T-SNE '+str(perplexity))

        annotate(data_x, data_y, labels)

        # save file
        if outfile is not None:
                fig.savefig(outfile)
        plt.show()
Example #57
	def tag_article_tsne_plot(self, sample_size = 20000):
		samples = numpy.random.randint(0,1000000,sample_size )
		self.combined_matrix = numpy.concatenate((self.doc_vec[samples, ],self.tag_vec), axis =0)
		self.combined_2d = bh_sne(self.combined_matrix)

		font = { 'fontname':'Tahoma', 'fontsize':0.1, 'verticalalignment': 'top', 'horizontalalignment':'center' }
		pylab.subplots_adjust(bottom =0.1)
		pylab.scatter(self.combined_2d[:sample_size+1,0], self.combined_2d[:sample_size+1,1], marker = '.' ,cmap = pylab.get_cmap('Spectral'))
		pylab.scatter(self.combined_2d[sample_size+1:,0], self.combined_2d[sample_size+1:,1], marker ='x' , cmap =pylab.get_cmap('Spectral'))
		pylab.title('NYT Articles and Labels(1991-2007)')
		pylab.xlabel('X')	
		pylab.ylabel('Y')
		for label, x, y in zip(self.tags, self.combined_2d[sample_size+1:, 0], self.combined_2d[sample_size+1:, 1]):
			    pylab.annotate(
			        label, 
			        xy = (x, y), xytext = None,
			        ha = 'right', va = 'bottom', **font)
			        #,textcoords = 'offset points',bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
			        #arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))		

		pylab.savefig('/mnt/data/tag_article_plot', bbox_inches ='tight', dpi = 1000, orientation = 'landscape', papertype = 'a0')
		pylab.close()
Example #58
def reduce_dimensions(matrix, reduction_type, n_components):
    """
    Reduces the dimensionality of a matrix and returns it.
    :param matrix: The matrix to reduce.
    :param reduction_type: The style of reduction to carry out.
    :param n_components: The number of components to allow.
    :return: A matrix whose dimensionality has been reduced.
    """
    reduced_matrix = None
    if reduction_type is ReductionTypes.PCA:
        model = PCA(n_components=n_components, whiten=False)
        reduced_matrix = model.fit_transform(matrix)
    elif reduction_type is ReductionTypes.sPCA:
        model = SparsePCA(n_components=n_components, alpha=.5)
        reduced_matrix = model.fit_transform(matrix)
    elif reduction_type is ReductionTypes.T_SNE:
        reduced_matrix = bh_sne(matrix.transpose())
    elif reduction_type is ReductionTypes.NMF:
        model = ProjectedGradientNMF(n_components=n_components,
                                     init='nndsvd',
                                     random_state=0)
        reduced_matrix = model.fit_transform(matrix)
    return reduced_matrix
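A usage sketch (ReductionTypes is the project's own enum, assumed importable; note that the T_SNE branch transposes the matrix, so it embeds columns rather than rows):

import numpy as np

mat = np.random.rand(100, 500)  # float64, as bh_sne requires
pca_50 = reduce_dimensions(mat, ReductionTypes.PCA, n_components=50)
tsne_2d = reduce_dimensions(mat, ReductionTypes.T_SNE, n_components=2)
print(pca_50.shape, tsne_2d.shape)  # (100, 50) (500, 2)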