示例#1
0
def e123(fixed_projections=False,fixed_embedding=False,
         visualization_method='mds',smart=False,**kwargs):
    """\
    Runs MPSE on the '123' sample data and plots the results.

    Parameters
    ----------

    fixed_projections : boolean
    If True, the generated 'cylinder' projections are passed to MPSE as
    fixed projections.

    fixed_embedding : boolean
    If True, the embedding loaded from 123.csv is passed to MPSE as the
    fixed embedding.

    visualization_method : str
    Visualization method forwarded to MPSE.

    smart : boolean
    If True and neither the projections nor the embedding are fixed,
    mv.smart_initialize() is run (and plotted) before gradient descent.

    **kwargs
    Forwarded both to the MPSE constructor and to mv.gd().
    """
    X = np.genfromtxt('samples/123/123.csv',delimiter=',')
    X1 = np.genfromtxt('samples/123/1.csv',delimiter=',')
    X2 = np.genfromtxt('samples/123/2.csv',delimiter=',')
    X3 = np.genfromtxt('samples/123/3.csv',delimiter=',')
    proj = projections.PROJ()
    Q = proj.generate(number=3,method='cylinder')
    # Turn the boolean flags into the values handed to MPSE: the actual
    # projections/embedding when fixed, None otherwise.
    fixed_projections = Q if fixed_projections else None
    fixed_embedding = X if fixed_embedding else None
    mv = MPSE([X1,X2,X3],fixed_embedding=fixed_embedding,
              fixed_projections=fixed_projections,verbose=2,
              sample_colors=X1[:,0],
              visualization_method=visualization_method,**kwargs)
    mv.plot_embedding(title='initial embedding')
    # At this point both names hold either an array/list or None (never
    # False), so "nothing is fixed" has to be tested against None.
    if smart and fixed_projections is None and fixed_embedding is None:
        mv.smart_initialize()
        mv.plot_embedding(title='smart initialize')
    mv.gd(**kwargs)
    mv.plot_computations()
    mv.plot_embedding(title='final embeding')
    mv.plot_images()
    plt.draw()
    plt.pause(0.2)
示例#2
0
def time():
    """\
    Times MPSE gradient descent on random disk data of increasing size and
    shows a log-log plot of average successful run time versus number of
    points.

    A run counts as a success when its final cost is below 1e-3; only
    successful runs contribute to the average time for a given size.
    """
    print('\n***mpse_test.time()***')
    sizes = [int(10**exponent) for exponent in [1, 1.5, 2, 2.5]]
    repeats = 3
    n_sizes = len(sizes)
    successes = np.zeros(n_sizes)
    ratios = np.zeros(n_sizes)
    elapsed = np.zeros(n_sizes)
    for idx, size in enumerate(sizes):
        for _ in range(repeats):
            X = misc.disk(size, dim=3)
            proj = projections.PROJ()
            Q = proj.generate(number=3, method='standard')
            D = multigraph.multigraph_from_projections(proj, Q, X)
            vis = mpse.MPSE(D, verbose=1)
            vis.gd(min_step=1e-4, verbose=1)
            if vis.cost < 1e-3:
                successes[idx] += 1
                elapsed[idx] += vis.H['time']
        # sizes without any successful run keep a time and ratio of zero
        if successes[idx] == 0:
            continue
        elapsed[idx] /= successes[idx]
        ratios[idx] = successes[idx] / repeats

    fig = plt.plot()
    plt.loglog(sizes, elapsed)
    plt.xlabel('number of points')
    plt.ylabel('time')
    plt.title('computation time')
    plt.show()
示例#3
0
def 123():
    """\
    Loads the '123' sample data.

    Reads the embedding (123.csv) and the three per-perspective arrays
    (1.csv, 2.csv, 3.csv) from samples/123/ and generates three 'cylinder'
    projections.

    Returns
    -------

    A pair ([X1,X2,X3], (X,Q)) : the list of per-perspective arrays, and a
    tuple with the array read from 123.csv and the generated projections.
    """
    # NOTE(review): `123` is not a valid Python identifier, so this definition
    # cannot be parsed as written; the intended name is not visible here --
    # confirm against the original module before using this function.
    X = np.genfromtxt('samples/123/123.csv',delimiter=',')
    X1 = np.genfromtxt('samples/123/1.csv',delimiter=',')
    X2 = np.genfromtxt('samples/123/2.csv',delimiter=',')
    X3 = np.genfromtxt('samples/123/3.csv',delimiter=',')
    proj = projections.PROJ()
    Q = proj.generate(number=3,method='cylinder')
    return [X1,X2,X3], (X,Q)
示例#4
0
def time(n_samples,
         n_perspectives,
         fixed_projections=False,
         batch_size=20,
         method='random',
         trials=50,
         attempts=3,
         best=40,
         verbose=0,
         max_iter=500):
    """\
    Measures MPSE computation time on random disk data.

    For each trial, a random 3D disk and a set of projections are generated,
    MPSE is run `attempts` times, and the fastest run whose cost is below
    1.5e-3 (if any) is recorded. Finally, the number of recorded trials and
    the average of the `best` smallest recorded times are printed.

    Parameters
    ----------

    n_samples : int
    Number of points in each random disk.

    n_perspectives : int
    Number of projections generated per trial.

    fixed_projections : boolean
    If True, the generated projections are passed to MPSE as fixed
    projections; otherwise MPSE also optimizes the projections.

    batch_size : int
    Batch size passed to mv.gd().

    method : str
    Method passed to proj.generate().

    trials : int
    Number of independent data sets.

    attempts : int
    Number of MPSE runs per data set.

    best : int
    Number of smallest recorded times that are averaged.

    verbose : int
    If larger than 1, prints trial index, attempt index, cost and time of
    every run.

    max_iter : int
    Maximum number of iterations passed to mv.gd().
    """
    proj = projections.PROJ()
    times = []
    for k in range(trials):
        X = misc.disk(n_samples, dim=3)
        Q = proj.generate(number=n_perspectives, method=method)
        data = setup.setup_distances_from_multiple_perspectives(
            proj.project(Q, X))
        Q0 = Q if fixed_projections else None

        best_time = np.inf
        best_cost = np.inf
        for i in range(attempts):
            # keyword must be spelled exactly: MPSE accepts **kwargs, so a
            # misspelled name is silently ignored as a fixed-projection request
            mv = mpse.MPSE(data, fixed_projections=Q0)
            mv.gd(batch_size=batch_size,
                  max_iter=max_iter,
                  min_cost=1e-3,
                  min_grad=1e-8)
            if verbose > 1:
                print(k, i, mv.cost, mv.time)
            if mv.cost < 1.5e-3 and mv.time < best_time:
                best_time = mv.time
                best_cost = mv.cost
        # record at most one time per trial, once all attempts are done
        if best_cost < 1.5e-3:
            times.append(best_time)
    print(len(times), np.average(np.sort(times)[0:best]))
示例#5
0
def disk(N=100,fixed_projections=False,fixed_embedding=False,**kwargs):
    """\
    Runs MPSE on three 'standard' projections of a random 3D disk with N
    points and plots the initial embedding, the computation history, the
    final embedding and the images.

    If fixed_projections is true, the projections are given to MPSE (as Q)
    and gd() is told to keep them fixed; otherwise, if fixed_embedding is
    true, the same is done with the embedding (as X). Remaining keyword
    arguments are forwarded to gd().
    """
    X = misc.disk(N,dim=3)
    proj = projections.PROJ()
    Q = proj.generate(number=3,method='standard')
    X1, X2, X3 = proj.project(Q,X)

    # choose the extra constructor / gd arguments for the requested mode
    if fixed_projections:
        init_extra, gd_extra = {'Q': Q}, {'fixed_projections': True}
    elif fixed_embedding:
        init_extra, gd_extra = {'X': X}, {'fixed_embedding': True}
    else:
        init_extra, gd_extra = {}, {}

    mv = MPSE([X1,X2,X3],**init_extra,verbose=2,sample_colors=X1[:,0])
    mv.plot_embedding(title='initial embedding')
    mv.gd(**gd_extra,**kwargs)
    mv.plot_computations()
    mv.plot_embedding(title='final embeding')
    mv.plot_images()
    plt.draw()
    plt.pause(0.2)
示例#6
0
 def add_projections(self, attributes=3, d1=3, X=None, Q=None, **kwargs):
     """\
     Adds one feature per projection of a common set of node coordinates.

     X holds the node coordinates (a random disk of dimension d1 is drawn
     when it is None; otherwise d1 is taken from X). Q is either a list of
     `attributes` projections, or None / a string, in which case the
     projections are generated with Q as the generation method. Each
     projected image of X is added through self.add_feature().
     Requires that no attributes have been added yet.
     """
     assert self.attributes == 0
     if X is not None:
         assert isinstance(X, np.ndarray)
         node_number, d1 = X.shape
         assert node_number == self.node_number
     else:
         X = misc.disk(self.node_number, dim=d1)
     self.X = X
     if self.node_colors is None:
         # default coloring: first coordinate of each node
         self.node_colors = X[:, 0]
     proj = projections.PROJ(d1=d1, **kwargs)
     must_generate = Q is None or isinstance(Q, str)
     if must_generate:
         Q = proj.generate(number=attributes, method=Q, **kwargs)
     else:
         assert len(Q) == attributes
     self.Q = Q
     for index in range(attributes):
         self.add_feature(proj.project(Q[index], X), **kwargs)
示例#7
0
def comparison():
    """\
    Compares MPSE run times with fixed versus variable projections.

    For every combination of sample size and number of perspectives, and
    for each trial, random disk data is projected with random projections;
    MPSE is then run twice from the same random initial embedding, once
    with the true projections fixed and once without. Times are stored in
    local arrays, and cost/time are printed and the computation history
    plotted after each run.
    """
    n_samples = np.array(10**np.arange(1.5, 4.01, .5), dtype=int)
    n_perspectives = [2, 3, 4, 5]
    N, K = len(n_samples), len(n_perspectives)
    trials = 2
    best = 3

    timef = np.empty((N, K, trials))
    timev = np.empty((N, K, trials))

    proj = projections.PROJ()
    gd_options = dict(batch_size=20, max_iter=500, min_cost=1e-4)

    for i, size in enumerate(n_samples):
        for j, count in enumerate(n_perspectives):
            for k in range(trials):
                X = misc.disk(size, dim=3)
                Q = proj.generate(number=count, method='random')
                data = proj.project(Q, X)
                X0 = misc.disk(size, dim=3)

                # run with the true projections held fixed
                mvf = mpse.MPSE(data,
                                fixed_projections=Q,
                                initial_embedding=X0)
                mvf.gd(**gd_options)
                timef[i, j, k] = mvf.time
                print(i, j, k, mvf.cost, mvf.time)
                mvf.plot_computations()
                plt.show()

                # run optimizing the projections as well
                mvv = mpse.MPSE(data, initial_embedding=X0)
                mvv.gd(**gd_options)
                timev[i, j, k] = mvv.time
                print(mvv.cost, mvv.time)
                mvv.plot_computations()
                plt.show()
示例#8
0
def _require_n_clusters(kwargs, dataset):
    """Pops and returns kwargs['n_clusters'], raising ValueError if absent."""
    try:
        return kwargs.pop('n_clusters')
    except KeyError:
        raise ValueError(
            "dataset '" + dataset + "' requires the keyword argument "
            "'n_clusters'") from None


def mload(dataset, n_samples=100, n_perspectives=2, **kwargs):
    """\
    Loads or generates a multi-perspective data set.

    Parameters
    ----------

    dataset : str
    Name of the data set. One of 'equidistant', 'disk', 'clusters2a',
    'clusters', 'clusters2', '123', 'florence', 'credit', 'phishing',
    'mnist'.

    n_samples : int
    Number of samples (used by the generated data sets, and as an upper
    bound for 'phishing', where None means all samples).

    n_perspectives : int
    Number of perspectives (used by the generated data sets). For
    'clusters', it is replaced by len(n_clusters) when n_clusters is not
    an int.

    **kwargs
    'clusters' and 'clusters2' require n_clusters (int or sequence with one
    entry per perspective). For 'clusters', remaining keyword arguments are
    forwarded to clusters.clusters(); for 'mnist', all keyword arguments
    are forwarded to mnist().

    Returns
    -------

    distances : list
    Per-perspective data (distances or features, depending on the data
    set). Empty if the data set name is not recognized.

    data : dict
    Additional information about the data set (e.g. colors, labels, true
    embedding and projections), depending on the data set.

    Raises
    ------

    ValueError
    If dataset is 'clusters' or 'clusters2' and n_clusters is not given.
    """
    distances = []
    data = {}
    if dataset == 'equidistant':
        # condensed distance arrays with entries close to 1
        length = n_samples * (n_samples - 1) // 2
        distances = [np.random.normal(1, 0.1, length)
                     for _ in range(n_perspectives)]
        data['image_colors'] = n_samples - 1
    elif dataset == 'disk':
        import misc, projections
        X = misc.disk(n_samples, dim=3)
        proj = projections.PROJ()
        Q = proj.generate(number=n_perspectives, method='random')
        Y = proj.project(Q, X)
        data['true_images'] = Y
        data['true_embedding'] = X
        data['true_projections'] = Q
        distances = Y
        data['image_colors'] = 0
    elif dataset == 'clusters2a':
        from clusters import createClusters
        # the first returned value is the data to return as `distances`
        distances, data['image_colors'] = \
            createClusters(n_samples, n_perspectives)
    elif dataset == 'clusters':
        n_clusters = _require_n_clusters(kwargs, dataset)
        from clusters import clusters
        data['image_classes'] = []
        data['image_colors'] = []
        if isinstance(n_clusters, int):
            n_clusters = [n_clusters] * n_perspectives
        else:
            n_perspectives = len(n_clusters)
        for i in range(n_perspectives):
            d, c = clusters(n_samples, n_clusters=n_clusters[i], **kwargs)
            distances.append(d)
            data['image_classes'].append(c)
            data['image_colors'].append(c)
    elif dataset == 'clusters2':
        n_clusters = _require_n_clusters(kwargs, dataset)
        from clusters import clusters2
        data['image_colors'] = []
        if isinstance(n_clusters, int):
            n_clusters = [n_clusters] * n_perspectives
        for persp in range(n_perspectives):
            d, c = clusters2(n_samples, n_clusters[persp])
            distances.append(d)
            data['image_colors'].append(c)
    elif dataset == '123':
        import projections
        X = np.genfromtxt(directory + '/123/123.csv', delimiter=',')
        X1 = np.genfromtxt(directory + '/123/1.csv', delimiter=',')
        X2 = np.genfromtxt(directory + '/123/2.csv', delimiter=',')
        X3 = np.genfromtxt(directory + '/123/3.csv', delimiter=',')
        proj = projections.PROJ()
        Q = proj.generate(number=3, method='cylinder')
        distances = [X1, X2, X3]
        data['true_embedding'] = X
        data['true_projections'] = Q
        data['true_images'] = [X1, X2, X3]
        data['colors'] = True
    elif dataset == 'florence':
        import florence
        distances, dictf = florence.setup()
        data.update(dictf)
    elif dataset == 'credit':
        import csv
        path = directory + '/credit/'
        Y = []
        for ind in ['1', '2', '3']:
            name = path + 'discredit3_tsne_cluster_1000_' + ind + '.csv'
            # context manager so the file is closed after reading
            with open(name) as filec:
                array = np.array(list(csv.reader(filec)), dtype='float')
            # small random perturbation added to every entry
            array += np.random.randn(len(array), len(array)) * 1e-4
            Y.append(array)
        distances = Y
    elif dataset == 'phishing':
        import phishing
        features = phishing.features
        labels = phishing.group_names
        if n_samples is None:
            n_samples = len(features[0])
        Y, perspective_labels = [], []
        for group in [0, 1, 2, 3]:
            Y.append(features[group][0:n_samples])
            perspective_labels.append(labels[group])
        sample_colors = phishing.results[0:n_samples]
        distances = Y
        data['sample_colors'] = sample_colors
        data['perspective_labels'] = perspective_labels
    elif dataset == 'mnist':
        X, data['sample_colors'] = mnist(**kwargs)
        data['features'] = X
        # split the columns at index 28*14 into two perspectives
        distances = [X[:, 0:28 * 14], X[:, 28 * 14::]]
        data['sample_classes'] = data['sample_colors']
    else:
        print('***dataset not found***')
    return distances, data
示例#9
0
    def __init__(self, data, weights=None, data_args=None,
                 fixed_embedding=None, fixed_projections=None,
                 initial_embedding=None, initial_projections=None,
                 visualization_method='mds', visualization_args={},
                 total_cost_function='rms',
                 embedding_dimension=3, image_dimension=2,
                 projection_family='linear',projection_constraint='orthogonal',
                 hidden_samples=None,
                 sample_labels=None, perspective_labels=None,
                 sample_colors=None, image_colors=None,
                 verbose=0, indent='',
                 **kwargs):
        """\
        Initializes MPSE object.

        Parameters
        ----------

        data : list, length (n_perspectives)
        List containing distance/dissimilarity/feature data for each
        perspective. Each array can be of the following forms:
        1) A 1D condensed distance array
        2) A square distance matrix
        3) An array containing features
        ***4) A dictionary describing a graph

        weights : None or string or function or array or list
        If visualization allows for it, weights to be used in computation of
        cost/gradient of each perspective.
        If a list or array is given, it must have length equal to the number
        of perspectives. Otherwise, the given weights are used for all
        perspectives. The given container is not modified.
        Each entry is processed by setup.setup_weights. The possible
        weights are:
        1) None : no weights are used
        2) string : method to compute weights based on distances
        3) function : function to compute weights based on distances
        4) array : array containing pairwise weights or node weights, depending
        on size (must be of length of distances or of samples).

        data_args : dictionary (optional) or list
        Optional arguments passed, along with data, to
        setup.setup_distances_from_multiple_perspectives().
        If a list is passed, then the length must be the number of perspectives
        and each element must be a dictionary. Then, each set of distances will
        be set up using a different set of arguments.

        fixed_embedding : array
        If an array is given, this is assumed to be the true embedding and
        by default optimization is done w.r.t. the projections only.

        fixed_projections : list or str
        If a list is given, this is assumed to be the true projections and by
        default optimization is done w.r.t. the embedding coordinates only.
        If a string is given, the projections are generated using it as the
        generation method.

        initial_embedding : array, shape (n_samples, embedding_dimension)
        If given, this is the initial embedding used. Otherwise a random
        initial embedding is produced.

        initial_projections : list or str
        If given, this is the initial projections used. If a string is given,
        the projections are generated using it as the generation method.
        Otherwise the initial projections are generated using **kwargs.

        visualization_method : str
        Visualization method. Current options are 'mds' and 'tsne'.
        The visualization method can be different for different perspectives,
        by passing a list of visualization methods instead.

        visualization_args : dict
        Dictionary with arguments to pass to each visualization method.
        Different arguments can be passed to different visualization methods
        by passing a list of dictionaries instead.

        total_cost_function : 'rms' or callable
        How the individual perspective costs are combined into a single cost.
        'rms' uses the root mean square of the individual costs. A callable
        must take the array of individual costs and return the total cost.

        embedding_dimension : int
        Dimension of embedding.

        image_dimension : int
        Dimension of image (after projection). Each perspective can have a
        different image dimension, by specifying a list instead.

        projection_family : str
        Projection family. Options are 'linear'.

        projection_constraint : str
        Constraints on projection family. Options are None, 'orthogonal',
        'similar'.

        hidden_samples : list (optional), length (n_perspectives)
        If given, must be a list with one entry per perspective. It is
        stored as self.hidden_samples.

        sample_labels : list (optional)
        List containing labels of samples (used in plots).

        perspective_labels : list (optional), length (n_perspectives)
        Labels of the perspectives. Defaults to 1,...,n_perspectives.

        sample_colors : array (optional)
        Array containing color value of samples (used in plots).

        image_colors : array-like, shape (n_perspectives, n_samples)
        Colors for each image.

        verbose : int
        If larger than 0, prints details of the setup.

        indent : str
        Prefix added to printed lines.

        **kwargs
        Forwarded to self.proj.generate() when random initial projections
        are generated, and to self.update().
        """
        self.verbose, self.indent = verbose, indent
        if verbose > 0:
            print(indent+'mview.MPSE():')

        ##set up sets of distances from data
        self.distances = setup.setup_distances_from_multiple_perspectives(
            data, data_args)
        self.n_perspectives = len(self.distances)
        self.n_samples = scipy.spatial.distance.num_obs_y(self.distances[0])

        ##set up weights from data
        if isinstance(weights, (list, np.ndarray)):
            assert len(weights) == self.n_perspectives
            # work on a copy: the entries are overwritten below and the
            # caller's list/array must not be modified
            self.weights = list(weights)
        else:
            self.weights = [weights]*self.n_perspectives
        for i in range(self.n_perspectives):
            self.weights[i] = setup.setup_weights(self.distances[i], \
                                self.weights[i], min_weight = 0)

        ##set up parameters
        self.embedding_dimension = embedding_dimension
        self.image_dimension = image_dimension
        self.projection_family = projection_family
        self.projection_constraint = projection_constraint
        proj = projections.PROJ(embedding_dimension,image_dimension,
                                projection_family,projection_constraint)
        self.proj = proj

        ##set up hidden samples
        if hidden_samples is not None:
            assert isinstance(hidden_samples, list)
            assert len(hidden_samples) == self.n_perspectives
        self.hidden_samples = hidden_samples

        if verbose > 0:
            print(indent+'  data details:')
            print(indent+f'    number of perspectives : {self.n_perspectives}')
            print(indent+f'    number of samples : {self.n_samples}')
            print(indent+'  visualization details:')
            print(indent+'    embedding dimension :',self.embedding_dimension)
            print(indent+f'    image dimension : {self.image_dimension}')
            print(indent+f'    visualization type : {visualization_method}')

        #setup sample labels:
        if sample_labels is not None:
            assert len(sample_labels) == self.n_samples
        self.sample_labels = sample_labels
        #setup perspective labels:
        if perspective_labels is None:
            perspective_labels = range(1,self.n_perspectives+1)
        else:
            assert len(perspective_labels) == self.n_perspectives
        self.perspective_labels = perspective_labels

        #setup colors:
        self.sample_colors = sample_colors
        self.image_colors = image_colors

        #setup visualization instances:
        self.visualization_instances = []
        self.visualization_method = visualization_method
        # a single method / argument dictionary is shared by all perspectives
        if isinstance(visualization_method,str):
            visualization_method = [visualization_method]*self.n_perspectives
        if isinstance(visualization_args,dict):
            visualization_args = [visualization_args]*self.n_perspectives
        for i in range(self.n_perspectives):
            assert visualization_method[i] in ['mds','tsne']
            if self.verbose > 0:
                print('  setup visualization instance for perspective',
                      self.perspective_labels[i],':')
            if visualization_method[i] == 'mds':
                vis = mds.MDS(self.distances[i],
                              weights = self.weights[i],
                              embedding_dimension=self.image_dimension,
                              verbose=self.verbose, indent=self.indent+'    ',
                              **visualization_args[i])
            elif visualization_method[i] == 'tsne':
                vis = tsne.TSNE(self.distances[i],
                                embedding_dimension=self.image_dimension,
                                verbose=self.verbose, indent=self.indent+'    ',
                                **visualization_args[i])
            self.visualization_instances.append(vis)
        self.visualization = self.visualization_instances

        #setup objectives:
        if total_cost_function == 'rms':
            self.total_cost_function = lambda individual_costs : \
                np.sqrt(np.sum(individual_costs**2)/self.n_perspectives)
        else:
            assert callable(total_cost_function)
            self.total_cost_function = total_cost_function
        def cost_function(X,Q,Y=None,**kwargs):
            """\
            Returns total cost and array of individual costs for embedding X
            and projections Q (images Y are computed if not given).
            """
            if Y is None:
                Y = self.proj.project(Q,X)
            individual_costs = np.zeros(self.n_perspectives)
            for k in range(self.n_perspectives):
                individual_costs[k] = \
                    self.visualization[k].objective(Y[k],**kwargs)
            cost = self.total_cost_function(individual_costs)
            return cost, individual_costs
        self.cost_function = cost_function

        #setup gradient function:
        if self.projection_family == 'linear':
            def gradient(embedding,projections,batch_size=None,indices=None,
                         return_embedding=True,return_projections=True,
                         return_cost=True, return_individual_costs=False):
                """\
                Returns MPSE gradient(s), along with cost and individual costs
                (optional).

                Parameters
                ----------

                embedding : numpy array
                Current embedding.

                projections : numpy array
                Current projections (as a single array).

                batch_size : int or None
                Passed to the gradient of each visualization instance.

                indices : array or None
                Passed to the gradient of each visualization instance.

                return_embedding : boolean
                If True, returns MPSE gradient w.r.t. embedding.

                return_projections : boolean
                If True, returns MPSE gradient w.r.t. projections.

                return_cost : boolean
                If True, returns MPSE cost.

                return_individual_costs : boolean
                If True, returns individual embedding costs.
                """
                if return_embedding:
                    dX = np.zeros(embedding.shape)
                if return_projections:
                    dQ = []
                individual_costs = np.empty(self.n_perspectives)
                Y = self.proj.project(projections,embedding)
                for k in range(self.n_perspectives):
                    dY_k, cost_k = self.visualization[k].gradient(
                        Y[k],batch_size=batch_size,indices=indices)
                    individual_costs[k] = cost_k
                    if return_embedding:
                        # NOTE(review): the slice is hard-coded to 2x3, which
                        # matches the default image/embedding dimensions only
                        # -- confirm behavior for other dimensions.
                        dX += dY_k @ projections[k][:2, :3]
                    if return_projections:
                        dQ.append(dY_k.T @ embedding)
                if return_embedding:
                    dX /= self.n_perspectives
                cost = self.total_cost_function(individual_costs)
                if return_embedding is False:
                    grad = np.array(dQ)
                elif return_projections is False:
                    grad = dX
                else:
                    grad = [dX,np.array(dQ)]
                if return_individual_costs:
                    return grad, cost, individual_costs
                else:
                    return grad, cost
            self.gradient = gradient
        else:
            def gradient_X(X,Q,Y=None):
                """\
                Returns gradient w.r.t. the embedding X for projections Q.
                """
                pgradient = self.proj.compute_gradient(X[0],params_list=Q)
                if Y is None:
                    Y = self.proj.project(X,params_list=Q)
                gradient = np.zeros((self.n_samples,self.embedding_dimension))
                for k in range(self.n_perspectives):
                    gradient += self.visualization[k].gradient(Y[k]) \
                                @ pgradient[k]
                return gradient
            self.gradient_X = gradient_X

        #set up initial embedding and projections (fixed optional):
        if verbose > 0:
            print(indent+'  initialize:')
        #set fixed and initial embedding:
        if fixed_embedding is not None:
            if verbose > 0:
                print(indent+'    fixed embedding : True')
            self.embedding = fixed_embedding
            self.initial_embedding = fixed_embedding
            self.fixed_embedding = True
        else:
            if verbose > 0:
                print(indent+'    fixed embedding : False')
            if initial_embedding is None:
                if verbose > 0:
                    print(indent+'    initial embedding : random')
                self.initial_embedding = misc.initial_embedding(
                    self.n_samples,dim=self.embedding_dimension, radius=1)
            else:
                assert isinstance(initial_embedding,np.ndarray)
                assert initial_embedding.shape == (
                    self.n_samples, self.embedding_dimension)
                if verbose > 0:
                    print(indent+'    initial embedding : given')
                self.initial_embedding = initial_embedding
            self.embedding = self.initial_embedding
            self.fixed_embedding = False
        #set fixed and initial projections:
        if fixed_projections is not None:
            if isinstance(fixed_projections,str):
                fixed_projections = self.proj.generate(number= \
                            self.n_perspectives,method=fixed_projections)
            assert(all([isinstance(fp,np.ndarray) for fp in fixed_projections]))
            # NOTE(review): fixed projections are truncated to their leading
            # 2x3 block (hard-coded) -- confirm for non-default dimensions.
            fixed_projections = [f[:2, :3] for f in fixed_projections]
            self.projections = fixed_projections
            self.initial_projections = fixed_projections
            self.fixed_projections = True
            if verbose > 0:
                print(indent+'    fixed projections : True')
        else:
            if verbose > 0:
                print(indent+'    fixed projections : False')
            if initial_projections is None:
                if verbose > 0:
                    print(indent+'    initial projections : random')
                self.initial_projections = self.proj.generate(
                    number=self.n_perspectives, **kwargs)
            else:
                if verbose > 0:
                    print(indent+'    initial projections : given')
                if isinstance(initial_projections,str):
                    initial_projections = self.proj.generate(number= \
                            self.n_perspectives,method=initial_projections)
                self.initial_projections = initial_projections
            self.projections = self.initial_projections
            self.fixed_projections = False
        # only report the projections when output was requested
        if verbose > 0:
            print(indent+' Projection is:')
            print(self.projections)

        self.initial_cost = None
        self.initial_individual_cost = None
        self.computation_history = []
        self.time = 0
        self.update(**kwargs)