def create_2dprojection(distmat):
    # uses metric MDS (not isomap, despite the original comment) to return a
    # species distance map in 3D, based on the topological distmat of all
    # species in the tree; note the function name says 2d but n_components=3
    print('map to 3d space')
    mapper = MDS(n_components=3, metric=True, n_init=4, max_iter=300,
                 verbose=0, eps=0.001, n_jobs=-1, random_state=0,
                 dissimilarity='precomputed')
    projmat = mapper.fit_transform(distmat)
    print('DONE')
    return projmat
def mds(similarity, euclid=False):
    if euclid:
        model = MDS(max_iter=1000)
        result = model.fit_transform(similarity)
    else:
        # 'precomputed' expects dissimilarities, so convert the similarity matrix
        model = MDS(max_iter=1000, dissimilarity='precomputed')
        result = model.fit_transform(1 - similarity)
    return result.T
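# Note: the `1 - similarity` conversion above assumes similarities in [0, 1]
# with ones on the diagonal. A minimal, hedged sketch (toy data, not from the
# snippet above) of turning a cosine-similarity matrix into a valid
# 'precomputed' dissimilarity:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS

_X = np.random.RandomState(0).rand(20, 5)
_sim = cosine_similarity(_X)         # values in [-1, 1], ones on the diagonal
_dist = 1 - _sim                     # non-negative because sim <= 1
np.fill_diagonal(_dist, 0.0)         # guard against floating-point noise
_dist = (_dist + _dist.T) / 2        # enforce exact symmetry for 'precomputed'
_coords = MDS(n_components=2, dissimilarity='precomputed',
              random_state=0).fit_transform(_dist)
print(_coords.shape)                 # (20, 2)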
def project_in_2D(distance_mat, method='mds'):
    """
    Project SDRs onto a 2D space using manifold learning algorithms
    :param distance_mat: A square matrix with pairwise distances
    :param method: Select method from 'mds' and 'tSNE'
    :return: an array with dimension (numSDRs, 2). It contains the 2D
             projections of each SDR
    """
    seed = np.random.RandomState(seed=3)
    if method == 'mds':
        mds = MDS(n_components=2, max_iter=3000, eps=1e-9,
                  random_state=seed, dissimilarity="precomputed", n_jobs=1)
        pos = mds.fit(distance_mat).embedding_
        # refine the metric solution with non-metric MDS, initialized at `pos`
        nmds = MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12,
                   dissimilarity="precomputed", random_state=seed,
                   n_jobs=1, n_init=1)
        pos = nmds.fit_transform(distance_mat, init=pos)
    elif method == 'tSNE':
        tsne = TSNE(n_components=2, init='pca', random_state=0)
        pos = tsne.fit_transform(distance_mat)
    else:
        raise NotImplementedError
    return pos
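# Minimal usage sketch for project_in_2D above, assuming a small symmetric
# pairwise-distance matrix (toy data, not from the original code):
import numpy as np
from scipy.spatial.distance import pdist, squareform

_rng = np.random.RandomState(3)
_sdrs = _rng.rand(10, 32)                 # 10 toy vectors standing in for SDRs
_dist_mat = squareform(pdist(_sdrs))      # square, symmetric distance matrix
_pos = project_in_2D(_dist_mat, method='mds')
print(_pos.shape)                         # (10, 2)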
def main():
    digits = load_digits()
    X = digits.data
    y = digits.target
    mds = MDS()
    X_mds = mds.fit_transform(X)
    plot_embedding(X_mds, y)
def main():
    args = docopt(__doc__)
    is_mds = args['--mds']
    # load datasets
    digits = load_digits()
    X = digits.data
    y = digits.target
    labels = digits.target_names
    # dimension reduction
    if is_mds:
        model = MDS(n_components=2)
    else:
        model = PCA(n_components=2)
    X_fit = model.fit_transform(X)
    for i in range(labels.shape[0]):
        plt.scatter(X_fit[y == i, 0], X_fit[y == i, 1],
                    color=COLORS[i], label=str(i))
    plt.legend(loc='upper left')
    plt.autoscale()
    plt.grid()
    plt.show()
def scale_plot(input_data, data_colors=None, cluster_colors=None,
               cluster_sizes=None, dissimilarity='euclidean', filey=None):
    """ Plot MDS of data and clusters; relies on a module-level `n_clusters` """
    if data_colors is None:
        data_colors = 'r'
    if cluster_colors is None:
        cluster_colors = 'b'
    if cluster_sizes is None:
        cluster_sizes = 2200
    # scale
    mds = MDS(dissimilarity=dissimilarity)
    mds_out = mds.fit_transform(input_data)
    with sns.axes_style('white'):
        f = plt.figure(figsize=(14, 14))
        plt.scatter(mds_out[n_clusters:, 0], mds_out[n_clusters:, 1],
                    s=75, color=data_colors)
        plt.scatter(mds_out[:n_clusters, 0], mds_out[:n_clusters, 1],
                    marker='*', s=cluster_sizes, color=cluster_colors,
                    edgecolor='black', linewidth=2)
        # plot cluster number
        offset = .011
        font_dict = {'fontsize': 17, 'color': 'white'}
        for i, (x, y) in enumerate(mds_out[:n_clusters]):
            if i < 9:
                plt.text(x - offset, y - offset, i + 1, font_dict)
            else:
                plt.text(x - offset * 2, y - offset, i + 1, font_dict)
        if filey is not None:
            plt.title(path.basename(filey)[:-4], fontsize=20)
            save_figure(f, filey)
            plt.close()
def plotFlatClusterGraph(tf_idf_matrix, clusters, headlines_utf):
    dist = 1 - cosine_similarity(tf_idf_matrix)
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(dist)
    xs, ys = pos[:, 0], pos[:, 1]
    cluster_colors = {0: '#FE642E', 1: '#B40404', 2: '#D7DF01', 3: '#01DF01',
                      4: '#00FFBF', 5: '#2E64FE', 6: '#8904B1', 7: '#FA58F4',
                      8: '#FE2E9A', 9: '#A4A4A4'}
    # create data frame that has the result of the MDS plus the cluster numbers and titles
    df = pandas.DataFrame(dict(x=xs, y=ys, label=clusters, title=headlines_utf))
    groups = df.groupby('label')
    # set up plots
    fig, ax = plt.subplots(figsize=(17, 9))  # set size
    # iterate through groups to layer the plots
    for name, group in groups:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
                color=cluster_colors[name], mec='none')
    ax.set_aspect('auto')
    ax.tick_params(axis='x', which='both', bottom=False, top=False,
                   labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, right=False,
                   labelleft=False)
    ax.legend(numpoints=1)  # show legend with only 1 point
    # add label in x,y position with the label as the film title
    for t_n in range(len(df)):
        ax.text(df.iloc[t_n]['x'], df.iloc[t_n]['y'],
                df.iloc[t_n]['title'], size=8)
    plt.savefig('../plots/flat_clusters.png', dpi=400)
def reorder_channels_by_xyz_coord(data, channel_names=None):
    """
    :param data: 2-d array in the format [n_samples, n_channels]
    :param channel_names: names of the EEG channels
    :return: data, channel_names permutated accordingly
    """
    # work on transposed view, i.e. [channel, samples]
    data = data.T
    # map channels to 1-d coordinates through MDS
    from sklearn.manifold import MDS
    distances = compute_electrode_distance_matrix()
    mds = MDS(n_components=1, dissimilarity='precomputed')
    projection = mds.fit_transform(distances).reshape(data.shape[0])
    order = np.argsort(projection)
    print(mds.stress_)
    print(order)
    # re-order channels
    data = data[order]
    # restore initial axes layout
    data = data.T
    # re-order channel_names
    channel_names = reorder_channel_names(channel_names, order)
    return data, channel_names
def embed_two_dimensions(data, vectorizer, size=10, n_components=5,
                         colormap='YlOrRd'):
    if hasattr(data, '__iter__'):
        iterable = data
    else:
        raise Exception('ERROR: Input must be iterable')
    import itertools
    iterable_1, iterable_2 = itertools.tee(iterable)
    # get labels
    labels = []
    for graph in iterable_2:
        label = graph.graph.get('id', None)
        if label:
            labels.append(label)
    # transform iterable into sparse vectors
    data_matrix = vectorizer.transform(iterable_1)
    # embed high dimensional sparse vectors in 2D
    from sklearn import metrics
    distance_matrix = metrics.pairwise.pairwise_distances(data_matrix)
    from sklearn.manifold import MDS
    feature_map = MDS(n_components=n_components, dissimilarity='precomputed')
    explicit_data_matrix = feature_map.fit_transform(distance_matrix)
    from sklearn.decomposition import TruncatedSVD
    pca = TruncatedSVD(n_components=2)
    low_dimension_data_matrix = pca.fit_transform(explicit_data_matrix)
    plt.figure(figsize=(size, size))
    embed_dat_matrix_two_dimensions(low_dimension_data_matrix, labels=labels,
                                    density_colormap=colormap)
    plt.show()
def visualize_clusters(tfidf_matrix, vocabulary, km):
    # calculate the cosine distance between each document;
    # this will be used for plotting on a euclidean (2-dimensional) plane.
    dist = 1 - cosine_similarity(tfidf_matrix)
    clusters = km.labels_.tolist()
    # two components as we are plotting points in a two-dimensional plane
    # 'precomputed' because we provide a distance matrix
    # we will also specify 'random_state' so the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]
    # set up colors per cluster using a dict
    cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3',
                      3: '#e7298a', 4: '#66a61e', 5: '#99cc00'}
    # set up cluster names using a dict (perhaps using the top terms of each cluster)
    cluster_names = {0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5'}
    # create data frame that has the result of the MDS plus the cluster numbers
    df = pd.DataFrame(dict(x=xs, y=ys, label=clusters))
    # group by cluster
    groups = df.groupby('label')
    # set up plot
    fig, ax = plt.subplots(figsize=(17, 9))  # set size
    ax.margins(0.05)  # optional, just adds 5% padding to the autoscaling
    # iterate through groups to layer the plot; the cluster_names and
    # cluster_colors dicts use the 'name' lookup to return the right color/label
    for name, group in groups:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
                label=cluster_names[name], color=cluster_colors[name],
                mec='none')
    ax.set_aspect('auto')
    ax.tick_params(axis='x', which='both', bottom=False, top=False,
                   labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, right=False,
                   labelleft=False)
    ax.legend(numpoints=1)  # show legend with only 1 point
    plt.show()  # show the plot
def generate_cluster_plot_frame(self):
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    dist = 1 - cosine_similarity(self.tfidf_matrix)
    pos = mds.fit_transform(dist)
    xs, ys = pos[:, 0], pos[:, 1]
    self.cluster_plot_frame = pd.DataFrame(dict(x=xs, y=ys,
                                                label=self.clusters,
                                                chapter=self.chapter_list,
                                                book=self.book_list))
def plot_clusters(num_clusters, feature_matrix, cluster_data, movie_data,
                  plot_size=(16, 8)):
    # generate random color for clusters
    def generate_random_color():
        color = '#%06x' % random.randint(0, 0xFFFFFF)
        return color
    # define markers for clusters
    markers = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd']
    # build cosine distance matrix
    cosine_distance = 1 - cosine_similarity(feature_matrix)
    # dimensionality reduction using MDS
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    # get coordinates of clusters in new low-dimensional space
    plot_positions = mds.fit_transform(cosine_distance)
    x_pos, y_pos = plot_positions[:, 0], plot_positions[:, 1]
    # build cluster plotting data
    cluster_color_map = {}
    cluster_name_map = {}
    for cluster_num, cluster_details in cluster_data.items():
        # assign cluster features to unique label
        cluster_color_map[cluster_num] = generate_random_color()
        cluster_name_map[cluster_num] = ', '.join(
            cluster_details['key_features'][:5]).strip()
    # map each unique cluster label with its coordinates and movies
    cluster_plot_frame = pd.DataFrame({
        'x': x_pos,
        'y': y_pos,
        'label': movie_data['Cluster'].values.tolist(),
        'title': movie_data['Title'].values.tolist()
    })
    grouped_plot_frame = cluster_plot_frame.groupby('label')
    # set plot figure size and axes
    fig, ax = plt.subplots(figsize=plot_size)
    ax.margins(0.05)
    # plot each cluster using co-ordinates and movie titles
    for cluster_num, cluster_frame in grouped_plot_frame:
        marker = markers[cluster_num] if cluster_num < len(markers) \
            else np.random.choice(markers, size=1)[0]
        ax.plot(cluster_frame['x'], cluster_frame['y'], marker=marker,
                linestyle='', ms=12, label=cluster_name_map[cluster_num],
                color=cluster_color_map[cluster_num], mec='none')
    ax.set_aspect('auto')
    ax.tick_params(axis='x', which='both', bottom=False, top=False,
                   labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, right=False,
                   labelleft=False)
    fontP = FontProperties()
    fontP.set_size('small')
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.01), fancybox=True,
              shadow=True, ncol=5, numpoints=1, prop=fontP)
    # add labels as the film titles
    for index in range(len(cluster_plot_frame)):
        ax.text(cluster_plot_frame.iloc[index]['x'],
                cluster_plot_frame.iloc[index]['y'],
                cluster_plot_frame.iloc[index]['title'], size=8)
    # show the plot
    plt.show()
def mds_embed(graph):
    sorted_node_list = sorted(list(graph.nodes()), key=len)
    dmat = nx.floyd_warshall_numpy(graph, nodelist=sorted_node_list)
    gmds = MDS(n_jobs=-2, dissimilarity="precomputed")
    embed_pts = gmds.fit_transform(dmat)
    return (embed_pts, dmat, sorted_node_list)
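# Hedged usage sketch for mds_embed above on a toy graph (assumes networkx is
# imported as nx, as in the function; toy node names sortable by length):
import networkx as nx

_g = nx.path_graph(['a', 'bb', 'ccc', 'dddd'])
_pts, _dmat, _nodes = mds_embed(_g)
print(_pts.shape)   # (4, 2): one 2-D point per node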
def compute_2d_mapping(layout):
    sphere_coords = layout.sphere_coords()
    radius = layout.sphere_radius()
    from sklearn.manifold import MDS
    distances = compute_electrode_distance_matrix(sphere_coords, radius)
    mds = MDS(n_components=2, dissimilarity='precomputed')
    projection = mds.fit_transform(distances)
    # print(projection.shape)
    return projection
def convert_matrix_to_coordinates(sym_matrix, components):
    """
    :param sym_matrix: array, [n_samples, n_samples]
    :param components: int: 2 or 3 for MDS
    :return: Output of MDS, xy or xyz coordinates as 2d numpy array
             with shape [n_samples, components]
    """
    # Create coordinates based on multi dimensional scaling
    mds = MDS(n_components=components, dissimilarity="precomputed",
              random_state=1)
    coordinates = mds.fit_transform(sym_matrix)
    return coordinates
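# For illustration, convert_matrix_to_coordinates above accepts any symmetric
# distance matrix; a small sketch with toy data:
import numpy as np
from scipy.spatial.distance import pdist, squareform

_toy = squareform(pdist(np.random.rand(8, 4)))  # [8, 8] symmetric matrix
_xy = convert_matrix_to_coordinates(_toy, components=2)
print(_xy.shape)                                # (8, 2)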
def plotMDS(X, Y):
    # computes and plots MDS (measure for how well data separates)
    D = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(X))
    tmodel = MDS(n_components=2, dissimilarity='precomputed')
    X2D = tmodel.fit_transform(D)
    plt.figure()
    plt.title('MDS')
    plt.xlabel('MDS1')  # first MDS axis on x, second on y
    plt.ylabel('MDS2')
    plt.scatter(X2D[:, 0], X2D[:, 1], c=Y)
    plt.show()
def mds(cos_simil_mtr):
    # two components as we're plotting points in a two-dimensional plane
    # "precomputed" because we provide a distance matrix
    # we will also specify `random_state` so the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(cos_simil_mtr)  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]
    return xs, ys
def generate_cluster_plot_frame(self):
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    dist = 1 - cosine_similarity(self.tfidf_matrix)
    pos = mds.fit_transform(dist)
    xs, ys = pos[:, 0], pos[:, 1]
    cluster_data = dict()
    cluster_data["x"] = xs
    cluster_data["y"] = ys
    cluster_data["label"] = self.clusters
    cluster_data["presentation"] = self.presentation_list
    cluster_data["innovation_list"] = self.innovation_list
    self.cluster_plot_frame = pd.DataFrame(cluster_data)
def mds_mapping(X, n_components=2, max_iter=500, n_jobs=-1, random_state=None):
    """MDS scaling applied to data matrix X

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The data matrix
    n_components : int, optional
        Dimensionality of the reduced mapping
    max_iter : int, optional
        Max number of iterations
    n_jobs : int, optional
        Number of compute jobs when fitting scale. -1 means number of
        processors on the current computer.
    random_state : int, optional
        Generator used to initialize; set a fixed integer to reproduce
        results for debugging.

    Returns
    -------
    mds_embedding : MDS object
        The embedding object.
    X_transformed : array, shape (n_samples, n_components)
        The transformed data.

    Examples
    --------
    >>> data = np.random.rand(5, 10)
    >>> MDS_reduced, transformed_data = mds_mapping(data, n_components=3)
    >>> transformed_data.shape
    (5, 3)
    """
    mds_embedding = MDS(n_components=n_components, max_iter=max_iter,
                        n_jobs=n_jobs, random_state=random_state)
    # fit_transform both fits and returns the embedding in one call
    X_transformed = mds_embedding.fit_transform(X)
    return mds_embedding, X_transformed
def transform_and_plot_data(seed, distance_matrix, dim_x, dim_y, title,
                            plot3D, ax):
    if plot3D:
        n_components = 3
    else:
        n_components = 2
    mds = MDS(n_components=n_components, max_iter=3000, eps=1e-9,
              random_state=seed, dissimilarity="precomputed", n_jobs=1)
    transformed_data = mds.fit_transform(distance_matrix)
    corner_points, pair_list = create_pairs_to_plot_from_list(
        transformed_data, dim_x, dim_y)
    if plot3D:
        my_plot3D(corner_points, pair_list, False, title, ax)
    else:
        my_plot2D(corner_points, pair_list, False, title, ax)
def mds_bib_data_with_sklearn(fname):
    bib_data = get_bib_data()
    mat, years, term_list, years_cnt = get_year_by_term_mat(bib_data, freq=5)
    # Euclidean-based MDS
    aMDS = MDS(n_components=2, dissimilarity='euclidean')
    coords = aMDS.fit_transform(mat)
    fig = plt.figure()
    fig.clf()
    for label, x, y in zip(years, coords[:, 0], coords[:, 1]):
        plt.annotate(label, xy=(x, y))
    plt.savefig(fname)
def mds(self, n_components=2, dissimilarity='precomputed', show=False):
    """
    Calculates lower-dimension coordinates using the MDS algorithm.
    This requires sklearn ver 0.14 due to the dissimilarity argument.

    :param n_components: dimensionality of the reduced space.
    :type n_components: int, optional
    :param show: Shows the calculated coordinates if true.
    :type show: boolean, optional
    """
    model = MDS(n_components=n_components, dissimilarity=dissimilarity,
                max_iter=100)
    self.pos = model.fit_transform(self.dismat)
    if show:
        return self.pos
def js_MMDS(distributions, **kwargs):
    """Dimension reduction via Jensen-Shannon Divergence & Metric
    Multidimensional Scaling

    Parameters
    ----------
    distributions : array-like, shape (`n_dists`, `k`)
        Matrix of distribution probabilities.
    **kwargs :
        Keyword arguments to be passed to `sklearn.manifold.MDS()`

    Returns
    -------
    mmds : array, shape (`n_dists`, 2)
    """
    dist_matrix = squareform(pdist(distributions, metric=_jensen_shannon))
    model = MDS(n_components=2, random_state=0, dissimilarity='precomputed',
                **kwargs)
    return model.fit_transform(dist_matrix)
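# Short usage sketch for js_MMDS above, assuming the _jensen_shannon helper
# referenced in the function is in scope and rows are probability vectors
# (toy data, not from the original project):
import numpy as np

_rng = np.random.RandomState(0)
_dists = _rng.rand(12, 30)
_dists /= _dists.sum(axis=1, keepdims=True)  # normalize rows to probabilities
_coords = js_MMDS(_dists, n_init=4)          # extra kwargs are passed to MDS()
print(_coords.shape)                         # (12, 2)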
def MDSPlots(images, compressed):
    """ generator of pyplot figures """
    from sklearn.manifold import MDS
    mds = MDS(n_components=2, dissimilarity="precomputed")
    print("calculating similarities")
    from scipy.spatial.distance import squareform, pdist
    # similarities = squareform(pdist(compressed, 'mahalanobis'))
    similarities = squareform(pdist(compressed, 'euclidean'))
    print("fitting mds")
    coords = mds.fit_transform(similarities)
    import visualize as viz
    print("create figure")
    fig = viz.imgScatter(coords, images)
    return fig
def multidimensioanl_scaling(file_name, dimension, label):
    balls = np.loadtxt(file_name)
    matrix = balls[:, 0:dimension]
    new_matrix = convert_angles_to_cos_sin(matrix)
    mds = MDS(n_components=2, metric=True, n_init=4, max_iter=300, verbose=0,
              eps=1e-6, n_jobs=1, random_state=None, dissimilarity='euclidean')
    transformed_matrix = mds.fit_transform(new_matrix)
    ball_coords = np.zeros((balls.shape[0], dimension + 3))
    for i in range(balls.shape[0]):
        ball_coords[i, 0:dimension] = balls[i, 0:dimension].tolist()
        ball_coords[i, dimension:dimension + 2] = transformed_matrix[i]
        if label == 'cluster':
            ball_coords[i, dimension + 2] = balls[i, dimension].tolist()
        elif label == 'eq':
            ball_coords[i, dimension + 2] = (-0.0019872041 * 300 * np.log(
                abs(balls[i, dimension + 1]))).tolist()
        elif label == 'committor':
            ball_coords[i, dimension + 2] = (balls[i, dimension + 2] /
                                             abs(balls[i, dimension + 1])).tolist()
        print(' '.join([str(x) for x in ball_coords[i, :]]))
def perform(rank_num, group, minCnt=30, dim=2, tsne=False):
    method = "MDS"
    X, y, names = load_data(
        tetraFile='db2_tetra_{0}_{1}.npy'.format(ranks[rank_num], group),
        taxonFile='db2_taxon_{0}_{1}.npy'.format(ranks[rank_num], group),
        rank=rank_num + 1, minCnt=minCnt)
    if X.shape[0] == 0:
        print("Error: sample size is 0")
        return
    mds = MDS(n_components=2, n_init=1, max_iter=100)
    X_2d = mds.fit_transform(X)
    title = "MDS of TNA of " + group
    figName = "MDS_{0}_{1}".format(ranks[rank_num][0], group)
    plot2d(X_2d, y, names, title, figName)
    print('...clustering')
    name = '{0}_{1}'.format(ranks[rank_num][0], group)
    clust = Clustering(name, method, X_2d, y)
    # d_ami, d_nmi, d_vmes, d_ari = clust.dbscan(5)
    eval_tuple = clust.agglomerative(linkage='ward', connect=True)
    clust.plotCluster(alg='ward')
    with open('result_score.txt', 'a') as f:
        mse_result = '-'
        general_str = "{rank}\t{group}\t{method}\t{size}\t{n_cluster}\t{mse}\t{dim}\t".format(
            rank=ranks[rank_num], group=group, method=method,
            n_cluster=np.unique(y).shape[0], dim=dim,
            size=X_2d.shape[0], mse=mse_result)
        # d_values_str = "{ami:0.3f}\t{nmi:0.3f}\t{vmes:0.3f}\t{ari:0.3f}\t".format(
        #     ami=d_ami, nmi=d_nmi, vmes=d_vmes, ari=d_ari)
        # line = general_str + d_values_str
        line = general_str
        # integer division keeps the loop bound an int under Python 3
        for i in range(len(eval_tuple) // 4):
            a_ami, a_nmi, a_vmes, a_ari = (eval_tuple[i], eval_tuple[i + 1],
                                           eval_tuple[i + 2], eval_tuple[i + 3])
            a_values_str = "{ami:0.3f}\t{nmi:0.3f}\t{vmes:0.3f}\t{ari:0.3f}\t".format(
                ami=a_ami, nmi=a_nmi, vmes=a_vmes, ari=a_ari)
            line += a_values_str
        # END - for i
        line += '\n'
        f.write(line)
def plotlyplt(self):
    # Find full-D distances between texts
    dist = 1 - cosine_similarity(self.dtm)
    # Dimensionality reduction to 3-D
    mds = MDS(n_components=3, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
    pos = np.array(pos)
    # Get array of names to show on plot
    self.names = [os.path.basename(fn).replace('.txt', '')
                  for fn in self.filenames]
    scatter = dict(
        mode="markers+text",
        name="y",
        type="scatter3d",
        x=pos[:, 0], y=pos[:, 1], z=pos[:, 2],
        text=self.names,
        textfont=dict(size=8),
        marker=dict(size=2, color="rgb(23, 190, 207)")
    )
    # clusters = dict(alphahull=7, name="y", opacity=0.1, type="mesh3d",
    #                 x=pos[:, 0], y=pos[:, 1], z=pos[:, 2])
    layout = dict(
        title='cosine distance between texts',
        scene=dict(
            xaxis=dict(zeroline=False),
            yaxis=dict(zeroline=False),
            zaxis=dict(zeroline=False),
        )
    )
    # fig = dict(data=[scatter, clusters], layout=layout)
    fig = dict(data=[scatter], layout=layout)
    # Use py.iplot() for IPython notebook
    url = py.plot(fig, filename='3d point clustering cosine distance')
def reorder_channels_by_similarity(data, channel_names=None, normalize=True):
    """
    :param data: 2-d array in the format [n_samples, n_channels]
    :param channel_names: names of the EEG channels
    :param normalize: if True, normalize data first before computing distances
    :return: data, channel_names permutated accordingly
    """
    # work on transposed view
    data = data.T
    # normalize first
    if normalize:
        # work on a copy
        data_copy = data.copy()
        for c in range(data_copy.shape[0]):
            data_copy[c] -= data_copy[c].mean()
            data_copy[c] /= data_copy[c].std()
    else:
        data_copy = data
    # project to 1-d
    from sklearn.manifold import MDS
    mds = MDS(n_components=1)
    projection = mds.fit_transform(data_copy).reshape(data_copy.shape[0])
    # print(projection.shape)
    order = np.argsort(projection)
    # print(order)
    # the operation is not happening "in-place": a copy of the
    # subarray in sorted order is made, and then its contents
    # are written back to the original array
    data = data[order]
    # restore initial axes layout
    data = data.T
    # re-order channel_names
    channel_names = reorder_channel_names(channel_names, order)
    return data, channel_names
def get_mds(dist_matrix, n_dims=3, metric=True, rotate=True, normalize=True,
            random_state=SEED):
    transformer = MDS(
        n_components=n_dims,
        metric=metric,
        n_init=4,
        max_iter=300,
        verbose=1,
        eps=0.001,
        n_jobs=3,
        dissimilarity="precomputed",
        random_state=random_state,
    )
    transformed = transformer.fit_transform(dist_matrix)
    if rotate:
        # Rotate the data to a hopefully consistent orientation
        clf = PCA(n_components=n_dims)
        transformed = clf.fit_transform(transformed)
    if normalize:
        transformed = normalize_var(transformed)
    return transformed
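# get_mds above relies on a module-level SEED and a normalize_var helper
# defined elsewhere; a plausible sketch of that helper, consistent with how it
# is used (an assumption, not the original implementation):
import numpy as np

def normalize_var(X):
    # hypothetical helper: center each column and scale it to unit variance
    return (X - X.mean(axis=0)) / X.std(axis=0)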
def multidim(X, vectorizerType="tf", notitles=False, metric="euclidean"):
    """Multidimensional scaling on a books x word count array

    Args
    ----
    X: ndarray
        The array of term frequencies or TF-IDF

    Returns
    -------
    out: ndarray
    """
    multi = MDS()
    # Provides the points to plot each of the books
    out = multi.fit_transform(X)
    min_x, min_y = np.min(out, axis=0)
    max_x, max_y = np.max(out, axis=0)
    plt.clf()
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.ylim((min_y - 0.5, max_y + 0.5))
    plt.xlim((min_x - 0.5, max_x + 0.5))
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title('Multi-Dimensional Scaling of Antiquity Texts')
    for i, book in enumerate(abb):  # `abb` holds the book abbreviations
        ax.annotate(book, xy=out[i])
    plt.tight_layout()
    if notitles:
        name = "MDS_{}_notitles_{}.pdf".format(vectorizerType, metric)
    else:
        name = "MDS_{}_{}.pdf".format(vectorizerType, metric)
    plt.savefig(name)
    return out
n = 20
print('top %d terms per group' % n, '\n')
for i in range(k):
    print('group %d content:' % i, '\n')
    for ind in order_centroids[i, :n]:
        print(' %s' % terms1[ind] + ', ')
    print('-----')

len(terms1)

## MDS for visualisation; MDS is inefficient for medium to large data sets,
## use PCA/simple MDS instead
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.manifold import MDS

mds = MDS(n_components=2, dissimilarity='precomputed', random_state=1)
mdl2 = mds.fit_transform(dist)
xs, ys = mdl2[:, 0], mdl2[:, 1]

group_col = {0: '#1b9e77', 1: '#d95f02'}
group_nom = {0: 'group1', 1: 'group2'}

df = pd.DataFrame(dict(x=xs, y=ys, label=class_mdl1))
df.head()
groups = df.groupby('label')

# set up plot
fig, ax = plt.subplots(figsize=(9, 9))  # set size
ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
            color=group_col[name], label=group_nom[name], mec='none')
    # assumed tail: the original call was truncated after ms=12; the color and
    # label arguments follow the group_col/group_nom dicts defined above
def transform(self, graph_file, first_node=None):
    """
    input: csv file of graph; format: start_node, end_node, weight
    output: graph, a list whose elements are tuples,
            like [(1, 2, 1), (3, 1, 1), (2, 3, 1)]
    also counts the number of nodes in G
    """
    logging.info('loading graph')
    self.graph = self.load_graph(graph_file)  # obtain an array of the graph
    self.node_count = self.find_node_count(self.graph)  # number of nodes in the graph
    self.edge_count = len(self.graph)
    print("nodes:", self.node_count)
    print("edges:", self.edge_count)
    self.node_range = range(1, self.node_count + 1)

    logging.info('computing distance matrix')
    self.distance_matrix = self.compute_distance_matrix(self.graph, self.node_count)
    # self.distance_matrix = self.nomalization_distance_mtrix(distance_matrix=self.distance_matrix)  # normalized distance matrix

    # ----------------------------- adjacency matrix -----------------------------
    self.adjacency_matrix = self.get_adjacency_matrix(self.graph, self.node_count)

    if first_node is None:
        # self.first_node = randint(0, self.node_count) + 1  # Choose the first pivot from V randomly.
        self.first_node = randint(1, self.node_count)
    else:
        self.first_node = first_node  # Specify the first pivot.

    logging.info('finding pivots')
    # dimensions = m: choose m pivots according to k-center.
    if self.pivot_select == "randomly":
        self.pivot_nodes = self.choose_pivots_randomly(
            dimension=self.dimension, number_nodes=self.node_count)
    else:
        self.pivot_nodes = self.choose_pivot_points(
            self.graph, self.dimension)  # self.pivot_nodes: a list

    logging.info('drawing graph in high dimensional space')
    # Note that the number of pivot nodes equals the dimension.
    # Format of points: G = (V, E), |V| = n, dimensions = m = pivots.
    # d(vi, pj) denotes a distance computed by Dijkstra's algorithm in G:
    #
    #         p1         p2         p3    ...    pm
    #   v1  d(v1, p1)  d(v1, p2)  d(v1, p3)    d(v1, pm)
    #   v2    .
    #   ..    .
    #   vn  d(vn, p1)             ...          d(vn, pm)
    self.points = list(
        map(lambda i: tuple(self.distance_matrix[i - 1, p - 1]
                            for p in self.pivot_nodes),
            self.node_range))

    if self.normalization is True:
        self.points = self.nomalization_distance_mtrix(
            distance_matrix=self.points)  # normalized self.points

    logging.info('project into a low dimension using PCA')
    # PCA: input array-like of shape (n_samples, n_features) = self.points;
    # output array-like of shape (n_samples, n_components) = self.transformed_points
    if self.version == "HDE-SV":
        if self.dimension == 2:
            self.transformed_points = np.array(self.points)
    if self.version == "HDE":
        # HDE uses PCA to decompose the original space.
        pca = PCA(n_components=2, copy=True)
        self.transformed_points = pca.fit_transform(self.points)
    if self.version == "HDE-Level":
        pca = PCA(n_components=3, copy=True)
        self.transformed_points = pca.fit_transform(self.points)
        pca = PCA(n_components=2, copy=True)
        self.transformed_points = pca.fit_transform(self.transformed_points)
    # replaces the initial version as in the paper; by mty 2017-8-9
    if self.version == "HDE-PIT":
        # PIT: use power iteration to compute eigenvectors for the decomposition space.
        X, S = self.covariance(self.points)
        # X = np.array(self.points).T
        # X = X.astype(float)
        U = self.poweriteration(S, epsilon=self.epsilon)
        self.transformed_points = self.decomposition_space(X, U)
        if self.node_count == (self.edge_count + 1):  # determine whether it is a tree
            FR = FR_Algorithm(number_of_nodes=self.node_count,
                              initial_temperature=self.initial_temperature,
                              cooling_factor=self.cooling_factor,
                              factor_attract=self.factor_attract,
                              factor_repulsion=self.factor_repulsion)
            # use FR to fine-tune
            self.transformed_points = FR.apply_force_directed_algorithm(
                iteration=self.fr_iteration, graph=self.graph,
                coord_decomposition=self.transformed_points)
    if self.version == "HDE-MDS":
        # HDE-MDS combines HDE with MDS.
        hde_mds = MDS()  # MDS object
        self.transformed_points = hde_mds.fit_transform(self.points)
    if self.version == "Pivot-MDS":
        # the original version of Pivot MDS.
        pivot_mds = PivotMDS(d=self.distance_matrix, pivots=self.dimension)  # PivotMDS object
        self.transformed_points = pivot_mds.optimize()
    if self.version == "HDE-FICA":
        # FICA uses FastICA to decompose the original space.
        # fun could be either 'logcosh', 'exp', or 'cube'.
        fica = FastICA(n_components=2)
        # print(np.array(self.points).shape)
        self.transformed_points = fica.fit_transform(self.points)
        # print(np.array(self.transformed_points).shape)
        # an FR fine-tuning pass was here, commented out in the original
    if self.version == "HDE-KPCA":
        # KPCA uses kernel PCA to decompose the original space.
        kpca = KernelPCA(n_components=2, kernel=self.kpca_fun, gamma=self.gamma)
        self.transformed_points = kpca.fit_transform(self.points)
    if self.version == "HDE-NMF":
        nmf = NMF(n_components=2)
        self.transformed_points = nmf.fit_transform(self.points)
    if self.version == "HDE-TruncatedSVD":
        tsvd = TruncatedSVD(n_components=2)
        self.transformed_points = tsvd.fit_transform(self.points)
    if self.version == "HDE-LDA":
        lda = LinearDiscriminantAnalysis(n_components=2)
        y = np.array([1 for _ in range(self.node_count)])
        lda = lda.fit(self.points, y=y)
        self.transformed_points = lda.transform(self.points)
    if self.version == "HDE-FR":
        pca = PCA(n_components=2, copy=True)
        self.transformed_points = pca.fit_transform(self.points)
        if self.node_count == (self.edge_count + 1):  # determine whether it is a tree
            FR = FR_Algorithm(number_of_nodes=self.node_count,
                              initial_temperature=self.initial_temperature,
                              cooling_factor=self.cooling_factor,
                              factor_attract=self.factor_attract,
                              factor_repulsion=self.factor_repulsion)
            # use FR to fine-tune
            self.transformed_points = FR.apply_force_directed_algorithm(
                iteration=self.fr_iteration, graph=self.graph,
                coord_decomposition=self.transformed_points)
    if self.version == "HDE-FICA-FR":
        fica = FastICA(n_components=2)
        self.transformed_points = fica.fit_transform(self.points)
        if self.node_count == (self.edge_count + 1):  # determine whether it is a tree
            FR = FR_Algorithm(number_of_nodes=self.node_count,
                              initial_temperature=self.initial_temperature,
                              cooling_factor=self.cooling_factor,
                              factor_attract=self.factor_attract,
                              factor_repulsion=self.factor_repulsion)
            # use FR to fine-tune
            self.transformed_points = FR.apply_force_directed_algorithm(
                iteration=self.fr_iteration, graph=self.graph,
                coord_decomposition=self.transformed_points)
    if self.version == "HDE-TSNE-FR":
        # pca = PCA(n_components=10, copy=True)
        # self.transformed_points = pca.fit_transform(self.points)
        tsne = TSNE(learning_rate=self.learning_rate,
                    init=self.init)  # 'init' must be 'pca', 'random', or a numpy array
        self.transformed_points = tsne.fit_transform(self.points)
        if self.node_count == (self.edge_count + 1):  # determine whether it is a tree
            FR = FR_Algorithm(number_of_nodes=self.node_count,
                              initial_temperature=self.initial_temperature,
                              cooling_factor=self.cooling_factor,
                              factor_attract=self.factor_attract,
                              factor_repulsion=self.factor_repulsion)
            # use FR to fine-tune
            self.transformed_points = FR.apply_force_directed_algorithm(
                iteration=self.fr_iteration, graph=self.graph,
                coord_decomposition=self.transformed_points)
    if self.version == "HDE-SPE":
        IP = SpectralEmbedding(n_components=2)
        self.transformed_points = IP.fit_transform(self.distance_matrix)
        # pca = PCA(n_components=2, copy=True)
        # self.transformed_points = pca.fit_transform(self.transformed_points)

    return self.node_count, self.edge_count
y = clusters  # predicted labels
error_analysis = contingency_matrix(x, y)

# *************************** plot ***************************
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(X)

import matplotlib.pyplot as plt
from sklearn.manifold import MDS

# two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
xs, ys = pos[:, 0], pos[:, 1]

# set up colors per cluster using a dict
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3'}
# set up cluster names using a dict
cluster_names = {0: 'first book', 1: 'third book', 2: 'second book'}

# create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters))
def apply_pca_analysis(df, params):
    '''df is held_units dataframe grouped by exp_name, exp_group, time_group;
    only contains units held over preCTA or postCTA, no units held from
    ctaTrain to ctaTest

    Parameters
    ----------

    Returns
    -------

    Raises
    ------
    '''
    bin_size = params['pca']['win_size']
    bin_step = params['pca']['step_size']
    time_start = params['pca']['time_win'][0]
    time_end = params['pca']['time_win'][1]
    smoothing = params['pca']['smoothing_win']
    n_cells = len(df)
    rd1 = df['rec1'].unique()
    rd2 = df['rec2'].unique()
    if len(rd1) > 1 or len(rd2) > 1:
        raise ValueError('Too many recording directories')

    rd1 = rd1[0]
    rd2 = rd2[0]
    units1 = list(df['unit1'].unique())
    units2 = list(df['unit2'].unique())
    dim1 = load_dataset(rd1).dig_in_mapping.set_index('channel')
    dim2 = load_dataset(rd2).dig_in_mapping.set_index('channel')
    if n_cells < 2:
        # No point if only 1 unit
        exp_name = os.path.basename(rd1).split('_')
        print('%s - %s: Not enough units for PCA analysis'
              % (exp_name[0], exp_name[-3]))
        return

    time, sa = h5io.get_spike_data(rd1, units1)
    fr_t, fr, fr_lbls = get_pca_data(rd1, units1, bin_size, step=bin_step,
                                     t_start=time_start, t_end=time_end)
    rates = fr
    labels = fr_lbls
    time = fr_t
    # Again with rec2
    fr_t, fr, fr_lbls = get_pca_data(rd2, units2, bin_size, step=bin_step,
                                     t_start=time_start, t_end=time_end)
    rates = np.vstack([rates, fr])
    labels = np.vstack([labels, fr_lbls])

    # So now rates is tastes*trial*times X units
    # Do PCA on all data, put in (trials*time) x cells 2D matrix
    # pca = MDS(n_components=2)
    pca = PCA(n_components=2)
    pc_values = pca.fit_transform(rates)
    mds = MDS(n_components=2)
    md_values = mds.fit_transform(rates)

    out_df = pd.DataFrame(labels, columns=['taste', 'trial', 'time'])
    out_df['n_cells'] = n_cells
    out_df[['PC1', 'PC2']] = pd.DataFrame(pc_values)
    out_df[['MDS1', 'MDS2']] = pd.DataFrame(md_values)

    # Compute the MDS distance metric using the full dimensional solution.
    # For each point compute distance to mean Quinine / distance to mean NaCl
    mds = MDS(n_components=rates.shape[1])
    mds_values = mds.fit_transform(rates)
    n_idx = np.where(labels[:, 0] == 'NaCl')[0]
    q_idx = np.where(labels[:, 0] == 'Quinine')[0]
    q_mean = np.mean(mds_values[q_idx, :], axis=0)
    n_mean = np.mean(mds_values[n_idx, :], axis=0)
    dist_metric = [euclidean(x, q_mean) / euclidean(x, n_mean)
                   for x in mds_values]
    assert len(dist_metric) == rates.shape[0], \
        'computed distances over wrong axis'
    out_df['dQ_v_dN_fullMDS'] = pd.DataFrame(dist_metric)

    # Do it again with raw rates
    q_mean = np.mean(rates[q_idx, :], axis=0)
    n_mean = np.mean(rates[n_idx, :], axis=0)
    raw_metric = [euclidean(x, q_mean) / euclidean(x, n_mean) for x in rates]
    assert len(raw_metric) == rates.shape[0], \
        'computed distances over wrong axis'
    out_df['dQ_v_dN_rawRates'] = pd.DataFrame(raw_metric)
    return out_df
def project_face_space(self, kind='custom', **kwargs):
    ''' Create a leave-one-out space for each target stimulus '''
    if kind == 'custom':
        # creating a target face space with all target faces
        target_all_space = self._cmdscale(self.rc_df)[0]  # face space for all recon faces
        target_all_space_df = pd.DataFrame(
            target_all_space[:, :self.dims_n], index=list(self.rc_names))
        for name in self.rc_names:  # looping over all target faces
            # computing training and target conf df without loo name
            train_conf_df = self.tr_df.drop(columns=name, index=name).copy()
            # names without loo name
            train_names = train_conf_df.index
            # computing face spaces without loo
            target_space = target_all_space_df.drop(index=name)
            target_names = target_space.index
            train_space = pd.DataFrame(
                self._cmdscale(train_conf_df)[0][:, :self.dims_n],
                index=train_names)
            train_target_space = train_space.loc[target_names, :].values
            # computing procrustes projection using default code ?
            R, s = orthogonal_procrustes(train_target_space, target_space)  # ?
            train_space.loc[name] = np.dot(
                target_all_space_df.loc[name, :], R.T) * s
            # computing procrustes projection using Matlab-imported code
            # d, Z, tform = self.procrustes(target_space.values, train_target_space)
            # train_space.loc[name] = tform['scale'] * np.dot(target_all_space_df.loc[name, :], tform['rotation']) + tform['translation']
            # sorting and assigning to the dictionary
            train_space = train_space.sort_index(axis=0)
            self.loo_dict[name] = train_space
    else:
        mds_scale = MDS(n_components=self.dims_n, dissimilarity='precomputed',
                        n_init=10, max_iter=1000)
        all_rc = mds_scale.fit_transform(self.rc_df.values)
        all_rc = pd.DataFrame(all_rc[:, :self.dims_n],
                              index=list(self.rc_names))
        for name in self.rc_names:  # looping over all recon faces
            temp_rc_names = set(self.rc_names).difference(
                {name})  # excluding leave-one-out face name
            temp_tr = self.tr_df.drop(
                columns=name, index=name)  # excluding loo row and column in training data
            temp_tr_names = temp_tr.index  # names left for training data
            temp_tr = pd.DataFrame(
                mds_scale.fit_transform(temp_tr),
                index=temp_tr_names)  # mds on training data
            temp_rc_tr = temp_tr.loc[
                temp_rc_names, :].values  # matching faces from training face space to recon
            temp_rc = self.rc_df.loc[
                temp_rc_names, temp_rc_names]  # choosing recon confusability matrix
            temp_rc = mds_scale.fit_transform(
                temp_rc)  # running mds on recon data
            R, s = orthogonal_procrustes(temp_rc_tr, temp_rc[:, :self.dims_n])
            temp_tr.loc[name] = np.dot(all_rc.loc[name, :], R) * s
            temp_tr = temp_tr.sort_index(axis=0)
            self.loo_dict[name] = temp_tr
    return self
import pickle_file
from models import hierarchy
from feature_extraction import doc2vec
from feature_extraction import tf_idf
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS

books = pickle_file.load('books.pkl')
doc = books['contents'][1:].tolist()

# Use doc2vec to perform feature extraction
feature_vectors = pickle_file.load('doec2vec.pkl')

# Use tf-idf to extract the features and compute the dissimilarity
# via cosine similarity
vectorizer, corpus_matrix, feature_names = tf_idf.get_tfidf_model(doc)
distVectors = 1 - cosine_similarity(corpus_matrix)
mds = MDS(n_components=2000, random_state=1, dissimilarity="precomputed",
          metric=True)
bookFeatures = mds.fit_transform(distVectors)

# output using feature extractor: doc2vec
hierarchy.ward_cluster(6, feature_vectors, 'doc2vec')
# output using feature extractor: tf-idf
hierarchy.ward_cluster(6, bookFeatures, 'tf-idf')
# IU - International University of Applied Science
# Machine Learning - Unsupervised Machine Learning
# Course Code: DLBDSMLUSL01
# Multi-Dimensional Scaling (MDS)

#%% import libraries
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
from sklearn.preprocessing import MinMaxScaler

#%% load the sample data set
iris = datasets.load_iris()
X = iris.data

#%% normalize the data
X_scaled = MinMaxScaler().fit_transform(X)

#%% conduct MDS on the data
mds = MDS(2, random_state=0)
X_2d = mds.fit_transform(X_scaled)

#%% plot the projected Iris data points into the reduced feature space by MDS
plt.scatter(x=X_2d[:, 0], y=X_2d[:, 1], c=iris.target)
plt.show()
# lle = LocallyLinearEmbedding(n_neighbors=5, n_components=3, eigen_solver='arpack', max_iter=3000)
# lle = LocallyLinearEmbedding(n_components=3, eigen_solver='arpack', geom=geom)
# out_coords = lle.fit_transform(masked_mean, input_type='adjacency')
# out_coords = lle.fit_transform(masked_mean)
# init = np.array([p.cm_pos for p in out_conf._nucleotides])

# Run multidimensional scaling on the average distances to find average positions
from sklearn.manifold import MDS
mds = MDS(n_components=3, metric=True, max_iter=3000, eps=1e-12,
          dissimilarity="precomputed", n_jobs=1, n_init=1)
out_coords = mds.fit_transform(masked_mean)  # , init=init)  # this one worked best

# Overwrite the system we made earlier with the coordinates calculated via MDS
for i, n in enumerate(output_system._nucleotides):
    n.cm_pos = out_coords[i]
    n._a1 = np.array([0, 0, 0])
    # since the orientation vectors are all 0, this cannot be used in a
    # simulation, but the viewer will handle it
    n._a3 = np.array([0, 0, 0])

# Write the mean structure out as a new .dat and .top pair
output_system.print_lorenzo_output("{}.dat".format(meanfile),
                                   "{}.top".format(meanfile))
print("INFO: wrote output files: {}.dat, {}.top".format(meanfile, meanfile),
      file=stderr)
fasta_sequences = SeqIO.parse(open('HW4.fas'), 'fasta')
for fasta in fasta_sequences:
    seqList.append(fasta.seq)

# initializing the hamming list of elements to zeros
hammList = np.zeros((len(seqList), len(seqList)))

# building the hamming distance matrix
# the shape of this matrix is 120 x 120
for i in range(len(seqList)):
    for j in range(len(seqList)):
        hammingDist = calcHammDist(seqList[i], seqList[j])
        hammList[i][j] = hammingDist

# performing multi-dimensional scaling on the hamming distance matrix
# n_components=2: scaling the dimensions to two
# dissimilarity='precomputed': tells MDS that the dissimilarities are already
# computed, in the form of pairwise hamming distances
embedding = MDS(n_components=2, dissimilarity='precomputed')
hamm_transformed = embedding.fit_transform(hammList)

# forming the x-coordinates and y-coordinates
x = hamm_transformed[:, 0]
y = hamm_transformed[:, 1]

# performing the visualization
scatterPlotVisualizer(x, y)

# write MDS data to a CSV file
writeMDSDataToCSV(hamm_transformed)
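# The script above assumes a calcHammDist helper defined elsewhere; a plausible
# sketch for equal-length sequences (an assumption, not the original code):
def calcHammDist(seq_a, seq_b):
    # number of positions at which two equal-length sequences differ
    if len(seq_a) != len(seq_b):
        raise ValueError('sequences must be the same length')
    return sum(a != b for a, b in zip(seq_a, seq_b))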
def plot_clusters(num_clusters, feature_matrix, cluster_data, movie_data,
                  plot_size=(16, 8)):
    # generate random color for clusters
    def generate_random_color():
        color = '#%06x' % random.randint(0, 0xFFFFFF)
        return color
    # define markers for clusters
    markers = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd']
    # build cosine distance matrix
    cosine_distance = 1 - cosine_similarity(feature_matrix)
    # dimensionality reduction using MDS
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    # get coordinates of clusters in new low-dimensional space
    plot_positions = mds.fit_transform(cosine_distance)
    x_pos, y_pos = plot_positions[:, 0], plot_positions[:, 1]
    # build cluster plotting data
    cluster_color_map = {}
    cluster_name_map = {}
    for cluster_num, cluster_details in cluster_data.items():
        # assign cluster features to unique label
        cluster_color_map[cluster_num] = generate_random_color()
        cluster_name_map[cluster_num] = ', '.join(
            cluster_details['key_features'][:5]).strip()
    # map each unique cluster label with its coordinates and movies
    cluster_plot_frame = pd.DataFrame({
        'x': x_pos,
        'y': y_pos,
        'label': movie_data['Cluster'].values.tolist(),
        'title': movie_data['Title'].values.tolist()
    })
    grouped_plot_frame = cluster_plot_frame.groupby('label')
    # set plot figure size and axes
    fig, ax = plt.subplots(figsize=plot_size)
    ax.margins(0.05)
    # plot each cluster using co-ordinates and movie titles
    for cluster_num, cluster_frame in grouped_plot_frame:
        marker = markers[cluster_num] if cluster_num < len(markers) \
            else np.random.choice(markers, size=1)[0]
        ax.plot(cluster_frame['x'], cluster_frame['y'], marker=marker,
                linestyle='', ms=12, label=cluster_name_map[cluster_num],
                color=cluster_color_map[cluster_num], mec='none')
    ax.set_aspect('auto')
    ax.tick_params(axis='x', which='both', bottom=False, top=False,
                   labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, right=False,
                   labelleft=False)
    fontP = FontProperties()
    fontP.set_size('small')
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.01), fancybox=True,
              shadow=True, ncol=5, numpoints=1, prop=fontP)
    # add labels as the film titles
    for index in range(len(cluster_plot_frame)):
        ax.text(cluster_plot_frame.iloc[index]['x'],
                cluster_plot_frame.iloc[index]['y'],
                cluster_plot_frame.iloc[index]['title'], size=8)
    plt.savefig('clusters_requirementUC.png', dpi=200)
    # show the plot
    plt.show()
from dirty_cat import SimilarityEncoder

similarity_encoder = SimilarityEncoder(similarity='ngram')
transformed_values = similarity_encoder.fit_transform(
    sorted_values.reshape(-1, 1))

#########################################################################
# Plotting the new representation using multi-dimensional scaling
# ................................................................
#
# Let's now plot a couple points at random using a low-dimensional
# representation to get an intuition of what the similarity encoder is doing:
from sklearn.manifold import MDS

mds = MDS(dissimilarity='precomputed', n_init=10, random_state=42)
# transformed values lie in the 0-1 range, so 1 - transformed_value
# yields a positive dissimilarity matrix
two_dim_data = mds.fit_transform(1 - transformed_values)
print(two_dim_data.shape)
print(sorted_values.shape)

#########################################################################
# We first quickly fit a KNN so that the plots do not get too busy:
import numpy as np

n_points = 5
np.random.seed(42)

from sklearn.neighbors import NearestNeighbors

random_points = np.random.choice(len(similarity_encoder.categories_[0]),
                                 n_points, replace=False)
))
fig.update_layout(title="heat map", xaxis_nticks=36)
fig.show()

# parallel coordinates
fig = px.parallel_coordinates(
    csv_mydata,
    color_continuous_scale=px.colors.diverging.Tealrose,
    color_continuous_midpoint=2)
fig.show()

# get PCA and MDS data; fit_transform fits and transforms in one call,
# so the separate fit() calls in the original were redundant
mds = MDS(n_components=2)
mds_data = mds.fit_transform(data)

pca = PCA(n_components=2)
pca_data = pca.fit_transform(data)

# PCA
csv_mydata['pca_x'] = np.array(pca_data).T[0]
csv_mydata['pca_y'] = np.array(pca_data).T[1]
fig = px.scatter(csv_mydata, x="pca_x", y="pca_y", color="year",
                 title="PCA chart")
fig.show()

# MDS
X95 = pca.fit_transform(X)
pca.n_components_

#%% PCA Inverse Transform
Xrestore = pca.inverse_transform(X95)
plt.plot(Xrestore[0], X[0], 'ro')

#%% Incremental PCA
X_mm = np.memmap('X.pkl', shape=(32567, 472))
from sklearn.decomposition import IncrementalPCA
inc_pca = IncrementalPCA(n_components=100, batch_size=1000)
inc_pca.fit(X_mm)

#%% MDS, Isomap, and t-SNE
from sklearn.manifold import MDS, Isomap, TSNE

mds = MDS(n_components=2)
Xmds = mds.fit_transform(X[:500, :200])
Axes3D(plt.figure()).scatter(Xmds[:, 0], Xmds[:, 1], alpha=.3)

#%% Isomap
iso = Isomap(n_components=2)
Xiso = iso.fit_transform(X[:500, :200])
Axes3D(plt.figure()).scatter(Xiso[:, 0], Xiso[:, 1], alpha=.3)

#%% t-SNE
tsne = TSNE(n_components=2, n_iter=250)
Xtsne = tsne.fit_transform(X[:500, :200])
Axes3D(plt.figure()).scatter(Xtsne[:, 0], Xtsne[:, 1], alpha=.3)

#%% PC Regression
lin_reg = LinearRegression()
scores = cross_val_score(lin_reg, X95[:, :10], Y)
scores.mean()
def calculate_and_cluster():
    global names
    global data_list
    global data_tag_map
    global matrix_list
    global data_tagged_list
    data_list = {}
    data_tag_map = {}
    data_tagged_list = {}
    matrix_list = []
    counter = 0
    # Parse the CSV file (this will be denoted by a string variable)
    with open('../../../data/sets/complete_set.csv', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            data_list[counter] = ''.join(row)
            counter += 1
    # Loop through data in range
    for data in range(0, len(data_list)):
        # Split the last token in the string
        split = data_list[data].split(" ")[-1:]
        # print(split[0], "Tag set: ", get_tag_set(split[0]))
        data_tag_map[split[0]] = get_tag_set(split[0])
    od = OrderedDict(sorted(data_tag_map.items()))
    names = []
    counter = 0
    for key, value in od.items():
        # Maintain old file name
        file_old = str(counter) + '.txt'
        tag = ''
        if len(value) == 1:
            tag = 'Tagged'
            names.append(str(counter) + "_" + tag)
            data_tagged_list[str(counter)] = True
        else:
            tag = 'Untagged'
            names.append(str(counter) + "_" + tag)
            data_tagged_list[str(counter)] = False
        # Create new file name with tagged / untagged appended
        file_new = str(counter) + '_' + tag + '.txt'
        # Rename the file for later use in colour co-ordination
        rename_file(file_old, file_new)
        counter += 1
    dataNodes = []
    for x in range(0, len(data_list)):
        dataNodes.append(data_list[x])
    # Generate matrix from file
    X = genfromtxt('matrix.csv', delimiter=',')
    # Symmetrize X to ensure the matrix is valid
    X = symmetrize(X)
    # Put matrix in a list for checking
    matrix_list = X.tolist()
    for x in range(0, len(matrix_list)):
        tagged = get_tagged(str(x))
        if not tagged:
            tag_nearest_neighbour(x)
    # Check symmetry
    print("Symmetric? " + str((X.transpose() == X).all()))
    # n_components: plotting points in a two-dimensional plane
    # dissimilarity: "precomputed" because of the distance matrix
    # random_state is fixed so we can reproduce the plot
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(X.astype(np.float64))  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]
    # Set figure size to have dimensions of at least 15 inches for the width.
    # Height can be scaled accordingly.
    plt.figure(figsize=(15, 8))
    plt.subplot(211)
    # Loop through the points, label appropriately and scatter.
    # Ensure the figure size has enough room for legend plotting. Each plot
    # must have a label; in this case the split value denoting the POI tag.
    for x, y, name in zip(xs, ys, names):
        plt.scatter(x, y, s=100, c=get_colour_tag(name.split('_', 1)[1]),
                    label=name.split('_', 1)[1])
        # plt.text(x, y, name.split('_', 1)[0])
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = OrderedDict(zip(labels, handles))
    plt.legend(by_label.values(), by_label.keys(), loc='lower center',
               ncol=4, bbox_to_anchor=(0.5, -0.6))
    plt.show()
    # Create a dendrogram
    linkage_matrix = ward(X)
    # match dendrogram to that returned by R's hclust()
    dendrogram(linkage_matrix, orientation="right")
    plt.tight_layout()
    plt.show()
print('Explained Variance', pca.explained_variance_ratio_[:2])
# Haven't yet put this info on the axes like biplot() does!
plt.show()

## COSINE SIMILARITY/MDS GRAPH
# biplot modded for MDS with Cosine Similarity

# MDS, naturally
mds = MDS(n_components=2)
new_array = np.concatenate((bag_array, grc_array), axis=0)  # incl. IG vector with suspects'
distances = 1 - cosine_similarity(new_array)

# Scales Cosine Distances into 2-D space
new_mds = mds.fit_transform(new_array)

# how we'll distinguish IG from the suspects
mds_markers = ['x'] * len(labels) + ['o']
mds_labels = labels[:]
mds_labels.append(labels[-1] + 1)

# coordinates of 2-D scaled slice vectors
xs = new_mds[:, 0]
ys = new_mds[:, 1]

# plotting points
plt.figure(1, figsize=(10, 10), dpi=200)
for i in range(len(xs)):
    plt.plot(xs[i], ys[i], marker=mds_markers[i])
    # assumed tail: the original call was truncated after ys[i]; the marker
    # argument follows the mds_markers list defined above
def k_means_cluster(self, num_clusters=8):
    """
    KMeans cluster at the document level.

    Separates samples into n groups of equal variance by within-cluster
    sum-of-squares. The square is taken of the tfidf matrix row entries,
    then summed for distance to each centroid. Clusters are initialized
    semi-randomly (initial clusters far from each other).
    """
    tfidf_vectorizer = TfidfVectorizer(max_df=0.50, max_features=200000,
                                       min_df=0.02, stop_words=STOPWORDS,
                                       use_idf=True, ngram_range=(1, 2))
    # fit the vectorizer to synopses
    tfidf_matrix = tfidf_vectorizer.fit_transform(
        [' . '.join(doc) for doc in self.corpus.tokenized])
    dist = 1 - cosine_similarity(tfidf_matrix)
    km = KMeans(n_clusters=num_clusters, max_iter=300, init='k-means++')
    km = km.fit(tfidf_matrix)
    terms = tfidf_vectorizer.get_feature_names()
    print("Top terms per cluster:")
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    for cluster in range(num_clusters):
        print("Cluster {0} words: {1}".format(
            cluster + 1,
            ' | '.join([terms[ind] for ind in order_centroids[cluster, :3]])))
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    positions = mds.fit_transform(dist)
    xs, ys = positions[:, 0], positions[:, 1]
    labels = km.labels_.tolist()
    if self.dirname:
        titles = get_file_list(self.dirname)
        titles = [title[-8:-4] for title in titles]  # KATHY ONLY: get year
    else:
        titles = [' ' + str(int(label)) for label in labels]
    df = pd.DataFrame(dict(x=xs, y=ys, label=labels, title=titles))
    # group by cluster
    groups = df.groupby('label')
    # set up plot
    fig, ax = plt.subplots(figsize=(17, 9))  # set size
    ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling
    # iterate through groups to layer the plot
    for name, group in groups:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=8,
                label='Cluster ' + str(name), mec='none')
    ax.set_aspect('auto')
    ax.tick_params(
        axis='x',           # changes apply to the x-axis
        which='both',       # both major and minor ticks are affected
        bottom=False,       # ticks along the bottom edge are off
        top=False,          # ticks along the top edge are off
        labelbottom=False)
    ax.tick_params(
        axis='y',           # changes apply to the y-axis
        which='both',       # both major and minor ticks are affected
        left=False,         # ticks along the left edge are off
        top=False,          # ticks along the top edge are off
        labelleft=False)
    ax.legend(numpoints=1)
    # add label in x,y position with the label as the film title
    for i in range(len(df)):
        ax.text(df.iloc[i]['x'], df.iloc[i]['y'], df.iloc[i]['title'], size=8)
    plt.show()
def MDSEmbedding(dimension_factor, distance_matrix):
    embedding = MDS(n_components=dimension_factor, dissimilarity='precomputed',
                    metric=True, random_state=42)
    MDS_fix_matrix = embedding.fit_transform(distance_matrix)
    return MDS_fix_matrix
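# Quick usage sketch for MDSEmbedding above with a toy precomputed distance
# matrix (toy data, not from the original project):
import numpy as np
from scipy.spatial.distance import pdist, squareform

_D = squareform(pdist(np.random.rand(10, 6)))
_emb = MDSEmbedding(2, _D)   # 10 points embedded in 2 dimensions
print(_emb.shape)            # (10, 2)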
import pickle as pk
from sklearn.manifold import MDS
import numpy as np
import matplotlib.pyplot as plt

newspaper = 'lanacion'
number_of_topics = 62

notes = [pk.load(open('Data03-05/{}_topic{}_vect.pk'.format(newspaper, i), 'rb'))
         for i in range(number_of_topics)]

diss_matrix = np.array([[np.abs(np.log(notes[i].dot(notes[j])))
                         for i in range(number_of_topics)]
                        for j in range(number_of_topics)])

mds = MDS(n_components=2, dissimilarity='precomputed')
x_mds = mds.fit_transform(diss_matrix)

plt.scatter(x_mds[:, 0], x_mds[:, 1], alpha=0.25, s=100)
for i in range(number_of_topics):
    plt.text(x_mds[i, 0], x_mds[i, 1], str(i))
plt.grid('on')
plt.show()
data = []
lang_data = []
gender_data = []
for k in utt_list:
    data.append(utt2data[k])
    lang_data.append(utt2lang[k])
    if args.utt2gender:
        gender_data.append(utt2gender[k])

data = np.matrix(data)
lang_data = np.array(lang_data)
if args.utt2gender:
    gender_data = np.array(gender_data)

embedding = MDS(n_components=2)
data_transformed = embedding.fit_transform(data)

# only include the gender column when it was actually collected, otherwise the
# DataFrame columns would have mismatched lengths
if not args.utt2gender:
    df = pd.DataFrame(dict(x=data_transformed[:, 0],
                           y=data_transformed[:, 1],
                           label=lang_data))
    groups = df.groupby('label')
else:
    df = pd.DataFrame(dict(x=data_transformed[:, 0],
                           y=data_transformed[:, 1],
                           label=lang_data,
                           gender=gender_data))
    groups = df.groupby(['label', 'gender'])

fig, ax = plt.subplots()
ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling
dist_all_rot[ind1, ind2] = sp.linalg.norm(sub_data[:, :, ind1] - sub_data_rot)
print(ind1, ind2)

sp.savez('ADHD_pairwise_dist.npz', dist_all_rot=dist_all_rot,
         dist_all_orig=dist_all_orig, normSub=normSub)

######
#%%
a = sp.load('ADHD_pairwise_dist.npz')
lst = a['lst']
q = sp.argmin(a['dist_all_rot'][:-1, :-1].sum(1))
print('The representative subject is: %s ' % lst[q])

m = MDS(n_components=2, dissimilarity='precomputed')
e = m.fit_transform(a['dist_all_rot'])
print(e)

fig, ax = plt.subplots()
ax.scatter(e[:, 0], e[:, 1])
for i in range(e.shape[0]):
    ax.annotate(lst[i], (e[i, 0], e[i, 1]))

#%% Compute difference
diff = 0
q = 3
for ind in range(15):
    Y2, _ = brainSync(X=sub_data[:, :, q], Y=sub_data[:, :, ind])
    diff += (Y2 - sub_data[:, :, q]) ** 2
    print(ind, end=',')

spio.savemat('ADHD_norm_diff2sub1.mat', {'diff': diff})
plt.axis('off')
plt.title("LLE MNIST Scatter Plot", fontsize=14)
save_fig("LLE MNIST Scatter Plot")
plt.show()

# In[27]:

mds = MDS(n_components=2, random_state=42)

# In[28]:

x_test_redux_mds = mds.fit_transform(x_test[:1000])

# In[29]:

plt.figure(figsize=(12, 10))
plt.scatter(x_test_redux_mds[:, 0], x_test_redux_mds[:, 1],
            c=y_test[:1000], cmap="gist_rainbow")
plt.colorbar()
plt.axis('off')
plt.title("MDS MNIST Scatter Plot", fontsize=14)
save_fig("MDS MNIST Scatter Plot")
plt.show()

# In[ ]:
class xRepresentation:
    """ Perform dimensionality reduction and represent 2d matrices """

    def __init__(self, **kwargs):
        """ input parameters """
        self.__n_clusters = kwargs.get('n_clusters', 2)
        self.__pca_components = kwargs.get('pca_components', 3)
        self.__tsne_components = kwargs.get('tsne_components', 3)
        self.__components = kwargs.get('components', 2)
        self.__neighbors = kwargs.get('neighbors', 2)  # 10
        self.__lsa_normalization = False
        # sparse matrix
        self.__matrix = None
        # tf-idf dataframe
        self.__tfidf_dataframe = pd.DataFrame()
        self.__corpora_names = None
        self.__feature_names = None
        #
        self.__clusters = xClusters(n_clusters=self.__n_clusters)
        # angular similarity
        self.__tfidf_angle_similarity_dataframe = pd.DataFrame()
        # pca
        self.__pca = PCA(n_components=self.__pca_components)
        # svd/lsa
        self.__svd = TruncatedSVD(n_components=self.__components,
                                  n_iter=7,
                                  random_state=33)
        # t-sne
        self.__tsne = TSNE(n_components=self.__components)
        # MDS
        self.__mds = MDS(n_components=self.__components, random_state=33)
        # manifold isomap for non-linear dimension reduction
        self.__isomap = Isomap(n_components=self.__components,
                               n_neighbors=self.__neighbors,
                               eigen_solver='auto')
        # cluster centers
        self.__kmeans_cluster_centers = None
        # normalizer
        self.__normalizer = Normalizer(copy=False)

    @property
    def clusters(self):
        """ returns an xClusters class object """
        return self.__clusters

    @property
    def matrix(self) -> np.matrix:
        return self.__matrix

    @matrix.setter
    def matrix(self, matrix: csr_matrix):
        if matrix.size:
            self.__matrix = matrix.todense()

    @property
    def tfidf_dataframe(self) -> pd.DataFrame:
        return self.__tfidf_dataframe

    @tfidf_dataframe.setter
    def tfidf_dataframe(self, df: pd.DataFrame = None):
        self.__tfidf_dataframe = df

    @property
    def corpora_names(self) -> List[str]:
        return self.__corpora_names

    @corpora_names.setter
    def corpora_names(self, names=None):
        if names:
            self.__corpora_names = names

    @property
    def feature_names(self) -> List[str]:
        return self.__feature_names

    @feature_names.setter
    def feature_names(self, names=None):
        if names:
            self.__feature_names = names

    @property
    def tfidf_angle_similarity_dataframe(self):
        return self.__tfidf_angle_similarity_dataframe

    @tfidf_angle_similarity_dataframe.setter
    def tfidf_angle_similarity_dataframe(self, df: pd.DataFrame = None):
        self.__tfidf_angle_similarity_dataframe = df

    def __normalize_matrix(self):
        transformer = self.__normalizer.fit(self.__matrix)  # fit does nothing
        self.__matrix = transformer.transform(self.__matrix)

    def __fit_pca(self):
        # Fit the model with X.
        pca = self.__pca.fit(X=self.__matrix)
        # Apply dimensionality reduction to X.
        data2d = pca.transform(X=self.__matrix)
        # pca shape
        print("PCA data shape", data2d.shape)
        # project the cluster centers onto the reduced data
        cluster_centers2d = pca.transform(self.__clusters.kmeans_cluster_centers)
        # number of components
        n_components = self.__pca.components_.shape[0]
        print("Number of PCA components", n_components)
        # cross-check
        if n_components != data2d.shape[1]:
            print(f'inconsistent PCA components {n_components} '
                  f'and 2d data columns {data2d.shape[1]}')
            return False
        y = self.__clusters.kmeans_clusters_pred
        pairs = list(range(0, n_components))
        for i, j in zip(pairs, pairs[1:] + pairs[:1]):
            print("PCA components", i, j)
            # plot
            fig = plt.figure(figsize=(10, 7))
            ax = fig.add_subplot(111)
            ax.set_title('PCA')
            for icluster in range(self.__clusters.n_actual_clusters):
                print(f"Cluster {icluster}/{self.__clusters.n_actual_clusters}")
                ax.scatter(np.array(data2d[y == icluster, i]),
                           np.array(data2d[y == icluster, j]),
                           s=100,
                           alpha=0.5,
                           label=f"Cluster {icluster}")
            plt.scatter(cluster_centers2d[:, i],
                        cluster_centers2d[:, j],
                        marker='x',
                        s=200,
                        linewidths=3,
                        c='r',
                        label='Centroids')
            ax.set_xlabel(f"component {i}")
            ax.set_ylabel(f"component {j}")
            ax.legend(loc="best")
            plt.show()

    def __fit_lsa(self):
        # LSA/SVD results are not normalized;
        # normalization might be needed
        if self.__lsa_normalization:
            lsa = make_pipeline(self.__svd, self.__normalizer)
            lsa_data2d = lsa.fit_transform(X=self.__matrix)
        else:
            # Fit the model with X.
            lsa = self.__svd.fit(X=self.__matrix)
            # Apply dimensionality reduction to X.
            lsa_data2d = lsa.transform(X=self.__matrix)
        # project the cluster centers onto the reduced data
        lsa_cluster_centers2d = lsa.transform(
            self.__clusters.kmeans_cluster_centers)
        # plot
        plt.scatter(lsa_data2d[:, 0],
                    lsa_data2d[:, 1],
                    c=self.__clusters.kmeans_clusters_pred)
        plt.scatter(lsa_cluster_centers2d[:, 0],
                    lsa_cluster_centers2d[:, 1],
                    marker='x',
                    s=200,
                    linewidths=3,
                    c='r')
        plt.show()

    def __fit_tsne(self):
        # Fit X into an embedded space and return the transformed output;
        # TSNE has no separate transform step.
        # Output: embedding of the training data in low-dimensional space.
        data2d = self.__tsne.fit_transform(X=self.__matrix)
        # embedding shape
        print("TSNE embedding shape", data2d.shape, self.__tsne.embedding_.shape)
        # the cluster centers cannot be projected: TSNE has no transform step
        # array-like, shape (n_samples, n_components)
        n_components = self.__tsne.embedding_.shape[1]
        print("Number of TSNE components", n_components)
        y = self.__clusters.kmeans_clusters_pred
        pairs = list(range(0, n_components))
        print(pairs)
        print(pairs[1:] + pairs[:1])
        for i, j in zip(pairs, pairs[1:] + pairs[:1]):
            print("TSNE components", i, j)
            # plot
            fig = plt.figure(figsize=(10, 7))
            ax = fig.add_subplot(111)
            ax.set_title('TSNE')
            for icluster in range(self.__clusters.n_actual_clusters):
                print(f"Cluster {icluster}/{self.__clusters.n_actual_clusters}")
                ax.scatter(np.array(data2d[y == icluster, i]),
                           np.array(data2d[y == icluster, j]),
                           s=100,
                           alpha=0.5,
                           label=f"Cluster {icluster}")
            ax.set_xlabel(f"component {i}")
            ax.set_ylabel(f"component {j}")
            ax.legend(loc="best")
            plt.show()

    def __fit_mds(self):
        # Fit X data
        mds_data2d = self.__mds.fit_transform(X=self.__matrix)
        # plot
        plt.scatter(mds_data2d[:, 0],
                    mds_data2d[:, 1],
                    c=self.__clusters.kmeans_clusters_pred,
                    cmap=plt.cm.Spectral)
        plt.show()

    def __fit_isomap(self):
        # Compute the embedding vectors for data X
        embed = self.__isomap.fit_transform(X=self.__matrix)
        # Semantic labeling of clusters could apply a label when a cluster's
        # max TF-IDF score falls in the 90% quantile of the whole corpus of
        # TF-IDF scores (left as pseudocode in the original).
        # Plot the dimension-reduced data
        plt.xlabel('reduced dimension-1')
        plt.ylabel('reduced dimension-2')
        for i in range(len(embed)):
            plt.scatter(embed[i][0], embed[i][1])
        plt.show()

    def __display_tfidf(self):
        print('TF-IDF matrix')
        print(self.__tfidf_dataframe)
        # aggregations and ascending order
        estimates = {'mean': False, 'sum': False, 'max': False}
        df = self.__tfidf_dataframe.agg(list(estimates.keys())).T
        df = df.sort_values(by=list(estimates.keys()),
                            ascending=list(estimates.values()))
        df.index.name = 'term'
        title = 'tf-idf'
        N = 20
        if N > 0:
            df = df.head(N)  # same as df[:N]
            title += f' ({N} top terms)'
        ax = df[list(estimates.keys())].plot(kind='bar',
                                             title=title,
                                             figsize=(15, 10),
                                             legend=True,
                                             fontsize=12)
        ax.set_xlabel("term", fontsize=12)
        ax.set_ylabel("score", fontsize=12)
        plt.xticks(rotation=30, ha='right')
        plt.show()

    def __display_angle_similarity(self):
        df = self.__tfidf_angle_similarity_dataframe
        mask = np.zeros_like(df, dtype=bool)
        mask[np.triu_indices_from(mask)] = True
        vmin = df.where(df > 0).min().min()
        vmax = df.max().max()
        # smoother, washed-out contours
        alpha = 0.10
        vmin_new = vmin * (1 - alpha)
        vmax_new = vmax * (1 + alpha)
        vmin = vmin_new
        vmax = vmax_new if vmax_new < 90. else 90
        print(f"Angle similarity min {vmin}, max {vmax}")
        fig = plt.figure(figsize=(10, 7))
        ax = fig.add_subplot(111)
        ax.set_title('TF-IDF document angle similarity')
        sns.heatmap(df,
                    mask=mask,
                    annot=True,
                    square=True,  # forces the aspect ratio of the blocks to be equal
                    fmt=".1f",
                    vmin=vmin,
                    vmax=vmax,
                    cmap="coolwarm",
                    annot_kws={'size': 10},
                    ax=ax)
        plt.show()

    def fit(self):
        """ user-callable method to invoke the fits """
        self.__display_tfidf()
        self.__display_angle_similarity()
        print(type(self.__feature_names))
        self.__clusters.fit(matrix=self.__matrix,
                            dataframe=self.__tfidf_dataframe,
                            corpora=self.__corpora_names,
                            features=self.__feature_names)
        self.__fit_pca()
        # self.__fit_lsa()
        self.__fit_tsne()
def run(self):
    with self.input()[0] as i:
        graphIndex, GR = i.query()
    with self.input()[1] as i:
        R = i.query()
    dis = np.ones(GR.shape, dtype=GR.dtype) - GR
    colors = ['grey', 'green', 'red']
    tName = ['UNKNOWN', 'IUV', 'ESBMC']
    aName = ['UNKNOWN', 'Tester', 'Verificator']
    lScore = np.zeros(len(GR))
    lTime = np.zeros(len(GR))
    for index, D in R.items():
        if index not in graphIndex:
            continue
        gI = graphIndex[index]
        score = self.__evalScore(D['score'])
        time = self.__evalTime(D['time_rank'])
        for i, t in enumerate(tName):
            if score == t:
                lScore[gI] = i
            if time == t:
                lTime[gI] = i
    mds = MDS(n_components=2, dissimilarity="precomputed", n_init=10)
    X_r = mds.fit_transform(dis)
    stress = mds.stress_
    plt.figure(1)
    plt.suptitle('MDS of GRAM dataset (h: %s, D: %s) [%s points] (Stress: %2.2f)'
                 % (str(self.h), str(self.D), str(len(X_r)), stress))
    plt.subplot(121)
    for color, i, t in zip(colors, range(len(aName)), aName):
        plt.scatter(X_r[lScore == i, 0], X_r[lScore == i, 1],
                    color=color, alpha=.8, lw=2, label=t)
    plt.legend(loc='best', shadow=False, scatterpoints=1)
    plt.subplot(122)
    for color, i, t in zip(colors, range(len(aName)), aName):
        plt.scatter(X_r[lTime == i, 0], X_r[lTime == i, 1],
                    color=color, alpha=.8, lw=2, label=t)
    plt.legend(loc='best', shadow=False, scatterpoints=1)
    path = self.output().path
    directory = dirname(path)
    if not os.path.exists(directory):
        os.makedirs(directory)
    plt.savefig(path)
    plt.close()
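# The stress_ reported above is sklearn's raw stress (sum of squared
# residuals between the input dissimilarities and the embedded distances),
# so its magnitude depends on the scale of the data. A sketch of one common
# normalization, Kruskal's stress-1, assuming `dis` is the precomputed
# dissimilarity matrix that was passed to fit_transform:
import numpy as np

def kruskal_stress_1(mds_model, dis):
    # values below roughly 0.1 are conventionally read as a fair fit
    return np.sqrt(mds_model.stress_ / (0.5 * np.sum(dis ** 2)))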
ax.set_xticks(range(len(classes)))
ax.set_xticklabels(labels, rotation=40, ha='left')
ax.axhline(11.5, color='k')
ax.axvline(11.5, color='k')
plt.colorbar(im)
plt.tight_layout()
plt.show()

##############################################################################
# Confusion matrices related to mental representations have historically been
# summarized with dimensionality reduction using multi-dimensional scaling [1].
# See how the face samples cluster together.
fig, ax = plt.subplots(1)
mds = MDS(2, random_state=0, dissimilarity='precomputed')
chance = 0.5
summary = mds.fit_transform(chance - confusion)
cmap = plt.get_cmap('rainbow')
colors = ['r', 'b']
names = list(conds['condition'].values)
for color, name in zip(colors, set(names)):
    sel = np.where([this_name == name for this_name in names])[0]
    size = 500 if name == 'human face' else 100
    ax.scatter(summary[sel, 0], summary[sel, 1], s=size,
               facecolors=color, label=name, edgecolors='k')
ax.axis('off')
ax.legend(loc='lower right', scatterpoints=1, ncol=2)
plt.tight_layout()
plt.show()

##############################################################################
# References
def calculate_mds(data, metric):
    # pairwise distances under the given metric, embedded in 2-D with MDS
    distance_matrix = SK_Metrics.pairwise_distances(data, metric=metric)
    mds = MDS(n_components=2, dissimilarity='precomputed')
    return mds.fit_transform(distance_matrix)
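# Hypothetical call to calculate_mds above; any metric accepted by
# sklearn's pairwise_distances (e.g. 'euclidean', 'cosine') should work.
# The random data is only for illustration.
coords = calculate_mds(np.random.rand(30, 8), 'cosine')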
# In[201]:

pivot_scaled = 1 / pivot
pivot_scaled = pivot_scaled.replace(np.nan, 0)
pivot_scaled.to_csv('lift_value_competitors.csv')

# In[203]:

from sklearn.manifold import MDS
import matplotlib.pyplot as plt

# In[204]:

mds = MDS(2, random_state=0, dissimilarity='precomputed')
lift_2D = mds.fit_transform(pivot_scaled)
plt.rcParams['figure.figsize'] = [8, 8]
plt.rc('font', size=17, weight='bold')
fig = plt.figure(figsize=(12, 10))
for i in np.unique(pivot_scaled.columns):
    subset = lift_2D[pivot_scaled.columns == i]
    x = [row[0] for row in subset]
    y = [row[1] for row in subset]
    plt.scatter(x, y, s=300)
def project(self,
            weights={},
            alpha=.5,
            verbose=False,
            delete_duplicates=False,
            method='tsne',
            implementation='openTSNE',
            condense_countries=False,
            **kwargs):
    w_dict = {
        'year': 1.,
        'x': 1.,
        'y': 1.,
        'size': 1.,
        'color': 1.,
        'countries': 1.
    }
    for w in weights:
        w_dict[w] = weights[w]
    weights = np.array(list(w_dict.values()), dtype=np.float32)
    year_span = self.year_span()
    num_countries = len(self.countries())

    if not condense_countries:

        @jit(nopython=True)
        def state_distance(a, b):
            a = a.astype(np.float32)
            b = b.astype(np.float32)
            if year_span == 0:
                year_dist = 0
            else:
                year_dist = np.abs(a[0] - b[0]) / year_span
            x_dist = 1 - np.dot(a[1:6], b[1:6])
            y_dist = 1 - np.dot(a[6:11], b[6:11])
            size_dist = 1 - np.dot(a[11:16], b[11:16])
            color_dist = 1 - np.dot(a[16:18], b[16:18])
            if num_countries == 0:
                country_dist = 0
            else:
                country_dist = np.linalg.norm(a[18:] - b[18:]) / np.sqrt(num_countries)
            dists = np.array([year_dist, x_dist, y_dist,
                              size_dist, color_dist, country_dist],
                             dtype=np.float32)
            return (dists * weights).sum()
    else:

        @jit(nopython=True)
        def state_distance(a, b):
            a = a.astype(np.float32)
            b = b.astype(np.float32)
            if year_span == 0:
                year_dist = 0
            else:
                year_dist = np.abs(a[0] - b[0]) / year_span
            x_dist = 1 - np.dot(a[1:6], b[1:6])
            y_dist = 1 - np.dot(a[6:11], b[6:11])
            size_dist = 1 - np.dot(a[11:16], b[11:16])
            color_dist = 1 - np.dot(a[16:18], b[16:18])
            if num_countries == 0:
                country_dist = 0
            else:
                lat1, long1 = a[18:20]
                lat2, long2 = b[18:20]
                lat1 = lat1 / 180. * 2 * np.pi
                lat2 = lat2 / 180. * 2 * np.pi
                delta_long = long1 - long2
                gcdist = np.sin(lat1) * np.sin(lat2)
                gcdist += np.cos(lat1) * np.cos(lat2) * np.cos(delta_long)
                gcdist = np.arccos(gcdist) / np.pi
                spread_dist = np.abs(a[20] - b[20])
                sel_dist = np.abs(a[21] - b[21]) / num_countries
                country_dist = (gcdist + spread_dist + sel_dist)
            dists = np.array([year_dist, x_dist, y_dist,
                              size_dist, color_dist, country_dist],
                             dtype=np.float32)
            return (dists * weights).sum()

    if delete_duplicates:
        # zero out any feature whose weight is 0 so duplicates can collapse
        encoded = self.encode()
        if w_dict['year'] == 0.:
            encoded[:, 0] = 0.
        if w_dict['x'] == 0.:
            encoded[:, 1:6] = 0.
        if w_dict['y'] == 0.:
            encoded[:, 6:11] = 0.
        if w_dict['size'] == 0.:
            encoded[:, 11:16] = 0.
        if w_dict['color'] == 0.:
            encoded[:, 16:18] = 0.
        if w_dict['countries'] == 0.:
            encoded[:, 18:] = 0.
        encoded, indices, counts = np.unique(encoded,
                                             axis=0,
                                             return_inverse=True,
                                             return_counts=True)
        self.counts = counts[indices]
    else:
        encoded = self.encode()
    if method == 'tsne':
        if implementation == 'openTSNE':
            if verbose:
                tsne = openTSNE(metric=state_distance,
                                verbose=verbose,
                                n_jobs=-1,
                                **kwargs)
            else:
                tsne = openTSNE(metric=state_distance, n_jobs=-1, **kwargs)
            embedding = np.array(tsne.fit(encoded))
        elif implementation == 'sklearn':
            tsne = sklearnTSNE(metric=state_distance,
                               verbose=3 if verbose else 0)
            embedding = np.array(tsne.fit_transform(encoded))
    elif method == 'mds':
        mds = MDS(n_components=2, metric=True, dissimilarity='precomputed')
        distmat = squareform(pdist(encoded, state_distance))
        embedding = mds.fit_transform(distmat)
    elif method == 'umap':
        umap = UMAP(metric=state_distance, verbose=verbose, **kwargs)
        embedding = np.array(umap.fit_transform(encoded))
    elif method == 'hybrid':
        if not delete_duplicates:
            raise Warning('Hybrid layout always deletes duplicates!')
        adj = nx.adjacency_matrix(self.make_graph()).toarray()
        # adjacency matrix of undirected multigraph
        intermediate = np.zeros_like(adj, dtype=int)
        for (i, j), item in np.ndenumerate(adj):
            intermediate[i, j] += item
            intermediate[j, i] += item
        # use inverse number of connections as weights
        edges = []
        edge_weights = []
        for (i, j), item in np.ndenumerate(intermediate):
            if item != 0 and i <= j:
                edges.append((i, j))
                edge_weights.append(item)
        # construct weighted graph and calculate path lengths
        g = nx.Graph()
        for e, w in zip(edges, edge_weights):
            g.add_edge(*e, weight=1 / w)
        path_lengths = dict(nx.all_pairs_dijkstra_path_length(g))
        # construct distmat from path_lengths
        graph_distmat = np.zeros_like(adj, dtype=float)
        graph_distmat -= np.inf
        for i in path_lengths:
            dists = path_lengths[i]
            for j in dists:
                graph_distmat[i, j] = dists[j]
        graph_distmat = graph_distmat / graph_distmat.max()
        graph_distmat[graph_distmat == -np.inf] = 2.
        attr_distmat = squareform(
            pdist(np.unique(self.encode().astype(np.float32), axis=0),
                  metric=state_distance))
        init = 'random'
        if hasattr(alpha, '__iter__'):
            self.hybrid_alphas = alpha
            embedding = []
            for a in alpha:
                distmat = (1 - a) * graph_distmat + a * attr_distmat
                tsne = openTSNE(metric='precomputed', initialization=init)
                embedding.append(tsne.fit(distmat))
                init = embedding[-1]
        else:
            distmat = (1 - alpha) * graph_distmat + alpha * attr_distmat
            tsne = openTSNE(metric='precomputed', initialization=init)
            embedding = tsne.fit(distmat)
    if delete_duplicates:
        if method == 'hybrid' and hasattr(alpha, '__iter__'):
            embedding = np.stack([e[indices] for e in embedding])
        else:
            embedding = embedding[indices]
    indices = np.add.accumulate(self.lengths())
    if method == 'hybrid' and hasattr(alpha, '__iter__'):
        self.embedding = [np.array_split(e, indices)[:-1] for e in embedding]
    else:
        self.embedding = np.array_split(embedding, indices)[:-1]
    self.projected = True
#!/usr/bin/env python
# coding: utf-8

import numpy as np
from sklearn.manifold import MDS
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt

np.random.seed(seed=668)
data = np.random.rand(1000, 100)
D = pdist(data, metric='euclidean')
D = squareform(D)
mds = MDS(n_components=2, dissimilarity='precomputed')
fit_t = mds.fit_transform(D)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_title('MDS of random data')
ax.set_xlabel('MDS dimension 1')
ax.set_ylabel('MDS dimension 2')
ax.scatter(x=fit_t[:, 0], y=fit_t[:, 1], c='r')
plt.show()
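# For comparison, a non-metric variant on the same precomputed matrix:
# non-metric MDS preserves only the rank order of the dissimilarities,
# which can be more appropriate when distances are ordinal. A minimal
# sketch reusing D from above:
nmds = MDS(n_components=2, metric=False, dissimilarity='precomputed',
           random_state=668)
fit_nm = nmds.fit_transform(D)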
plt.plot(feature_vector1, feature_vector2, 'ro')
plt.show()

# Multidimensional Scaling
def findhamdist(str1, str2):
    diffs = 0
    for k in range(len(str1)):
        if str1[k] != str2[k]:
            diffs += 1
    return diffs

rowscolumn = 62
MDS_Matrix = np.zeros((62, 62))
for i in range(rowscolumn):
    for j in range(rowscolumn):
        MDS_Matrix[i][j] = findhamdist(X_Matrix[i, :], X_Matrix[j, :])
print(MDS_Matrix)
model = MDS(n_components=2, dissimilarity='precomputed', random_state=6)
out = model.fit_transform(MDS_Matrix)
print(out)
plt.title('Multidimensional Scaling')
plt.xlabel('feature_vector1')
plt.ylabel('feature_vector2')
plt.scatter(out[:, 0], out[:, 1])
plt.show()
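# The O(n^2) Python double loop above can be replaced by scipy's pairwise
# Hamming distance. pdist returns the *fraction* of differing positions,
# so multiply by the row length to recover raw counts. A sketch, assuming
# X_Matrix rows are element-wise comparable as above:
from scipy.spatial.distance import pdist, squareform

MDS_Matrix = squareform(pdist(X_Matrix, metric='hamming')) * X_Matrix.shape[1]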
def main():
    data = pd.read_csv(
        'C:/Users/Mahmudur Limon/Downloads/data/jupiter Code/preprocessed data.csv',
        engine='python')
    data = data[['Unnamed: 0', 'title', 'text']]
    data = data.rename(index=str, columns={'Unnamed: 0': 'id'})

    # Use the TF-IDF vectorizer to map the text data to vector space
    # (stop-word removal is handled by the vectorizer itself)
    tfidf_vectorizer = TfidfVectorizer(max_features=200000,
                                       use_idf=True,
                                       stop_words='english',
                                       tokenizer=tokenize_and_stem)
    # Fit the vectorizer to the text data
    tfidf_matrix = tfidf_vectorizer.fit_transform(data['text'])
    terms = tfidf_vectorizer.get_feature_names()

    # KMeans++
    km = KMeans(n_clusters=7,
                init='k-means++',
                max_iter=300,
                n_init=1,
                verbose=0,
                random_state=3425)
    km.fit(tfidf_matrix)
    labels = km.labels_
    clusters = labels.tolist()

    # Distance measure derived from cosine similarity
    distance = 1 - cosine_similarity(tfidf_matrix)

    # Dimensionality reduction using multidimensional scaling (MDS)
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(distance)
    xs, ys = pos[:, 0], pos[:, 1]

    # Save the cluster visualization after multidimensional scaling
    for x, y in zip(xs, ys):
        plt.scatter(x, y)
    plt.title('MDS output of News Headlines')
    plt.savefig(os.path.join(path, 'results', 'MDS.png'))

    # Dataframe with reduced dimensions, identified labels and text data
    # for plotting the KMeans output
    df = pd.DataFrame(dict(label=clusters, data=data['text'], x=xs, y=ys))
    df.to_csv(os.path.join(path, 'results', 'kmeans_clustered_DF.txt'), sep=',')

    label_color_map = {0: 'red', 1: 'blue', 2: 'green', 3: 'pink',
                       4: 'purple', 5: 'yellow', 6: 'orange', 7: 'grey'}

    out_file = open(os.path.join(path, 'results', 'kmeans_clustered_output.txt'), 'w')
    out_file.write('Cluster Headline\n')
    fig, ax = plt.subplots(figsize=(17, 9))
    for index, row in df.iterrows():
        cluster = row['label']
        label_color = label_color_map[row['label']]
        label_text = row['data']
        ax.plot(row['x'], row['y'], marker='o', ms=12, c=label_color)
        out_file.write(str(cluster) + ',' + label_text + '\n')
    out_file.close()
    for i in range(len(df)):
        ax.text(df.iloc[i]['x'], df.iloc[i]['y'], df.iloc[i]['label'], size=8)
    plt.title('News Headlines using KMeans Clustering')
    plt.savefig(os.path.join(path, 'results', 'kmeans.png'))