Example #1
    def embed(self, DM):
        """Embed a distance matrix using MDS.

        Parameters
        ----------
        DM : object
            The distance matrix to be embedded.

        Returns
        -------
        :obj:`DataSet`
            A :obj:`DataSet` containing the embedding.

        """
        mds = MDS(n_components=self.num_components,
                  dissimilarity="precomputed")
        mds.fit(DM.getMatrix())
        emb = mds.embedding_
        emb = pd.DataFrame(emb)
        emb.index = DM.D.index
        emb.index.name = DM.D.index.name
        name = DM.DS.name + " " + \
               DM.metric_name + " " + \
               self.embedding_name
        EDS = lds.DataSet(emb, name)
        return EDS
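For reference, a minimal, self-contained sketch of the same pattern without the project-specific DM/lds wrapper objects (the 3x3 matrix here is made up):

import numpy as np
import pandas as pd
from sklearn.manifold import MDS

D = np.array([[0.0, 1.0, 2.0],
              [1.0, 0.0, 1.5],
              [2.0, 1.5, 0.0]])   # symmetric distance matrix, zero diagonal
mds = MDS(n_components=2, dissimilarity="precomputed")
mds.fit(D)
emb = pd.DataFrame(mds.embedding_, columns=["x", "y"])
print(emb)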
Example #2
File: mds_plot.py Project: RedHenLab/CDI
def plot_cities():
    #distance_matrix = get_distances()
    cities = 'BOS     CHI     DC      DEN     LA      MIA     NY      SEA     SF'.split()
    distance_matrix = np.array([
        [0   , 963 , 429 , 1949, 2979, 1504, 206 , 2976, 3095],
        [963 , 0   , 671 , 996 , 2054, 1329, 802 , 2013, 2142],
        [429 , 671 , 0   , 1616, 2631, 1075, 233 , 2684, 2799],
        [1949, 996 , 1616, 0   , 1059, 2037, 1771, 1307, 1235],
        [2979, 2054, 2631, 1059, 0   , 2687, 2786, 1131, 379],
        [1504, 1329, 1075, 2037, 2687, 0   , 1308, 3273, 3053],
        [206 , 802 , 233 , 1771, 2786, 1308, 0   , 2815, 2934],
        [2976, 2013, 2684, 1307, 1131, 3273, 2815, 0   , 808],
        [3095, 2142, 2799, 1235, 379 , 3053, 2934, 808 , 0]
        ])

    # assert symmetric
    for (i, j) in [(i, j) for i in range(9) for j in range(9)]:
        try:
            assert(distance_matrix[i][j] == distance_matrix[j][i])
        except AssertionError:
            print((i, j))

    print(distance_matrix)
    mds = MDS(dissimilarity='precomputed')
    mds.fit(distance_matrix)
    print(mds.embedding_)
    for idx, points in enumerate(mds.embedding_):
        plt.plot(points[0], points[1], 'r.')
        plt.text(points[0], points[1], cities[idx])
    plt.show()
    return
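As an aside, the element-wise symmetry check above can be collapsed into one vectorized comparison; a minimal sketch on a stand-in matrix:

import numpy as np

m = np.array([[0, 963, 429],
              [963, 0, 671],
              [429, 671, 0]])   # any square distance matrix
assert (m == m.T).all(), "matrix is not symmetric"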
Example #3
File: visuals.py Project: onpoeet/textnet
def timeline_scatter_plot(X,
                          time_index,
                          method='MDS',
                          metric='cosine',
                          **kwargs):
    if not isinstance(time_index, pd.DatetimeIndex):
        time_index = pd.DatetimeIndex(time_index)
    dm = pairwise_distances(X, metric=metric)
    if method.upper() == 'MDS':
        decomposer = MDS(n_components=2,
                         dissimilarity='precomputed',
                         verbose=1,
                         **kwargs)
        decomposer.fit(dm)
    elif method.upper() == 'TSNE':
        decomposer = TSNE(n_components=2,
                          metric='precomputed',
                          verbose=1,
                          **kwargs)
        decomposer.fit(dm)
    else:
        raise ValueError("Method %s is not supported..." % method)
    X, Y = decomposer.embedding_[:, 0], decomposer.embedding_[:, 1]
    unique_index = time_index.unique().sort_values()
    colormap = {
        time_stamp: color
        for time_stamp, color in zip(
            unique_index, sns.cubehelix_palette(unique_index.shape[0]))
    }
    colors = [colormap[time_stamp] for time_stamp in time_index]
    plt.scatter(X, Y, s=40, color=colors, alpha=0.7)
    plt.axis('off')
Example #4
def perform_MDS_analysis(n_samples=10e10,
                         n_variables=10000,
                         data_type='psi',
                         filter_tissues=True,
                         n_dimensions=2,
                         metric=True):
    """ Performs the MDS of the PSI/TPM values. It theoretically tries to capture the varibility of the data in non-linear ways"""
    data, labels = read_psi_and_recover_tissue(n_samples=n_samples,
                                               n_variables=n_variables,
                                               data_type=data_type,
                                               filter_tissues=filter_tissues)
    X_train, y_train = generate_sets(data, labels, do_not_split=True)
    mds = MDS(n_components=n_dimensions,
              metric=metric,
              n_init=2,
              max_iter=1000,
              verbose=1,
              eps=0.0001,
              n_jobs=3,
              random_state=None,
              dissimilarity='euclidean')
    mds.fit(X_train.values)
    results = mds.embedding_
    results = pandas.DataFrame(
        results,
        columns=[str(x) + 'D' for x in range(1, n_dimensions + 1)],
        index=y_train.index)
    results = pandas.concat([results, y_train.idxmax(1)], axis=1)
    results = results.rename(columns={0: 'Tissue'})
    plot_by_group(results.groupby('Tissue'), '1D', '2D', kind_of_summary='MDS')
Example #5
def mds_and_plot(model):

    data = DataSet()
    x, y, data_list = data.get_test_frames('train')

    custom_model = Model(inputs=model.input,
                         outputs=model.get_layer('dense_1').output)
    y_pred = custom_model.predict(x)
    mds = MDS()
    mds.fit(y_pred)
    a = mds.embedding_

    mark = ['or', 'ob', 'og', 'oy', 'ok', '+r', 'sr', 'dr', '<r', 'pr']
    color = 0
    j = 0
    for item in y:
        index = 0
        for i in item:
            if i == 1:
                break
            index = index + 1

        plt.plot([a[j:j + 1, 0]], [a[j:j + 1, 1]], mark[index], markersize=5)
        print(index)
        j += 1
    plt.show()
Example #6
def plot_diseases_or_countries_3d(years=[2000], axis='disease', method='mds', outname='d_clusters_by_c_pattern_mds', data_pd=fdata_pd):
    # axis is 'disease' or 'country'
    # years is a subset of the range 1990-2016
    # method is 'pca' or 'mds'
    scaler = StandardScaler()

    if axis == 'disease':
        year_slices = [scaler.fit_transform(data_pd.loc[data_pd['year'].isin([year]), lambda s: s.columns[2:]].T) for year in years]
    elif axis == 'country':
        year_slices = [scaler.fit_transform(data_pd.loc[data_pd['year'].isin([year]), lambda s: s.columns[2:]]) for year in years]
    
    if method=='mds':
        red = MDS(n_components=3)
    elif method=='pca':
        red = PCA(n_components=3)
        
    # fit with full data
    all_year_slices = np.concatenate(year_slices, axis=0)
    red.fit(all_year_slices)
    
    # transform each year slice individually... could reuse the fit above; not the most efficient, can fix if time is an issue.
    year_slices = [red.fit_transform(item) for item in year_slices]

    traces = [Scatter3d(x=year[:, 0], y=year[:, 1], z=year[:, 2], mode='markers')
              for year in year_slices]
    
    data = Data(traces)
    iplot(data, filename = outname)
Example #7
class Embedder(object):
    def __init__(self, method_name, *args, **kwargs):
        self.projector = None
        self.method_name = method_name
        if method_name == "tsne":
            self.projector = TSNE(*args, **kwargs)
        elif method_name == "pca":
            self.projector = PCA(*args, **kwargs)
        elif method_name == "mds":
            self.projector = MDS(n_jobs=-1, *args, **kwargs)
        else:
            logger.error("the projection method is not supported now!!")

    def fit(self, X, y):
        t = time()
        self.projector.fit(X, y)
        logger.info("{} fit function time cost: {}".format(self.method_name, time()-t))

    def transform(self, X, y=None):
        t = time()
        res = self.projector.transform(X)
        logger.info("{} transform function time cost: {}".format(self.method_name, time()-t))
        return res

    def fit_transform(self, X, y):
        t = time()
        res = self.projector.fit_transform(X, y)
        logger.info("{} fit_transform function time cost: {}".format(self.method_name, time()-t))
        return res
Example #8
def non_param_multi_dim_scaling(dists, n_dims=3, n_threads=None, metric=True):
    mds = MDS(n_components=n_dims, metric=metric, n_jobs=n_threads,
              dissimilarity='precomputed')
    mds.fit(squareform(dists))
    projs = mds.embedding_
    res = {'stress': mds.stress_,
           'projections': projs}
    return res
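Because squareform expects a condensed distance vector, a typical call would first run pdist; a hypothetical usage, assuming the function above and its imports are in scope:

import numpy as np
from scipy.spatial.distance import pdist

X = np.random.rand(20, 5)     # 20 samples, 5 features
dists = pdist(X)              # condensed pairwise distances, shape (190,)
res = non_param_multi_dim_scaling(dists, n_dims=2)
print(res['stress'], res['projections'].shape)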
Example #9
    def use_mds(self):
        obj = MDS(self.n_components)
        obj.fit(self.data)
        print(obj.fit(self.data))
        iris_t2 = obj.fit_transform(self.data)
        plt.scatter(iris_t2[:, 0], iris_t2[:, 1], c=self.c)
        plt.title('Using sklearn MDS')
        plt.show()
Example #10
def non_param_multi_dim_scaling(dists, n_dims=3, n_threads=None, metric=True):
    mds = MDS(n_components=n_dims,
              metric=metric,
              n_jobs=n_threads,
              dissimilarity='precomputed')
    mds.fit(squareform(dists))
    projs = mds.embedding_
    res = {'stress': mds.stress_, 'projections': projs}
    return res
Example #11
File: mds_plot.py Project: RedHenLab/CDI
def md_scaling(co_matrix, is_distance_matrix=False):
    if not is_distance_matrix:
        distance_matrix = -np.log(co_matrix.matrix)
    else:
        distance_matrix = co_matrix

    mds = MDS(dissimilarity='precomputed')
    mds.fit(distance_matrix)
    return mds.embedding_
Example #12
    def mds_sklearn(A, save_to_file=None):

        fig, ax = plt.subplots()
        mds = MDS(2, dissimilarity="precomputed")
        mds.fit(A)
        x = mds.embedding_[:, 0]
        y = mds.embedding_[:, 1]
        ax.scatter(x, y)
        if save_to_file is not None:
            fig.savefig(save_to_file)
        return fig
Example #13
def mds():

    global df
    global normalized_df
    global stratifiedSample
    global clusterLabels
    global random_sample
    global finalSample

    stressForOriginalData = []
    stressForRandomSampleData = []
    stressForStratifiedSampleData = []

    for k in range(2, 5):
        md = MDS(n_components=k, dissimilarity='euclidean')
        components = md.fit(normalized_df)
        stressForOriginalData.append((k, md.stress_ / 100000))
    for k in range(2, 5):
        md = MDS(n_components=k, dissimilarity='euclidean')
        components = md.fit(random_sample)
        stressForRandomSampleData.append((k, md.stress_ / 100000))
    for k in range(2, 5):
        md = MDS(n_components=k, dissimilarity='euclidean')
        components = md.fit(finalSample)
        stressForStratifiedSampleData.append((k, md.stress_ / 100000))

    originalData = pd.DataFrame(stressForOriginalData,
                                columns=["xval", "yval"])
    odata = originalData.to_dict(orient='records')
    odata = json.dumps(odata, indent=2)

    randomData = pd.DataFrame(stressForRandomSampleData,
                              columns=["xval", "yval"])
    rdata = randomData.to_dict(orient='records')
    rdata = json.dumps(rdata, indent=2)

    stratData = pd.DataFrame(stressForStratifiedSampleData,
                             columns=["xval", "yval"])
    sdata = stratData.to_dict(orient='records')
    sdata = json.dumps(sdata, indent=2)

    columns = json.dumps({"xc": "MDS Components", "yc": "Stress"})
    numparams = json.dumps({"np": 6})
    data = {
        'plot_data': odata,
        'rdata': rdata,
        'sdata': sdata,
        'columns': columns,
        'nump': numparams
    }
    return render_template("index2.html", data=data)
Example #14
File: ordination.py Project: sagun98/zopy
def ordinate_sklearn( dist, method="mds" ):
    if method == "mds":
        Worker = MDS( metric=True, n_components=2, dissimilarity='precomputed', n_init=10, max_iter=1000 )
    elif method == "nmds":
        Worker = MDS( dissimilarity='precomputed', random_state=1701 )
    elif method == "tsne":
        Worker = TSNE( metric='precomputed', perplexity=50 )
    Worker.fit( dist )
    embedding = Worker.embedding_
    # estimate variance explained by each axis
    varexp = get_varexp( dist, embedding )
    # reorder dimensions to match varexp order
    index = sortedby( range( len( varexp ) ), varexp, reverse=True )
    embedding = embedding[:,index]   
    varexp.sort( reverse=True )
    return embedding, varexp, get_fit( dist, embedding )
Example #15
def main():
    # load sample data
    data = np.loadtxt("distmat799.txt", delimiter=",")
    dists = data / np.amax(data)

    # load images
    img_files = [img for img in os.listdir("799_patch") if re.search(r"\.png", img)]

    # mds
    mds = MDS(n_components=2, dissimilarity="precomputed")
    results = mds.fit(dists)

    # plot
    fig, ax = plt.subplots()
    for i, img_file in enumerate(img_files):
        img_file = os.path.join("799_patch", img_file)
        img = read_png(img_file)
        imagebox = OffsetImage(img, zoom=2.0)
        coords = results.embedding_[i, :]
        xy = tuple(coords)
        ab = AnnotationBbox(imagebox, xy)
        ax.add_artist(ab)
    ax.set_xlim(-1.0, 1.0)
    ax.set_ylim(-1.0, 1.0)
    plt.show()
Example #16
def mds(rdm):
    seed = np.random.RandomState(seed=3)

    mds = MDS(n_components=2,
              max_iter=3000,
              eps=1e-9,
              random_state=seed,
              dissimilarity="precomputed",
              n_jobs=1)
    pos = mds.fit(rdm.square).embedding_

    #nmds = MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12,
    #                    dissimilarity="precomputed", random_state=seed, n_jobs=1,
    #                    n_init=1)
    #npos = nmds.fit_transform(similarities, init=pos)

    # Rescale the data
    #    if patterns:
    #        pos *= np.sqrt((patterns ** 2).sum()) / np.sqrt((pos ** 2).sum())
    #npos *= np.sqrt((X_true ** 2).sum()) / np.sqrt((npos ** 2).sum())

    # Rotate the data
    clf = PCA(n_components=2)
    pos = clf.fit_transform(pos)
    #npos = clf.fit_transform(npos)
    return pos
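The PCA step above exists because an MDS solution is only determined up to rotation and reflection; projecting the embedding onto its principal axes gives repeated runs a comparable orientation. A minimal sketch of the same idea on stand-in data:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import euclidean_distances

X = np.random.RandomState(0).rand(10, 4)
D = euclidean_distances(X)
pos = MDS(n_components=2, dissimilarity="precomputed",
          random_state=0).fit(D).embedding_
pos = PCA(n_components=2).fit_transform(pos)   # rotate to principal axes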
Example #17
def plot_DF(df, N, metric, annotate=True, clusters=False, sizes=False):

    if metric:
        if metric != "cosine":
            df = df.div(df.sum(axis=0), axis=1)
        dist = pairwise_distances(df, metric=metric)
    else:
        dist = df
    mds = MDS(dissimilarity="precomputed", n_components=N)
    pos = mds.fit(dist).embedding_

    if N == 1:

        for x, y in zip(df.index, pos):
            plt.scatter(x, y)

    elif N == 2:
        if clusters:
            colors = get_colors(clusters)

        for l, x, y in zip(df.index, pos[:, 0], pos[:, 1]):

            if sizes:
                S = sizes[l]
            else:
                S = 10

            if clusters:
                plt.scatter(x, y, c=colors[l], s=S)
            else:
                plt.scatter(x, y, s=S)
            if annotate:
                plt.annotate(l, xy=(x, y))

    plt.show()
Example #18
    def plot(self, x):
        self.ax.clear()
        mds = MDS(n_components=2, dissimilarity="precomputed")
        pos = mds.fit(x).embedding_

        self.ax.scatter(pos[:, 0], pos[:, 1], color='darkcyan')
        self.draw()
Example #19
    def plot_mds(self, save_path=None):
        mds = MDS(n_components=2, max_iter=3000, eps=1e-9,
                  dissimilarity='precomputed', n_jobs=1,
                  random_state=42)
        pos = mds.fit(1 - self.W).embedding_

        plt.figure(figsize=(8, 8))
        ax = plt.axes([0., 0., 1., 1.])

        for (pos_x, pos_y), cls in zip(pos, self.class_nms):
            plt.text(pos_x - 0.03, pos_y - 0.03, cls, fontsize=25)

        segments = [[pos[i, :], pos[j, :]]
                    for i in range(len(pos)) for j in range(len(pos))]
        lc = LineCollection(segments,
                            zorder=0, cmap=plt.cm.Blues,
                            norm=plt.Normalize(0, 0.5))
        lc.set_linewidths(np.full(len(segments), 0.5))
        ax.add_collection(lc)
        plt.scatter(pos[:, 0], pos[:, 1], color='turquoise')
        plt.axis('off')
        if save_path is not None:
            plt.savefig(save_path)
        plt.show()
        plt.close()
Example #20
def plotMap(maparr, freq, nest, seqs, dbfile, map2d, outfile, plotm='T'):

    # multi-dimensional scaling
    similarities = euclidean_distances(np.asarray(maparr))
    mds = MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=np.random.RandomState(seed=3), dissimilarity="precomputed", n_jobs=1)
    pos = mds.fit(similarities).embedding_

    #plot attributes
    N = len(pos)
    #size = [20*n for n in freq]
    size = 8000
    color = np.array(range(N))
    
    if str(plotm) == 'T':
    
        #plot MDS
        fig, ax = plt.subplots(figsize=(10,10))
        warnings.filterwarnings("ignore")
        scatter = ax.scatter(np.array(pos[:,0]), np.array(pos[:,1]), c=color, s=size, alpha=0.3, cmap=plt.cm.viridis, marker='s')
        plt.xlabel('Dimension 1', fontsize=20, labelpad=20)
        plt.ylabel('Dimension 2', fontsize=20, labelpad=20)
        #plt.axis([xmin, xmax, ymin, ymax])
        plt.tick_params(labelsize=15, length=14, direction='out', pad=15, top=False, right=False)

        #save figures
        fig.savefig(outfile + '.png', bbox_inches='tight', format='png')
        fig.savefig(outfile + '.pdf', bbox_inches='tight', format='pdf')
        plt.close(fig)
        warnings.resetwarnings()
        
        #write csv file
        writePlotMDS(freq, nest, seqs, dbfile, pos, maparr, map2d, outfile)

    return pos
Example #21
def project_in_2D(distance_mat, method='mds'):
  """
  Project SDRs onto a 2D space using manifold learning algorithms
  :param distance_mat: A square matrix with pairwise distances
  :param method: Select method from 'mds' and 'tSNE'
  :return: an array with dimension (numSDRs, 2). It contains the 2D projections
     of each SDR
  """
  seed = np.random.RandomState(seed=3)

  if method == 'mds':
    mds = MDS(n_components=2, max_iter=3000, eps=1e-9,
              random_state=seed,
              dissimilarity="precomputed", n_jobs=1)

    pos = mds.fit(distance_mat).embedding_

    nmds = MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12,
               dissimilarity="precomputed", random_state=seed,
               n_jobs=1, n_init=1)

    pos = nmds.fit_transform(distance_mat, init=pos)
  elif method == 'tSNE':
    tsne = TSNE(n_components=2, init='pca', random_state=0)
    pos = tsne.fit_transform(distance_mat)
  else:
    raise NotImplementedError

  return pos
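A hypothetical usage of project_in_2D, assuming the function above and its imports are in scope:

import numpy as np
from scipy.spatial.distance import pdist, squareform

sdrs = np.random.rand(50, 128)             # stand-in for SDR vectors
distance_mat = squareform(pdist(sdrs))     # square pairwise-distance matrix
pos = project_in_2D(distance_mat, method='mds')
print(pos.shape)                           # (50, 2)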
Example #22
def main():
    # load sample data
    data = np.loadtxt('distmat799.txt', delimiter=',')
    dists = data / np.amax(data)

    # load images
    img_files = [
        img for img in os.listdir('799_patch') if re.search(r'\.png', img)
    ]

    # mds
    mds = MDS(n_components=2, dissimilarity='precomputed')
    results = mds.fit(dists)

    # plot
    fig, ax = plt.subplots()
    for i, img_file in enumerate(img_files):
        img_file = os.path.join('799_patch', img_file)
        img = read_png(img_file)
        imagebox = OffsetImage(img, zoom=2.0)
        coords = results.embedding_[i, :]
        xy = tuple(coords)
        ab = AnnotationBbox(imagebox, xy)
        ax.add_artist(ab)
    ax.set_xlim(-1.0, 1.0)
    ax.set_ylim(-1.0, 1.0)
    plt.show()
Example #23
def plot_clusters(scaled_features, cluster_obj):

    labels = cluster_obj.labels_
    clusters = len(np.unique(labels))

    norm = Normalize(min(labels), max(labels))
    cm = mpl.cm.jet

    mds = MDS(n_components=2)
    res = mds.fit(scaled_features)

    pos = res.embedding_
    offset_radius = 10
    cluster_thetas = np.linspace(0, 2 * np.pi, clusters + 1)[0:clusters]
    cluster_vectors = [(offset_radius * np.cos(theta),
                        offset_radius * np.sin(theta))
                       for theta in cluster_thetas]

    for i, coords in enumerate(pos):
        label = labels[i]
        color = cm(norm(label))
        offset = cluster_vectors[label]
        mpl.plot(coords[0] + offset[0],
                 coords[1] + offset[1],
                 color=color,
                 marker='o')

    mpl.show()
Example #24
def labtest_MDS(PID):
    data = [patients[pid]['tests'] for pid in PID]
    X = pp.scale(data)
    mds = MDS(n_components = 2, metric = True, n_init = 4, max_iter = 300, verbose = 0, eps = 0.001, n_jobs = 1, dissimilarity = 'euclidean')
    pos = mds.fit(X).embedding_
    
    return pos
Example #25
def w2c_mds_dec(data, dim=2):
    mds = MDS(n_components=dim,
              max_iter=3000,
              eps=1e-9,
              random_state=seed,
              dissimilarity='euclidean',
              n_jobs=1)
    return mds.fit(data).embedding_
Example #26
    def cluster(D, k=3, verbose=False):
        """Cluster LDS's via Multi-Dimensional Scaling and KMeans.

        Strategy:
            1. Build NxN matrix of pairwise similarities
            2. Run MDS to embed data in R^2
            3. Run KMeans with k cluster centers
            4. Find samples closest to the k centers

        Parameters:
        ----------
        D: numpy.ndarray, shape = (N, N)
            Precomputed distance matrix.

        k: int (default: 3)
            Number of desired cluster centers.

        verbose: boolean
            Enable verbose output.

        Returns:
        --------
        eData: numpy.ndarray, shape (N, 2)
            N samples embedded in R^2.

        ids: numpy.ndarray, shape = (k,)
            List of indices identifying the k representatives.
        """

        assert D.shape[0] == D.shape[1], "Oops (distance matrix not square)!"

        # build MDS for precomputed similarity matrix
        mds = MDS(metric=True,
                  n_components=2,
                  verbose=True,
                  dissimilarity="precomputed")

        def __symmetrize(A):
            return A + A.T - np.diag(A.diagonal())

        # run MDS on symmetrized similarity matrix
        eData = mds.fit(__symmetrize(D)).embedding_

        kmObj = KMeans(k)
        kmObj.fit_predict(eData)

        ids = np.zeros((k, ), dtype=int)
        for i in range(k):
            # sanity check
            cDat = eData[np.where(kmObj.labels_ == i)[0], :]
            assert len(cDat) > 0, "Oops, empty cluster ..."

            kCen = kmObj.cluster_centers_[i, :]
            x = euclidean_distances(eData, kCen.reshape(1, -1))
            ids[i] = int(np.argsort(x.ravel())[0])

        # return distance matrix and ID's of representative LDS's
        return (eData, ids)
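A hypothetical usage of cluster, shown as a standalone call and assuming the function above and its imports (numpy, MDS, KMeans, euclidean_distances) are in scope:

import numpy as np

rng = np.random.RandomState(0)
A = rng.rand(30, 30)
D = 0.5 * (A + A.T)          # symmetric stand-in distance matrix
np.fill_diagonal(D, 0.0)     # zero self-distances
eData, ids = cluster(D, k=3)
print(eData.shape, ids)      # (30, 2) and the 3 representative indices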
Example #27
File: ml.py Project: nted92/natools
def get_training_set_2d_coordinates(distance_matrix,
                                    labels,
                                    random_state=None):
    """
    Other approach: t-SNE?
    """
    training_coordinates = MDS(n_components=2,
                               random_state=random_state,
                               dissimilarity='precomputed')
    training_coordinates.fit(distance_matrix)

    df = pd.DataFrame(
        dict(x=training_coordinates.embedding_[:, 0],
             y=training_coordinates.embedding_[:, 1],
             label=labels))
    # note: the stress_ is the sum of squared distance of the
    # disparities and the distances for all constrained points
    return df, training_coordinates.stress_
Example #28
File: skill_util.py Project: onyilam/onet
def mds(df, value='Data Value', n_dimension=2):
    tmp = pd.merge(df, jobzones_23, on=['O*NET-SOC Code'])
    #examine the level or importance
    temp = tmp[tmp['Scale ID'] == 'IM']
    temp = temp.pivot_table(index=['O*NET-SOC Code', 'Job Zone'],
                            columns='Element Name',
                            values=value).reset_index()
    columns = temp.columns.tolist()
    features = [
        str(col) for col in columns
        if col not in ['Title', 'O*NET-SOC Code', 'Job Zone']
    ]
    x = temp.loc[:, features].values
    x = StandardScaler().fit_transform(x)
    #get the distance between jobs
    t = np.dot(x, np.transpose(x))
    mds = MDS(n_components=n_dimension,
              max_iter=3000,
              eps=1e-9,
              random_state=12345,
              dissimilarity="precomputed",
              n_jobs=1)
    pos = mds.fit(t).embedding_
    # select the top 2 dimensions of data
    clf = PCA(n_components=2)
    pos = clf.fit_transform(pos)
    finalDf = pd.concat(
        [pd.DataFrame(pos), temp[['O*NET-SOC Code', 'Job Zone']]], axis=1)
    finalDf.rename(columns={0: 'PC1', 1: 'PC2'}, inplace=True)
    #plot the graphs
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)
    s = 100

    #plt.scatter(pos[:, 0], pos[:, 1], color='blue', s=s, lw=0)
    ax.scatter(finalDf['PC1'], finalDf['PC2'], color='blue', s=50)
    finalDf = finalDf.reindex(finalDf.PC1.abs().sort_values().index)
    largest_gap = finalDf.tail(20)

    #for i, txt in enumerate(largest_gap['Title']):
    #   ax.annotate(txt, (largest_gap[largest_gap['Title']==txt]['PC1'], largest_gap[largest_gap['Title']==txt]['PC2']))

    plt.ylim(-8, 8)
    plt.xlim(-30, 30)
    plt.title('Polarization measured by Distance between Jobs')

    temp = tmp[tmp['Scale ID'] == 'IM']
    temp2 = temp.pivot_table(index=['O*NET-SOC Code'],
                             columns='Element Name',
                             values='Data Value').reset_index()
    df = pd.merge(temp2, finalDf, on=['O*NET-SOC Code'])
    df = df.corr()
    #df.rename(columns={0: 'principal component 1', 1: 'principal component 2'}, inplace=True)
    #df.columns = ['principal component 1', 'principal component 2','Title','O*NET-SOC Code','Job Zone' ]
    print(df[['PC1']].sort_values('PC1', ascending=False).head(6))
    print(df[['PC2']].sort_values('PC2', ascending=False).head(6))
Example #29
    def cluster(D, k=3, verbose=False):
        """Cluster LDS's via Multi-Dimensional Scaling and KMeans.

        Strategy:
            1. Build NxN matrix of pairwise similarities
            2. Run MDS to embed data in R^2
            3. Run KMeans with k cluster centers
            4. Find samples closest to the k centers

        Parameters:
        ----------
        D: numpy.ndarray, shape = (N, N)
            Precomputed distance matrix.

        k: int (default: 3)
            Number of desired cluster centers.

        verbose: boolean
            Enable verbose output.

        Returns:
        --------
        eData: numpy.ndarray, shape (N, 2)
            N samples embedded in R^2.

        ids: numpy.ndarray, shape = (k,)
            List of indices identifying the k representatives.
        """

        assert D.shape[0] == D.shape[1], "Oops (distance matrix not square)!"

        # build MDS for precomputed similarity matrix
        mds = MDS(metric=True, n_components=2, verbose=True,
                  dissimilarity="precomputed")

        def __symmetrize(A):
            return A + A.T - np.diag(A.diagonal())

        # run MDS on symmetrized similarity matrix
        eData = mds.fit(__symmetrize(D)).embedding_

        kmObj = KMeans(k)
        kmObj.fit_predict(eData)

        ids = np.zeros((k,), dtype=int)
        for i in range(k):
            # sanity check
            cDat = eData[np.where(kmObj.labels_ == i)[0],:]
            assert len(cDat) > 0, "Oops, empty cluster ..."

            kCen = kmObj.cluster_centers_[i,:]
            x = euclidean_distances(eData, kCen.reshape(1, -1))
            ids[i] = int(np.argsort(x.ravel())[0])

        # return distance matrix and ID's of representative LDS's
        return (eData, ids)
Example #30
    def nmds_function(matrix, dimensions):
        nmds = MDS(n_components=dimensions,
                   metric=False,
                   dissimilarity='precomputed',
                   max_iter=int(max_iter_val),
                   n_init=int(n_init_val))
        nmds_results = nmds.fit(matrix)
        stress = round(nmds_results.stress_, 2)
        nmds_array = nmds_results.embedding_
        return {"stress": stress, "nmds_results": nmds_array}
Example #31
def timeline_scatter_plot(X, time_index, method='MDS', metric='cosine', **kwargs):
    if not isinstance(time_index, pd.DatetimeIndex):
        time_index = pd.DatetimeIndex(time_index)
    dm = pairwise_distances(X, metric=metric)
    if method.upper() == 'MDS':
        decomposer = MDS(n_components=2, dissimilarity='precomputed', verbose=1, **kwargs)
        decomposer.fit(dm)
    elif method.upper() == 'TSNE':
        decomposer = TSNE(n_components=2, metric='precomputed', verbose=1, **kwargs)
        decomposer.fit(dm)
    else:
        raise ValueError("Method %s is not supported..." % method)
    X, Y = decomposer.embedding_[:,0], decomposer.embedding_[:,1]
    unique_index = time_index.unique().sort_values()
    colormap = {time_stamp: color for time_stamp, color in zip(
        unique_index, sns.cubehelix_palette(unique_index.shape[0]))}
    colors = [colormap[time_stamp] for time_stamp in time_index]
    plt.scatter(X, Y, s=40, color=colors, alpha=0.7)
    plt.axis('off')
Example #32
def get_mds(similarities):
    seed = np.random.RandomState(seed=3)
    print(np.amax(similarities))
    print(np.amin(similarities))
    nmds = MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12,
                    dissimilarity="precomputed", random_state=seed, n_jobs=1,
                    n_init=1)
    pos = nmds.fit(similarities).embedding_
    X=np.array(pos)
    return X
Example #33
def embedDistanceMatrix(dmatDf, method='kpca', n_components=2, **kwargs):
    """Two-dimensional embedding of sequence distances in dmatDf,
    returning Nx2 x,y-coords: tsne, isomap, pca, mds, kpca, sklearn-tsne"""
    if isinstance(dmatDf, pd.DataFrame):
        dmat = dmatDf.values
    else:
        dmat = dmatDf

    if method == 'tsne':
        xy = tsne.run_tsne(dmat, no_dims=n_components, perplexity=kwargs['perplexity'])
    elif method == 'isomap':
        isoObj = Isomap(n_neighbors=10, n_components=n_components)
        xy = isoObj.fit_transform(dmat)
    elif method == 'mds':
        mds = MDS(n_components=n_components,
                  max_iter=3000,
                  eps=1e-9,
                  random_state=15,
                  dissimilarity="precomputed",
                  n_jobs=1)
        xy = mds.fit(dmat).embedding_
        rot = PCA(n_components=n_components)
        xy = rot.fit_transform(xy)
    elif method == 'pca':
        pcaObj = PCA(n_components=None)
        xy = pcaObj.fit_transform(dmat)[:, :n_components]
    elif method == 'kpca':
        pcaObj = KernelPCA(n_components=dmat.shape[0], kernel='precomputed', eigen_solver='dense')
        try:
            gram = dist2kernel(dmat)
        except:
            print('Could not convert dmat to kernel for KernelPCA; using 1 - dmat/dmat.max() instead')
            gram = 1 - dmat / dmat.max()
        xy = pcaObj.fit_transform(gram)[:, :n_components]
    elif method == 'lle':
        lle = manifold.LocallyLinearEmbedding(n_neighbors=30, n_components=n_components, method='standard')
        xy = lle.fit_transform(dmat)
    elif method == 'sklearn-tsne':
        tsneObj = TSNE(n_components=n_components, metric='precomputed', random_state=0, perplexity=kwargs['perplexity'])
        xy = tsneObj.fit_transform(dmat)
    elif method == 'umap':
        umapObj = umap.UMAP(n_components=n_components, metric='precomputed', **kwargs)
        xy = umapObj.fit_transform(dmat)
    else:
        print('Method unknown: %s' % method)
        return

    assert xy.shape[0] == dmatDf.shape[0]
    xyDf = pd.DataFrame(xy[:, :n_components], index=dmatDf.index, columns=np.arange(n_components))
    if method == 'kpca':
        """Not sure how negative eigenvalues should be handled here, but they are usually
        small so it shouldn't make a big difference"""
        setattr(xyDf, 'explained_variance_', pcaObj.lambdas_[:n_components]/pcaObj.lambdas_[pcaObj.lambdas_>0].sum())
    return xyDf
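A hypothetical usage of embedDistanceMatrix, assuming the function above and its imports are in scope:

import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform

X = np.random.rand(40, 8)
dmatDf = pd.DataFrame(squareform(pdist(X)))
xyDf = embedDistanceMatrix(dmatDf, method='mds', n_components=2)
print(xyDf.shape)    # (40, 2)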
Example #34
def create_mds(dissim_mat, embed_dimensions, metric=True, init_from_isomap=True):
    max_iter = 10000 if not get_setting("DEBUG") else 100
    if not init_from_isomap:
        warnings.warn("sklearn's MDS is broken!! Have to init from something, don't f*****g ask why!")
        n_inits = math.ceil((max(get_ncpu()*2, (10 if not get_setting("DEBUG") else 3)))/get_ncpu())*get_ncpu() # minimally 10, maximally ncpu*2, but in any case a multiple of ncpu
        print(f"Running {'non-' if not metric else ''}metric MDS {n_inits} times with {get_ncpu(ignore_debug=True)} jobs for max {max_iter} iterations.")
        embedding = MDS(n_components=embed_dimensions, dissimilarity="precomputed",
                        metric=metric,  # TODO: with metric=True it always breaks after the second step if n_components>>2 (well, it also breaks with metric=False^^)
                        n_jobs=get_ncpu(ignore_debug=True), verbose=1 if get_setting("VERBOSE") else 0, n_init=n_inits, max_iter=max_iter)
        mds = embedding.fit(dissim_mat)
    else:
        print(f"Running {'non-' if not metric else ''}metric MDS with {get_ncpu(ignore_debug=True)} jobs for max {max_iter} iterations, initialized from Isomap-Embeddings")
        embedding = MDS(n_components=embed_dimensions, dissimilarity="precomputed", metric=metric,
                        n_jobs=get_ncpu(ignore_debug=True), verbose=1 if get_setting("VERBOSE") else 0, n_init=1, max_iter=max_iter)
        try:
            isomap_init = create_isomap(dissim_mat, embed_dimensions, neighbor_factor=25).embedding_
        except ValueError: #There are significant negative eigenvalues...
            isomap_init = np.random.random((len(dissim_mat), embed_dimensions))*0.01
        mds = embedding.fit(dissim_mat, init=isomap_init)
    return mds
Example #35
def mds_util(k, metric):
    mds = MDS(n_components=k, metric=metric,
              max_iter=1000, eps=1e-9, dissimilarity="precomputed",
              n_jobs=1, random_state=3)

    mds_fit_out = mds.fit(cars_od)

    return  {
        'stress': mds_fit_out.stress_,
        'embedding': mds_fit_out.embedding_
    }
Example #36
class MDS_Reducer(Reducer):
    '''The multidimensional scaling (MDS) reduction method'''
    def __init__(self, dimensionality=2500, seed=None):
        rnd_state = np.random.RandomState(seed=seed)
        self.mds = MDS(n_components=dimensionality,
                       n_jobs=-1,
                       random_state=rnd_state,
                       dissimilarity="precomputed")

    def reduced(self, A):
        embd = self.mds.fit(A).embedding_
        return np.transpose(embd)
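A hypothetical usage of MDS_Reducer, assuming the class above and its imports are in scope; a small dimensionality is passed instead of the 2500-component default so the sketch runs on toy data:

import numpy as np
from scipy.spatial.distance import pdist, squareform

X = np.random.rand(30, 10)
D = squareform(pdist(X))                  # square symmetric distance matrix
reducer = MDS_Reducer(dimensionality=2, seed=0)
reduced = reducer.reduced(D)              # shape (2, 30): components x samples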
Example #37
def make_mds_image(m, filename, labels=None, colour=None):
    """Given a matrix of distances, project into 2D space using
    multi-dimensional scaling and produce an image."""

    mds_data_filename = filename + ".dat"

    try:
        # if we've previously computed, load it
        p = np.genfromtxt(mds_data_filename)
    except:
        # else, compute it now (and save)

        # Construct MDS object with various defaults including 2d
        mds = MDS(dissimilarity="precomputed")
        # Fit
        try:
            f = mds.fit(m)
        except ValueError as e:
            print("Can't run MDS for " + filename + ": " + str(e))
            return

        # Get the embedding in 2d space
        p = f.embedding_

        # save
        np.savetxt(mds_data_filename, p)

    # Make an image
    fig, ax = plt.subplots(figsize=(5, 5))
    # x- and y-coordinates
    ax.set_aspect('equal')

    ax.scatter(p[:, 0], p[:, 1], edgecolors='none')

    if labels is not None:
        print(filename)
        # hard-coded for GP depth-2
        indices = [0, 2, 50, 52]
        for i in indices:
            print(labels[i], p[i, 0], p[i, 1])
            # can print some labels directly on the graph as follows,
            # but maybe it's better done manually, after printing
            # their locations to terminal?

            # plt.text(p[i,0], p[i,1], labels[i], style='italic',
            #         bbox={'facecolor':'red', 'alpha':0.5, 'pad':10})

    fig.savefig(filename + ".pdf")
    fig.savefig(filename + ".eps")
    fig.savefig(filename + ".png")
    plt.close(fig)
Example #38
def make_mds_image(m, filename, labels=None, colour=None):
    """Given a matrix of distances, project into 2D space using
    multi-dimensional scaling and produce an image."""

    mds_data_filename = filename + ".dat"

    try:
        # if we've previously computed, load it
        p = np.genfromtxt(mds_data_filename)
    except:
        # else, compute it now (and save)
        
        # Construct MDS object with various defaults including 2d
        mds = MDS(dissimilarity="precomputed")
        # Fit
        try:
            f = mds.fit(m)
        except ValueError as e:
            print("Can't run MDS for " + filename + ": " + str(e))
            return

        # Get the embedding in 2d space
        p = f.embedding_

        # save
        np.savetxt(mds_data_filename, p)

    # Make an image
    fig, ax = plt.subplots(figsize=(5, 5))
    # x- and y-coordinates
    ax.set_aspect('equal')

    ax.scatter(p[:,0], p[:,1], edgecolors='none')

    if labels is not None:
        print(filename)
        # hard-coded for GP depth-2
        indices = [0, 2, 50, 52]
        for i in indices:
            print(labels[i], p[i,0], p[i,1])
            # can print some labels directly on the graph as follows,
            # but maybe it's better done manually, after printing
            # their locations to terminal?

            # plt.text(p[i,0], p[i,1], labels[i], style='italic',
            #         bbox={'facecolor':'red', 'alpha':0.5, 'pad':10})

    fig.savefig(filename + ".pdf")
    fig.savefig(filename + ".eps")
    fig.savefig(filename + ".png")
    plt.close(fig)
Example #39
def plot_mds(points, genres, n_points=500):
    '''
    Plots a set of documents in MDS space

    Args:
        points: dense array with coordinates of each document
        genres: list of genres for each entry in points
    Returns:
        None
    '''

    genres = np.array(genres)
    genre_sel = np.not_equal(genres, None)
    X, y = points[genre_sel], genres[genre_sel]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, train_size=n_points)

    distances = cosine_distances(X_train, X_train)
    mds = MDS(n_components=2, dissimilarity='precomputed')
    mds.fit(distances)

    plot_embedding(mds.embedding_, y_train)
Example #40
def calc_MDS_corr():
    mds_corr = MDS(n_components=2, max_iter=3000, eps=1e-9,
                   dissimilarity="precomputed", n_jobs=1)
    # print(data.corr())
    similarities_corr = np.array(1-abs(data.corr()))
    
    pos_corr = mds_corr.fit(similarities_corr).embedding_
    pos_corr_df = pd.DataFrame.from_records(pos_corr, columns=['x','y'])

    pos_corr_df["labels"] = list(data.columns)
    short_names = ['DT', 'CRS_DT', 'AT', 'CRS_AT', 'F_No', 'ActET', 'CRS_ET', 'ArrD', 'DepD', 'Dis']
    
    pos_corr_df['short_names'] = short_names
    
    return json.dumps(pos_corr_df.to_dict(orient="records"))
Example #41
File: mds.py Project: waldol1/formCluster
def reduction(simMat, N=2):

    # change similarity matrix into dissimilarity matrix
    dis = [[1 - y for y in x] for x in simMat]
    #dis = dist(simMat)
    #dis = simMat

    # configure MDS to run 10 times. Also specify that data will be a dissimilarity matrix
    mds = MDS(n_components=N, n_init=10, max_iter=3000, metric=True, dissimilarity="precomputed")
    mat = np.array(dis)

    # Run MDS
    fit = mds.fit(mat)
    print("Approximate Stress:", fit.stress_)
    print("Stress:", stress(dis, fit.embedding_))

    return fit.embedding_
Example #42
def mult_scl(X, labels):
    print('labels:')
    for i, label in zip(range(1, len(labels) + 1), labels):
        print('{}: {}'.format(i, label))

    isomap = Isomap()
    points = isomap.fit(np.nan_to_num(X)).embedding_
    f, (ax1, ax2, ax3) = plt.subplots(1, 3)
    plot_location(labels, ax3)
    ax1.scatter(points[:, 0], points[:, 1], s=20, c='r')
    ax1.set_title('Isomap')
    add_labels(labels, points, ax1)

    mds = MDS()
    points = mds.fit(np.nan_to_num(X)).embedding_
    ax2.scatter(points[:, 0], points[:, 1], s=20, c='g')
    ax2.set_title('MDS')
    add_labels(labels, points, ax2)

    plt.show()
Example #43
File: clustering.py Project: DimosGu/nba
def plot_clusters(scaled_features, cluster_obj):

    labels = cluster_obj.labels_
    clusters = len(np.unique(labels))

    norm = Normalize(min(labels), max(labels))
    cm = mpl.cm.jet

    mds = MDS(n_components=2)
    res = mds.fit(scaled_features)

    pos = res.embedding_
    offset_radius = 10
    cluster_thetas = np.linspace(0, 2 * np.pi, clusters + 1)[0:clusters]
    cluster_vectors = [(offset_radius * np.cos(theta), offset_radius * np.sin(theta)) for theta in cluster_thetas]

    for i, coords in enumerate(pos):
        label = labels[i]
        color = cm(norm(label))
        offset = cluster_vectors[label]
        mpl.plot(coords[0] + offset[0], coords[1] + offset[1], color=color, marker='o')

    mpl.show()
Example #44
def clustered_mds(cds, clusters=None, filename=None):    

    num_subj = cds.shape[0]
    num_voxels = cds.shape[1]
    clusters = cds.a.event_bounds
    num_clusters = len(clusters)
    ds_list = np.zeros((num_subj, num_voxels, num_clusters-1))
    prev_cutoff = 0
    ds_tup = ()
    
    # average correlations for each scene
    for i in range(num_clusters - 1):
        ds_list[:,:,i] = np.mean(cds.samples[:,:,clusters[i]:clusters[i+1]], axis=2)
       
    dsm_array = []    
    for subj in ds_list:        
        dsm_array.append(squareform(1 - pdist(subj.T, metric='correlation')))
        
    dsm = np.mean(dsm_array, axis=0)
    mds = MDS(n_components=2, max_iter=3000, eps=1e-9, dissimilarity="precomputed", n_jobs=1)
    coords = mds.fit(dsm).embedding_
    
    plt.clf()
    X, Y = coords[:,0], coords[:,1]
    labels = np.arange(1,num_clusters)
    fig = plt.figure(figsize=(10,8))
    ax = fig.add_subplot(111)
    plt.scatter(X,Y, marker='x')
    for i, label in enumerate(np.arange(1,num_clusters)):
        ax.annotate(label, (X[i],Y[i]))    
        
    plt.axis([np.min(X)*1.2, np.max(X)*1.2, np.min(Y)*1.2, np.max(Y)*1.2])
    plt.title("MDS Scene Visualization")
    plt.show()
    
    return dsm
Example #45
def multidimensional_scaling(rdm, labels):

    # perform multidimensional scaling
    mds = MDS(
        n_components=2,
        max_iter=3000,
        dissimilarity='precomputed'
    )

    positions = mds.fit(rdm).embedding_
    positions /= positions.max()

    # visualize the embedding in a figure
    figure = plt.figure(1)
    ax = plt.axes([0., 0., 1., 1.])

    plt.scatter(positions[:, 0], positions[:, 1])

    # plot the edges
    segments = [[positions[i, :], positions[j, :]] for i in range(len(positions)) for j in range(len(positions))]
    values = np.abs(rdm)
    lc = LineCollection(
        segments,
        zorder=0,
        cmap=plt.cm.YlGnBu,
        norm=plt.Normalize(0, values.max())
    )
    lc.set_array(rdm.flatten())
    lc.set_linewidths(2 * np.ones(len(segments)))
    ax.add_collection(lc)

    # add labels
    for index, label in enumerate(labels):
        plt.annotate(label, (positions[index, 0], positions[index, 1]))

    plt.show()
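A hypothetical usage of multidimensional_scaling, assuming the function above and its imports (numpy, pyplot, MDS, LineCollection) are in scope:

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

X = np.random.RandomState(0).rand(6, 4)
rdm = euclidean_distances(X)     # stand-in representational dissimilarity matrix
multidimensional_scaling(rdm, labels=['a', 'b', 'c', 'd', 'e', 'f'])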
Example #46
def calculate_and_cluster():
    
    # Variables for storing the data
    data_list = {}
    tag_list = {}
    tag_map = {}
    data_tag_map = {}
    counter = 0
    index = 0
    ptr = ""

    # Parse the CSV file (this will be denoted by a string variable)
    with open('../../data/sets/complete_set.csv', 'r', newline='') as csvfile:
        reader = csv.reader(csvfile,delimiter=',')
        for row in reader:
            data_list[counter] = ''.join(row)
            counter +=1
    counter = 0
    
    # Loop through data in range
    for data in range(0,len(data_list)):
        # Split the last token in the string
        split = data_list[data].split(" ")[-1:]
        # print split[0], "Tag set: ", get_tag_set(split[0])
        data_tag_map[split[0]] = get_tag_set(split[0])
    od = OrderedDict(sorted(data_tag_map.items()))
    
    names = []
    data_tagged_list = {}
    counter = 0
    for key, value in od.items():
        # Maintain old file name
        file_old = str(counter) + '.txt'
        
        tag = ''
        if len(value) == 1:
            tag = 'Tagged'
            names.append(str(counter) + "_" + tag)
            data_tagged_list[str(counter)] = True
        else:
            tag = 'Untagged'
            names.append(str(counter) + "_" + tag)
            data_tagged_list[str(counter)] = False
            
        # Create new file name with tagged / untagged appended
        file_new = str(counter) + '_' + tag + '.txt'
        # Rename the file for later use in color co-ordination
        rename_file(file_old,file_new)
        counter += 1
    
    
    dataNodes = []
    for x in range(0,len(data_list)):
        dataNodes.append(data_list[x])
    vect = TfidfVectorizer(min_df=1)

    tfidf = vect.fit_transform(dataNodes)
    X = genfromtxt('../semantic_similarity_algorithms/semantic_similarity_matrix/matrix.csv', delimiter=',')
    X = symmetrize(X)
    print((X.transpose() == X).all())
    # N Components: plotting points in a two-dimensional plane
    # Dissimilarity: "precomputed" because of the Distance Matrix
    # Random state is fixed so we can reproduce the plot.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    mds.fit(X.astype(np.float64))
    pos = mds.fit_transform(X)  # shape (n_components, n_samples)
    xs, ys = pos[:, 0], pos[:, 1]


    # Set figure size to have dimensions of at least 15 inches for the width.
    # Height can be scaled accordingly.
    plt.figure(figsize=(15,8))
    plt.subplot(211)
    
    # Loop through the points, label appropriately and scatter
    # Ensure figure size has enough room for legend plotting. Each plot must have a label.
    # In this case, label is the split value denoting the POI tag
    
    for x, y, name in zip(xs, ys, names):
        plt.scatter(x, y, s=100,c=get_colour_tag(name.split('_',1)[1]), label = name.split('_',1)[1])
        #plt.text(x,y,name.split('_',1)[0])
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = OrderedDict(zip(labels, handles))
    legend = plt.legend(by_label.values(), by_label.keys(),loc='lower center',ncol=4,bbox_to_anchor=(0.5, -0.6))
    
    plt.show()
Example #47
        for face in [0, 1, 2]:
            csvs = stats_dict[stat]["Fiducials"][face]
            for (i, j) in combinations(fiducials, 2):
                dist1 = read_csv(csvs[i], index_col=0).values
                dist2 = read_csv(csvs[j], index_col=0).values

                # Symmetrize
                dist1 += dist1.T
                dist2 += dist2.T

                # Run MDS and map to lower dim. Try 2 for visualizing.
                mds_1 = MDS(
                    n_components=2, max_iter=3000, eps=1e-9, random_state=seed, dissimilarity="precomputed", n_jobs=1
                )
                pos_1 = mds_1.fit(dist1).embedding_

                mds_2 = MDS(
                    n_components=2, max_iter=3000, eps=1e-9, random_state=seed, dissimilarity="precomputed", n_jobs=1
                )
                pos_2 = mds_2.fit(dist2).embedding_

                output = procrustes_analysis(pos_1, pos_2, nperm=10)

                proc_results[:, face, i, j] = output

            proc_fiducials[stat][face] = {"value": proc_results[0, face, :, :], "pvals": proc_results[1, face, :, :]}

            # Now compare fiducials to designs.
            csv_designs = stats_dict[stat]["Designs"][face]
Example #48
pl_en = {
    # ... earlier entries truncated in the source ...
    'Laboratorium fizyki 2': 'Physics Laboratory 2',
    'Analiza matematyczna 1': 'Mathematical Analysis 1',
    'Mechanika': 'Mechanics'
}

cl = 'L'

co_corr = np.corrcoef(win.getData(class_=cl), rowvar=0)
labels = [pl_en[x] for x in win.getCoursesNames()]

mds = MDS(n_components=2, dissimilarity='precomputed')
dists = np.empty((len(co_corr), len(co_corr)))
for ii in range(len(labels)):
    for jj in range(len(labels)):
        dists[ii][jj] = math.sqrt(2 * (1 - co_corr[ii][jj]))
pos = mds.fit(dists).embedding_

G = nx.Graph()
G.add_nodes_from(range(len(labels)))
textstr = ""
for ii, l in enumerate(labels):
    textstr += str(ii) + " - " + l + "\n"
    for jj in range(ii + 1, len(labels)):
        d = dists[ii][jj]
        G.add_edge(ii, jj, weight=d)

si = []
for n, nbrs in G.adjacency():
    w = 0
    for nbr, eattr in nbrs.items():
        w += 1 / eattr['weight']
Example #49
    def getMDS(self, featureMatrix, dist=None):
        if dist is None:
            dist = 1 - cosine_similarity(featureMatrix)
        mds = MDS(n_components=2, dissimilarity="precomputed", random_state=6)
        results = mds.fit(dist)
        return results.embedding_
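For context, a hypothetical standalone equivalent of the method above on a stand-in feature matrix:

import numpy as np
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_similarity

featureMatrix = np.random.RandomState(0).rand(12, 30)
dist = 1 - cosine_similarity(featureMatrix)
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=6)
coords = mds.fit(dist).embedding_    # matches getMDS(featureMatrix)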
Example #50
# CLUSTERING
# Create KMeans
kmeans = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', n_init=10, max_iter=300, tol=0.0001, verbose=0, random_state=None, copy_x=True)
# Train KMeans
kmeans.fit(data)

# Get the results
kmeans_labels = kmeans.labels_
kmeans_cluster_centers = kmeans.cluster_centers_
kmeans_labels_unique = np.unique(kmeans_labels)

####################################
# PLOT PREPARATION
# Reduce to two dimensions for plotting
mds = MDS(n_components=2)
mds.fit(data)
scaled_coordinates = mds.embedding_

# PLOT ON TWO DIMENSIONS
labelled_data_x = (dict(), dict())
labelled_data_y = (dict(), dict())
for label in kmeans_labels_unique:
    labelled_data_x[0][label] = []
    labelled_data_y[0][label] = []
    labelled_data_x[1][label] = []
    labelled_data_y[1][label] = []

for i in range(0, len(names)):
    label = kmeans_labels[i]
    labelled_data_x[survived[i]][label].append(scaled_coordinates[i][0])
    labelled_data_y[survived[i]][label].append(scaled_coordinates[i][1])
Example #51
########################

dimensions = np.arange(2, 30, 1)
stress_vector = np.zeros_like(dimensions, dtype=float)


for i, dim in enumerate(dimensions):

    # Define classifier
    n_comp = dim
    max_iter = 1000
    eps = 1e-9
    mds = MDS(n_components=n_comp, max_iter=max_iter, eps=eps,
              n_jobs=2, dissimilarity='precomputed')

    x = mds.fit(distances)
    stress = x.stress_ / distances.shape[0]
    print('Dimension', dim)
    print('The stress is', stress)
    stress_vector[i] = stress

########################
# Plot Here
########################

# Plot parameters
fontsize = 20
figsize = (16, 12)
axes_position = [0.1, 0.1, 0.8, 0.8]
title = 'Stress vs size of embedding space'
xlabel = 'Dimension'
Example #52
    def __plot_samples__(self, dfs, fold):
        """
        :type dfs: List[pandas DataFrame]      # [training df, testing df]
        :type fold: int
        :rtype: None
        """

        mds  = MDS(n_components=2, max_iter=3000, eps=1e-9, dissimilarity='euclidean', n_jobs=-1)
        tsne = TSNE(n_components=2)

        # change label to color index
        #   author 1 train (0 = light blue), author 1 test (1 = dark blue)
        #   author 2 train (2 = light green), author 2 test (3 = dark green)
        df_all = pd.DataFrame(columns = dfs[0].columns)
        df0_copy = dfs[0].copy()
        df0_copy.loc[(df0_copy.label ==  1).values, 'label'] = 0
        df0_copy.loc[(df0_copy.label == -1).values, 'label'] = 2
        df_all = pd.concat([df_all, df0_copy])

        df1_copy = dfs[1].copy()
        df1_copy.loc[(df1_copy.label ==  1).values, 'label'] = 1
        df1_copy.loc[(df1_copy.label == -1).values, 'label'] = 3
        df_all = pd.concat([df_all, df1_copy])

        legend = {0: 'Author 1 Training Sample',
                  1: 'Author 1 Test Sample',
                  2: 'Author 2 Training Sample' ,
                  3: 'Author 2 Test Sample' }

        # fit on training data
        pos_lst = [('Multi-Dimensional Scaling (MDS)',
                    mds.fit(df_all.drop('label', axis=1)).embedding_),

                   ('t-Distributed Stochastic Neighbor Embedding (TSNE)',
                    tsne.fit(df_all.drop('label', axis=1)).embedding_)]


        # plot
        colors = sns.color_palette('Paired', 4)
        fig = plt.figure(figsize=(16,7))

        for k, (title, pos) in enumerate(pos_lst, 1):

            ## fig.add_subplot() works in ipython notebook but creates a
            ## mysterious 3rd axes in python...
            # ax = fig.add_subplot(1,2,k)

            ax = plt.subplot(1,2,k)
            ax.set_title(title)

            for i in range(len(colors)):
                samples = pos[(df_all.label == i).values, :]
                ax.scatter(samples[:,0], samples[:,1],
                           c=colors[i], edgecolor='none',
                           label=legend[i])
            ax.legend()


        plt.savefig('../figs/' + \
                   self.__PG_STATS_TBL__[self.__PG_STATS_TBL__.find("_")+1:] + \
                   'fold' + str(fold) + '.png',
                   dpi=300, transparent=True)

        plt.close(fig)
Example #53
    def view_2d_embedding(self, reference=None):
        # http://baoilleach.blogspot.co.at/2014/01/convert-distance-matrix-to-2d.html

        if reference is None:
            # First cluster all structures based on pairwise RMSD
            db = self._cluster_dbscan()
            labels = db.labels_
            core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
            core_samples_mask[db.core_sample_indices_] = True
            unique_labels = set(labels)

            # Then calculate the 2D coordinates for our embedding
            mds = MDS(n_components=2,
                      dissimilarity="precomputed", random_state=6)
            results = mds.fit(self._rmsd)
            coords = results.embedding_

            # Now plot
            plt.plot(coords[:, 0], coords[:, 1], '-', color="blue")

            colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
            for k, col in zip(unique_labels, colors):
                if k == -1:
                    # Black used for noise.
                    col = 'k'
                class_member_mask = (labels == k)

                plt.plot(coords[:, 0][class_member_mask & core_samples_mask],
                         coords[:, 1][class_member_mask & core_samples_mask],
                         'o', markerfacecolor=col, markeredgecolor='k', markersize=6
                         )
                plt.plot(coords[:, 0][class_member_mask & ~core_samples_mask],
                         coords[:, 1][class_member_mask & ~core_samples_mask],
                         'o', markerfacecolor=col, markeredgecolor=col, markersize=1
                         )
            plt.savefig("embedding_{}.svg".format(self._cgs[0].name))
            plt.clf()
            plt.close()
        else:
            # Create a huge distance matrix
            alldists = np.zeros(
                ((len(self._cgs) + len(reference) + 1), (len(self._cgs) + len(reference) + 1)))
            for i, j in it.combinations(range(len(alldists)), 2):
                if i < len(self._cgs):
                    cg1 = self._cgs[i]
                elif i < len(self._cgs) + len(reference):
                    cg1 = reference[i - len(self._cgs)]
                else:
                    assert i == len(self._cgs) + len(reference)
                    cg1 = self._reference_cg
                if j < len(self._cgs):
                    cg2 = self._cgs[j]
                elif j < len(self._cgs) + len(reference):
                    cg2 = reference[j - len(self._cgs)]
                else:
                    assert j == len(self._cgs) + len(reference)
                    cg2 = self._reference_cg
                alldists[i, j] = alldists[j, i] = ftms.cg_rmsd(cg1, cg2)
            # Then calculate the 2D coordinates for our embedding
            mds = MDS(n_components=2,
                      dissimilarity="precomputed", random_state=6)
            results = mds.fit(alldists)
            coords = results.embedding_
            # Now plot
            plt.plot(coords[len(self._cgs):len(self._cgs) + len(reference), 0],
                     coords[len(self._cgs):len(self._cgs) + len(reference), 1], 's', color="green")
            plt.plot(coords[:len(self._cgs), 0],
                     coords[:len(self._cgs), 1], '-o', color="blue")
            plt.plot([coords[-1, 0]], [coords[-1, 1]], 's', color="red")
            plt.savefig("embedding1_{}.svg".format(self._cgs[0].name))
            plt.clf()
            plt.close()
Example #54
def calculate_and_cluster():

    global names
    global data_list
    global data_tag_map
    global matrix_list
    global data_tagged_list

    data_list = {}
    data_tag_map = {}
    data_tagged_list = {}
    matrix_list = []

    counter = 0
    # Parse the CSV file (this will be denoted by a string variable)
    with open('../../../data/sets/complete_set.csv', 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            data_list[counter] = ''.join(row)
            counter += 1

    # Loop through data in range
    for data in range(0, len(data_list)):
        # Split the last token in the string
        split = data_list[data].split(" ")[-1:]
        # print split[0], "Tag set: ", get_tag_set(split[0])
        data_tag_map[split[0]] = get_tag_set(split[0])
    od = OrderedDict(sorted(data_tag_map.items()))

    names = []
    counter = 0
    for key, value in od.items():
        # Maintain old file name
        file_old = str(counter) + '.txt'
        if len(value) == 1:
            tag = 'Tagged'
            data_tagged_list[str(counter)] = True
        else:
            tag = 'Untagged'
            data_tagged_list[str(counter)] = False
        names.append(str(counter) + "_" + tag)

        # Create new file name with tagged / untagged appended
        file_new = str(counter) + '_' + tag + '.txt'
        # Rename the file for later use in color co-ordination
        rename_file(file_old, file_new)
        counter += 1

    dataNodes = [data_list[x] for x in range(len(data_list))]

    # Generate matrix from file
    X = genfromtxt('matrix.csv', delimiter=',')

    # Symmetrize X to ensure the matrix is valid
    X = symmetrize(X)

    # Put matrix in a list for checking
    matrix_list = X.tolist()

    # Tag any untagged entries using their nearest neighbour
    for x in range(len(matrix_list)):
        if not get_tagged(str(x)):
            tag_nearest_neighbour(x)


    # Check symmetry
    print "Symmetric? " + str((X.transpose() == X).all())
    # N Components: plotting points in a two-dimensional plane
    # Dissimilirity: "precomputed" because of the Distance Matrix
    # Random state is fixed so we can reproduce the plot.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(X.astype(np.float64))  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]

    # Make the figure at least 15 inches wide so the legend has room; the height scales accordingly.
    plt.figure(figsize=(15, 8))
    plt.subplot(211)

    # Scatter each point with a label; every point needs one so the legend can
    # be built. Here the label is the name suffix, which denotes the POI tag.
    for x, y, name in zip(xs, ys, names):
        plt.scatter(x, y, s=100, c=get_colour_tag(name.split('_', 1)[1]), label=name.split('_', 1)[1])
        #plt.text(x,y,name.split('_',1)[0])
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = OrderedDict(zip(labels, handles))
    plt.legend(by_label.values(), by_label.keys(), loc='lower center', ncol=4, bbox_to_anchor=(0.5, -0.6))

    plt.show()

    # Create a dendrogram
    linkage_matrix = ward(X)

    # match dendrogram to that returned by R's hclust()
    dendrogram(linkage_matrix, orientation="right")
    plt.tight_layout()
    plt.show()
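Helpers such as get_tag_set, get_tagged, tag_nearest_neighbour, get_colour_tag, rename_file and symmetrize are defined elsewhere in the project. As a rough sketch of what symmetrize presumably does (an assumption; the project might instead take the element-wise max or min), it forces X[i, j] == X[j, i], which MDS(dissimilarity="precomputed") requires:

import numpy as np

def symmetrize(X):
    # assumed behaviour: average each entry with its mirror image
    return (X + X.T) / 2.0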
Example #55
0
    def vis_MDS(self, dmatrix, ven_names, fout=None):
        """
        Displays MDS graph of venues.

        :param dmatrix: distance matrix
        :type dmatrix: numpy.ndarray
        :param ven_names: names of venues in matrix
        :type ven_names: list
        :param fout: save graph to file
        :type fout: str
        :return: None
        :rtype: None
        """
        # setup plot figure
        with plt.style.context('ggplot'):
            # 'axisbg' was removed from matplotlib; 'facecolor' is the current kwarg
            fig, ax = plt.subplots(subplot_kw=dict(facecolor='#EEEEEE'))
            ax.grid(color='white', linestyle='solid', linewidth=2)
            fig = plt.gcf()
            fig.set_dpi(100)
            fig.set_size_inches((8.0, 8.0), forward=True)
            plt.subplots_adjust(left=0.10, bottom=0.10, right=0.95, top=0.95)
            plt.gca().grid(True)
            plt.axis([-1.0, 1.0, -1.0, 1.0])

            ttl = plt.title('MDS: top 30 Venues')

            # get MDS coordinates for venues
            myMDS = MDS(n_components=2, verbose=0, n_jobs=-1, dissimilarity='precomputed')
            myMDS.fit(dmatrix)
            points = myMDS.embedding_

            # add column to points to hold venue categories
            points = np.c_[points, np.zeros(len(points))]

            # create high-level categories and manually categorize top 30 venues
            # TODO: use Foursquare's categories json to get higher-level categories for venues
            CONVENTIONS = 2
            THEME_PARKS = 4
            STADIUMS = 5
            AIRPORTS = 9
            airports = [0, 1, 5, 6, 11, 12, 28]
            conventions = [15, 18, 19, 29]
            theme_parks = [2, 8, 17]
            stadiums = [3, 4, 9, 10, 14, 24]
            others = [7, 13, 16, 20, 21, 22, 23, 25, 26, 27]
            for ind in airports:
                points[ind, 2] = AIRPORTS
            for ind in conventions:
                points[ind, 2] = CONVENTIONS
            for ind in theme_parks:
                points[ind, 2] = THEME_PARKS
            for ind in stadiums:
                points[ind, 2] = STADIUMS

            # pd.tools.plotting._get_standard_colors is a private pandas helper
            # that was later removed; a public colormap does the same job
            colors = plt.cm.tab10(np.linspace(0, 1, 5))

            airports_pts = np.stack([points[ind] for ind in airports])
            conventions_pts = np.stack([points[ind] for ind in conventions])
            theme_parks_pts = np.stack([points[ind] for ind in theme_parks])
            stadiums_pts = np.stack([points[ind] for ind in stadiums])
            others_pts = np.stack([points[ind] for ind in others])

            air = plt.scatter(airports_pts[:, 0], airports_pts[:, 1], marker='o', color=colors[0], s=70,
                              edgecolor='black', linewidth=0.5)
            con = plt.scatter(conventions_pts[:, 0], conventions_pts[:, 1], marker='o', color=colors[1], s=70,
                              edgecolor='black', linewidth=0.5)
            theme = plt.scatter(theme_parks_pts[:, 0], theme_parks_pts[:, 1], marker='o', color=colors[2], s=70,
                                edgecolor='black', linewidth=0.5)
            sta = plt.scatter(stadiums_pts[:, 0], stadiums_pts[:, 1], marker='o', color=colors[3], s=70,
                              edgecolor='black', linewidth=0.5)
            oth = plt.scatter(others_pts[:, 0], others_pts[:, 1], marker='o', color=colors[4], s=70,
                              edgecolor='black', linewidth=0.5)

            # make legend
            legend = plt.legend((air, con, theme, sta, oth),
                                ('Airports', 'Conventions', 'Theme Parks', 'Stadiums', 'Other'),
                                scatterpoints=1,
                                loc='lower left',
                                ncol=2,
                                fontsize=8)
            frame = legend.get_frame()
            frame.set_facecolor('#cccccc')
            frame.set_edgecolor('#909090')

            # make labels as annotations
            for label, x, y in zip(ven_names, points[:, 0], points[:, 1]):
                plt.annotate(
                    label,
                    xy=(x, y), xytext=(0, 5),
                    textcoords='offset points', ha='center', va='bottom',
                    size='xx-small')

            # adjust tick labels
            plt.tick_params(axis='both', which='major', labelsize=6, color='gray')
            plt.tick_params(axis='both', which='minor', labelsize=6, color='gray')

            # turn off ticks
            # (the per-tick tick1On/tick2On attributes were removed from
            # matplotlib; hide the marks via tick_params instead)
            plt.gca().tick_params(axis='both', which='both', length=0)

            if fout is None:
                plt.show()
            else:
                fig.savefig(fout)
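vis_MDS expects dmatrix to be precomputed. A minimal sketch of building one, assuming each venue is described by a feature vector (venue_vectors below is hypothetical illustration data, not the project's real input):

import numpy as np
from sklearn.metrics.pairwise import cosine_distances

# hypothetical: one feature vector per venue, rows aligned with ven_names
venue_vectors = np.random.rand(30, 50)
dmatrix = cosine_distances(venue_vectors)
# vis_MDS(dmatrix, ven_names, fout='top30_mds.svg')

Note that MDS coordinates are not guaranteed to land inside the fixed [-1, 1] axis window set above; small cosine distances usually do, but other metrics may need autoscaled limits.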
Example #56
0
    def vis_super_MDS(self, dmatrix, fout=None):
        """
        Displays an MDS graph of a large set of venues (top 1200).

        :param dmatrix: distance matrix
        :type dmatrix: numpy.ndarray
        :param fout: save graph to file
        :type fout: str
        :return: None
        :rtype: None
        """
        with plt.style.context('ggplot'):
            # setup plot figure
            # 'axisbg' was removed from matplotlib; 'facecolor' is the current kwarg
            fig, ax = plt.subplots(subplot_kw=dict(facecolor='#EEEEEE'))
            ax.grid(color='white', linestyle='solid', linewidth=2)
            fig = plt.gcf()
            fig.set_dpi(100)
            fig.set_size_inches((8.0, 8.0), forward=True)
            plt.subplots_adjust(left=0.10, bottom=0.10, right=0.95, top=0.95)
            plt.gca().grid(True)
            plt.axis([-1.0, 1.0, -1.0, 1.0])

            ttl = plt.title('MDS: top 1200 Venues')

            # get MDS coordinates for venues
            myMDS = MDS(n_components=2, verbose=0, n_jobs=-1, dissimilarity='precomputed')
            myMDS.fit(dmatrix)
            points = myMDS.embedding_

            # add column to points to hold venue categories
            # TODO: higher-level categories from Foursquare's categories json
            points = np.c_[points, np.zeros(len(points))]
            categories = sq.get_categories(9)
            cat2num = {}
            for ind, cat in enumerate(categories):
                cat2num[cat] = ind
            for i, point in enumerate(points):
                point[2] = cat2num[self.vens[i].cat_name]

            # plot the points
            # pass the category codes via `c` so that `cmap` actually maps them
            plt.scatter(points[:, 0], points[:, 1], marker='o', c=points[:, 2], s=30,
                        cmap=plt.get_cmap('jet'), edgecolor='black', linewidth=0.5, alpha=0.6)

            # TODO: make labels as mouse-overs or something like that
            # for label, x, y in zip(ven_names, points[:, 0], points[:, 1]):
            #     plt.annotate(
            #         label,
            #         xy=(x, y), xytext=(0, 5),
            #         textcoords='offset points', ha='center', va='bottom',
            #         size='x-small')

            # adjust tick labels
            plt.tick_params(axis='both', which='major', labelsize=6, color='gray')
            plt.tick_params(axis='both', which='minor', labelsize=6, color='gray')

            # turn off ticks
            # (the per-tick tick1On/tick2On attributes were removed from
            # matplotlib; hide the marks via tick_params instead)
            plt.gca().tick_params(axis='both', which='both', length=0)

            if fout is None:
                plt.show()
            else:
                fig.savefig(fout)
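For the mouse-over labels left as a TODO above, one option (an assumption about tooling, not something the project uses) is the mplcursors package, which attaches hover annotations to an existing scatter artist:

import mplcursors

# sc is the PathCollection returned by plt.scatter(...);
# ven_names is the label list from the commented-out annotation code
cursor = mplcursors.cursor(sc, hover=True)

@cursor.connect("add")
def _on_add(sel):
    # sel.index (mplcursors >= 0.4) is the index of the hovered point
    sel.annotation.set_text(ven_names[sel.index])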
Example #57
0
def get_twodim_reps(reps, seed, distance=euclidean_distances):
    reps = reps.astype(np.float64)
    distances = distance(reps)  # a pairwise distance matrix, not similarities
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=seed)
    return mds.fit(distances).embedding_
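A quick usage sketch for get_twodim_reps; the random matrix is illustration data only:

import numpy as np
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
reps = rng.rand(50, 128)   # e.g. 50 items with 128-dimensional representations
coords = get_twodim_reps(reps, seed=0)
print(coords.shape)        # (50, 2)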