Example No. 1
# Assumed imports for this example: scapy for packet I/O, scikit-learn and
# numpy for the embedding/clustering; sequencealignment() is a helper defined
# elsewhere in the original project (not shown here).
import sys, getopt, copy, binascii
from itertools import cycle
import numpy as np
from numpy import zeros
from scapy.all import rdpcap, TCP, UDP
from sklearn import manifold
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.decomposition import PCA
import pylab as pl

def main(argv):
        inputfile = ''
        outputfile = ''
        hookfile = ''
        try:
                opts, args = getopt.getopt(argv,"hi:o:t:",["ifile=","ofile=","transformation="])
        except getopt.GetoptError:
                print sys.argv[0] + ' -i inputfile -o outputfile -t [hook script]'
                print sys.argv[0] + ' -i sample.pcap -o result.pcap -t example'
                sys.exit(2)
        for opt, arg in opts:
                if opt == '-h':
                        print sys.argv[0] + ' -i <inputfile> -o <outputfile> -t <hook script>'
                        sys.exit()
                elif opt in ("-i", "--ifile"):
                        inputfile = arg
                elif opt in ("-o", "--ofile"):
                        outputfile = arg
                elif opt in ("-t", "--transformation"):
                        hookfile = arg
        
        #copy packet data to dictionary object
        pktdict={}
        pkts=rdpcap(inputfile)
        i=0
        for pkt in pkts:
            try:
                my_array = []
                if pkt.haslayer(TCP):
                    for d in str(pkt.getlayer(TCP).payload):
                        my_array.append(d)
                if pkt.haslayer(UDP):
                    for d in str(pkt.getlayer(UDP).payload):
                        my_array.append(d)

                #reverse packet for backtrace on Needleman-Wunsch
                pktdict[i] = list("".join(reversed(my_array)))
                i=i+1
            except:
                raise

        #Create distance matrix
        dictSize = len(pktdict)
        diffMatrix = zeros((dictSize,dictSize))

        x=0
        while x < dictSize:
                y=0
                print ""
                print "Packet " + str(x) + ": " + str(pktdict[x])
                while y < dictSize:
                        #calculate common substring length between packets
                        #similarity = lcs(pktdict[x], pktdict[y])
                        gms, similarity, distance, alignedseq1Discard, alignedseq2Discard = sequencealignment(pktdict[x], pktdict[y])
                        #distance = 1 - (similarity + 1)/2
                        print "Packet " + str(x) + " similarity to packet " + str(y) + " = " + str(similarity)
                        print "Packet " + str(x) + " distance from packet " + str(y) + " = " + str(distance)
                        
                        #assign the value to both symmetrically opposite cells,
                        #as the alignment distance is symmetric
                        diffMatrix[x][y]=distance
                        diffMatrix[y][x]=distance
                        y=y+1
                x=x+1
        
        print " "
        print "Distance Matrix:"
        print diffMatrix
        print ""
        
        #Multi-Dimensional Scaling from distances to XY points
        #
        # Source: http://scikit-learn.org/stable/auto_examples/manifold/plot_mds.html#example-manifold-plot-mds-py
        #
        seed = np.random.RandomState(seed=3)
        mds = manifold.MDS(n_components=2, max_iter=1000, eps=0.8, random_state=seed, dissimilarity="precomputed", n_jobs=1)
        pos = mds.fit(diffMatrix).embedding_
        pos *= np.sqrt(100000) / np.sqrt((pos ** 2).sum())
        clf = PCA(n_components=2)
        pos = clf.fit_transform(pos)
        
        #Display embedded coordinates
        print "Coordinates of plotted packets: "
        for p in pos:
            print p.astype(int)
        
        #Calculate number of clusters
        #
        # Source: http://scikit-learn.org/stable/auto_examples/cluster/plot_mean_shift.html
        #
        print ""
        bandwidth = estimate_bandwidth(pos, quantile=0.2, n_samples=500)
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        ms.fit(pos)
        labels = ms.labels_
        cluster_centers = ms.cluster_centers_
        labels_unique = np.unique(labels)
        n_clusters_ = len(labels_unique)
        print("Estimated number of clusters (k): %d" % n_clusters_)
        
        #Plot on graph
        pl.figure(1)
        pl.clf()
        colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
        for k, col in zip(range(n_clusters_), colors):
            my_members = labels == k
            cluster_center = cluster_centers[k]
            pl.plot(pos[my_members, 0], pos[my_members, 1], col + '.')
            print ""
            print "Cluster: " + str(k)
            #print str(my_members)
            
            #Create GMS for cluster using Needleman-Wunsch
            #
            #This section is not part of the clustering code
            #
            clusterPackets = []
            origionalClusterPackets = []
            offset = 0
            #extract packets from each cluster
            for val in my_members:
                if val:
                    clusterPackets.append(pktdict[offset])
                offset += 1
            origionalClusterPackets = copy.deepcopy(clusterPackets)
            
            print 'Compressing GMS .',
            #compress all GMS pairs to single GMS for the cluster
            while len(clusterPackets) > 1:
                print '.',
                gmsList1 = [];
                #calculate generic message sequence for each pair of messages
                for i in xrange(len(clusterPackets) - 1):
                    current_item, next_item = clusterPackets[i], clusterPackets[i + 1]
                    gms, totalMatch, totalDifference, alignedseq1Discard, alignedseq2Discard = sequencealignment(current_item, next_item)
                    gmsList1.append(gms)
                clusterPackets = copy.deepcopy(gmsList1)
            print ""
            gmspkt = list(reversed(clusterPackets[0]))
            #gmsbin = str("".join(gmspkt))
            
            #compress all substitution characters to a single character
            beforeGmsLen = len(gmspkt)+1
            afterGmsLen = len(gmspkt)
            while(beforeGmsLen > afterGmsLen):
                beforeGmsLen = len(gmspkt)
                for i in xrange(len(gmspkt) - 1, 0, -1):
                    if(gmspkt[i] == "-" and gmspkt[i-1] == "-"):
                        del gmspkt[i]
                afterGmsLen = len(gmspkt)
            print str(gmspkt)
            #print list(reversed(gmspkt))
            print ""
            #enumerate ngrams in variable data
            clusterTokens = [];
            for clusPkt in origionalClusterPackets:
                gmsDiscard, similarityDiscard, distanceDiscard, alignedseq1Keep, alignedseq2Keep = sequencealignment(clusPkt, list(reversed(gmspkt)))
                pktAlignedGMS1 = list(reversed(alignedseq1Keep))
                GMSAlignedData = list(reversed(alignedseq2Keep))
                tmpData = copy.deepcopy(GMSAlignedData)
                packettokens = [];
                packettoken = [];
                gmsoffset = 0
                while gmsoffset < len(GMSAlignedData):
                    if pktAlignedGMS1[gmsoffset]:
                        if(pktAlignedGMS1[gmsoffset] != "-"):
                            tmpData[gmsoffset] = "-"
                    gmsoffset+=1;
                packettokens = copy.deepcopy(tmpData)
                
                splittoken = [];
                splittokens = [];
                gmsoffset = 0
                while gmsoffset < len(packettokens):
                    if packettokens[gmsoffset]:
                        if(packettokens[gmsoffset] != "-"):
                            splittoken.append(copy.deepcopy(packettokens[gmsoffset]))
                        if(gmsoffset+1 < len(packettokens)):
                            if(packettokens[gmsoffset+1] == "-"):
                                if(len(splittoken) > 0):
                                    #append a copy: appending splittoken itself
                                    #would alias the list, and the del below
                                    #would empty the stored entry as well
                                    splittokens.append(copy.deepcopy(splittoken))
                                    del splittoken[:]
                    gmsoffset+=1
                
                clusterTokens.append(splittokens)
                
                print
                print "GMS: " + str(pktAlignedGMS1)
                print "Data: " + str(GMSAlignedData)
                print "Masked Data: " + str(tmpData)
                print "Tokens: " + str(packettokens)
                print "Split Tokens: " + str(splittokens)
                print
                
            print ""
            
            #infer token data type
            #integer, float, character, string
            fieldtype = "Blob"
            for tokens in clusterTokens:
                for token in tokens:
                    singleToken = ''.join(token)
                    newToken = str(singleToken)
                    if newToken == 'True' or newToken == 'False':
                        fieldtype = "Flag"
                    else:
                        try:
                            int(newToken)
                            fieldtype = "Number"
                        except ValueError:
                            try:
                                float(newToken)
                                fieldtype = "Number"
                            except ValueError:
                                fieldtype = "Blob"
            
            chunkOffset = 0;
            staticFieldBuff = [];
            fieldSwitch = 0;
            fieldLength = 1;
            print '<DataModel name="cluster' + str(k) + '">'
            while chunkOffset < len(gmspkt):
                
                #print conditions
                if gmspkt[chunkOffset]:
                    if(gmspkt[chunkOffset] != "-"):
                        staticFieldBuff.append(gmspkt[chunkOffset])
                    if(chunkOffset+1 < len(gmspkt)):
                        if(gmspkt[chunkOffset] != "-" and gmspkt[chunkOffset+1] == "-"):
                            #print '<Blob valueType="hex" value="' + str(staticFieldBuff).replace("[", "").replace("]", "").replace("'", "").replace("\\x", "") + '" mutable="false"/>'
                            print '<Blob valueType="hex" value="',
                            for c in staticFieldBuff:
                                print binascii.hexlify(c),
                            print '" mutable="false"/>'
                            del staticFieldBuff[:]
                           
                    if(chunkOffset == 0 and gmspkt[chunkOffset] == "-"):
                        fieldSwitch = 1
                    if(chunkOffset-1 > 0):
                        if(gmspkt[chunkOffset] == "-" and gmspkt[chunkOffset-1] != "-"):
                            fieldSwitch = 1
                    
                if(fieldSwitch == 1):
                    print '<' + str(fieldtype) + ' mutable="true"/>'
                    fieldLength=0
                    fieldSwitch = 0
                    
                chunkOffset+=1
                fieldLength+=1
                
            print '</DataModel>'
            
            
            pl.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=8)
        pl.title('Estimated number of clusters: %d' % n_clusters_)
        pl.show()
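A minimal Python 3 sketch of the core pipeline above (precomputed distance matrix -> MDS -> MeanShift); the random symmetric matrix stands in for the alignment distances, and all names here are illustrative rather than taken from the original script:

import numpy as np
from sklearn import manifold
from sklearn.cluster import MeanShift, estimate_bandwidth

# stand-in for the alignment-based distance matrix
rng = np.random.RandomState(3)
raw = rng.rand(20, 20)
diffMatrix = (raw + raw.T) / 2.0
np.fill_diagonal(diffMatrix, 0.0)

# embed the distances into 2-D coordinates
mds = manifold.MDS(n_components=2, dissimilarity="precomputed", random_state=3)
pos = mds.fit_transform(diffMatrix)

# cluster the embedded points and count the clusters
bandwidth = estimate_bandwidth(pos, quantile=0.2)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(pos)
print("Estimated number of clusters (k): %d" % len(np.unique(ms.labels_)))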
Example No. 2
    ax.yaxis.set_major_formatter(NullFormatter())
    plt.axis('tight')

t0 = time()
Y = manifold.Isomap(n_neighbors, n_components).fit_transform(X)
t1 = time()
print("Isomap: %.2g sec" % (t1 - t0))
ax = fig.add_subplot(257)
plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral)
plt.title("Isomap (%.2g sec)" % (t1 - t0))
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
plt.axis('tight')

t0 = time()
mds = manifold.MDS(n_components, max_iter=100, n_init=1)
Y = mds.fit_transform(X)
t1 = time()
print("MDS: %.2g sec" % (t1 - t0))
ax = fig.add_subplot(258)
plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral)
plt.title("MDS (%.2g sec)" % (t1 - t0))
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
plt.axis('tight')

t0 = time()
se = manifold.SpectralEmbedding(n_components=n_components,
                                n_neighbors=n_neighbors)
Y = se.fit_transform(X)
t1 = time()
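For context, a self-contained version of this timing-comparison pattern, with scikit-learn's S-curve dataset standing in for the original X (an assumption, not from the snippet above):

from time import time
import matplotlib.pyplot as plt
from sklearn import datasets, manifold

X, color = datasets.make_s_curve(1000, random_state=0)
for name, model in [("Isomap", manifold.Isomap(n_neighbors=10, n_components=2)),
                    ("MDS", manifold.MDS(n_components=2, max_iter=100, n_init=1))]:
    t0 = time()
    Y = model.fit_transform(X)
    t1 = time()
    print("%s: %.2g sec" % (name, t1 - t0))
    plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral)
    plt.title("%s (%.2g sec)" % (name, t1 - t0))
    plt.show()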
Example No. 3
    def perform(self):
        mds = skm.MDS(n_components=self.n_components, random_state=self.random_state)
        transform = mds.fit_transform(self.data)
        return transform
# Making Rips simplicial complex
rc = gudhi.RipsComplex(distance_matrix=df, max_edge_length=0.005)
st = rc.create_simplex_tree(max_dimension=2)

# We are only going to plot the triangles, edges and points
triangles = np.array([s[0] for s in st.get_skeleton(2) if len(s[0]) == 3])
duzi = np.array([s[0] for s in st.get_skeleton(1) if len(s[0]) == 2])   # 'duzi' = edges
tacke = np.array([s[0] for s in st.get_skeleton(0) if len(s[0]) == 1])  # 'tacke' = points
print(triangles)
print()
print(duzi)
print()
print(tacke)

# Making 3D coordinates out of distance matrix
mds = manifold.MDS(n_components=3, dissimilarity="precomputed", random_state=6)
results = mds.fit(df)
coords = results.embedding_

fig = plt.figure()
ax = fig.add_subplot(projection='3d')

# Plotting points and naming them (the 3-D axes method is needed here;
# plt.scatter would treat the third column as marker sizes)
ax.scatter(coords[:, 0], coords[:, 1], coords[:, 2])
for label, x, y, z in zip(likovi, coords[:, 0], coords[:, 1], coords[:, 2]):  # 'likovi' = labels
    ax.text(x, y, z, label)

# Plotting edges
if 0 != len(duzi):
    points = np.array(coords)
    edges = np.array(duzi)
Example No. 5
        files += dir_files
        # for file in files:
        #     vectors.append(dict_from_file(file))
    diss = np.ndarray(shape=(sum(lengths.values()), sum(lengths.values())),
                      dtype=np.float32)
    queries = []
    for index, file in enumerate(files):
        diss[index, index] = 0
        for index2 in range(index + 1, len(files)):
            queries.append([index, index2, file, files[index2], args.z])
    pool = Pool(os.cpu_count())
    results = pool.starmap(dist_between_files, queries, chunksize=1)
    for result in results:
        diss[result[0], result[1]] = result[2]
        diss[result[1], result[0]] = result[2]
    mds = manifold.MDS(dissimilarity='precomputed')
    coords = mds.fit(diss).embedding_
    if args.no_draw:
        # Data exist, but need to be dumped, not stored
        with open(args.data_filename + '.lengths', mode='w') as lenfile:
            for x in lengths:
                print(x + '\t' + str(lengths[x]), file=lenfile)
        np.savetxt(args.data_filename + '.coords', coords)
        print('Data written to {}'.format(args.data_filename))

if not args.no_draw:
    # If something is to be drawn, `lengths` and `coords` should be set.
    if not lengths and not coords:
        lengths = OrderedDict()
        for line in open(args.data_filename + '.lengths'):
            l = line.split('\t')
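The parallel fill of the symmetric dissimilarity matrix above is a reusable pattern; a self-contained sketch, with a toy distance function standing in for dist_between_files:

import os
import numpy as np
from multiprocessing import Pool

def toy_dist(i, j, a, b):
    # stand-in for dist_between_files: returns (row, col, distance)
    return i, j, abs(a - b)

if __name__ == '__main__':
    items = [1.0, 4.0, 2.5, 0.5]
    n = len(items)
    diss = np.zeros((n, n), dtype=np.float32)
    queries = [(i, j, items[i], items[j])
               for i in range(n) for j in range(i + 1, n)]
    with Pool(os.cpu_count()) as pool:
        for i, j, d in pool.starmap(toy_dist, queries, chunksize=1):
            diss[i, j] = diss[j, i] = d
    print(diss)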
Example No. 6
def do_scree_plot(A):
    num_vars = 191
    U, S, V = np.linalg.svd(A)
    eigvals = S**2 / np.cumsum(S)[-1]

    fig = plt.figure(figsize=(8,5))
    sing_vals = np.arange(num_vars) + 1
    sing_vals = sing_vals[:9]
    eigvals = eigvals[:9]
    # getting rid of the first one
    sing_vals = sing_vals[1:]
    eigvals = eigvals[1:]

    plt.plot(sing_vals, eigvals, 'ro-', linewidth=2)
    plt.title('Scree Plot')
    plt.xlabel('Principal Component')
    plt.ylabel('Eigenvalue')
    leg = plt.legend(['Eigenvalues from SVD'], loc='best', borderpad=0.3,
                     shadow=False, prop=matplotlib.font_manager.FontProperties(size='small'),
                     markerscale=0.4)
    leg.get_frame().set_alpha(0.4)
    leg.draggable(state=True)
    plt.show()


fold = manifold.MDS(n_components=2, dissimilarity='precomputed')
fold.fit_transform(A)
do_scree_plot(A)
#print fold.stress_
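One caveat worth flagging: S**2 / np.cumsum(S)[-1] divides the squared singular values by the sum of the singular values themselves, so the plotted values are not proportions of variance. If that is the intent, the usual normalization is:

eigvals = S**2 / (S**2).sum()   # explained-variance fractions, summing to 1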
Example No. 7
    def silhouette(self, range_n_clusters, cluster_labelss):
        X = self.ndf
        for n_cluster in range_n_clusters:
            fig, (ax1, ax2) = plt.subplots(1, 2)
            fig.set_size_inches(12, 6)

            ax1.set_xlim([-0.1, 1])
            ax1.set_ylim([0, len(X) + (n_cluster + 1) * 10])

            cluster_labels = cluster_labelss[n_cluster - 2]

            # categories, cluster_labels, cluster_centers_, summary = self.kmeans_fit_predict(n_cluster, preproc)

            silhouette_avg = silhouette_score(X, cluster_labels)
            print("For n_clusters =", n_cluster,
                  "The average silhouette_score is :", silhouette_avg)

            # Compute the silhouette scores for each sample
            sample_silhouette_values = silhouette_samples(X, cluster_labels)

            y_lower = 10
            for i in range(n_cluster):
                ith_cluster_silhouette_values = \
                    sample_silhouette_values[cluster_labels == i]

                ith_cluster_silhouette_values.sort()

                size_cluster_i = ith_cluster_silhouette_values.shape[0]
                y_upper = y_lower + size_cluster_i

                color = cm.nipy_spectral(float(i) / n_cluster)  # cm.spectral was renamed nipy_spectral in matplotlib 2.2
                ax1.fill_betweenx(np.arange(y_lower, y_upper),
                                  0,
                                  ith_cluster_silhouette_values,
                                  facecolor=color,
                                  edgecolor=color,
                                  alpha=0.7)

                ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

                y_lower = y_upper + 10  # 10 for the 0 samples

            ax1.set_title("The silhouette plot for the various clusters.")
            ax1.set_xlabel("The silhouette coefficient values")
            ax1.set_ylabel("Cluster label")

            # The vertical line for average silhouette score of all the values
            ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

            ax1.set_yticks([])  # Clear the yaxis labels / ticks
            ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

            # mds
            similarities = euclidean_distances(X)
            mds = manifold.MDS(n_components=2,
                               max_iter=3000,
                               eps=1e-9,
                               random_state=random_state,
                               dissimilarity="precomputed",
                               n_jobs=1)
            pos = mds.fit(similarities).embedding_
            df_pos = pd.DataFrame(pos, columns=["comp1", "comp2"])
            df_pos["pred"] = cluster_labels

            for i in range(n_cluster):
                color = cm.nipy_spectral(float(i) / n_cluster)
                ax2.scatter(df_pos[df_pos["pred"] == i].iloc[:, 0],
                            df_pos[df_pos["pred"] == i].iloc[:, 1],
                            c=color)

            ax2.set_title("The visualization of the clustered data.")
            ax2.set_xlabel("Feature space for the 1st MDS feature")
            ax2.set_ylabel("Feature space for the 2nd MDS feature")

            plt.suptitle(
                ("Silhouette analysis for KMeans clustering on sample data "
                 "with n_clusters = %d" % n_cluster),
                fontsize=14,
                fontweight='bold')
            # end mds
            plt.show()
Example No. 8
def visualize(df, cluster_labels, n_clusters, n_iterations):
    """ Visualize the points in a n-dimensional space and the silhouette for each cluster"""
    # Dimension for visualization
    target_dimension = 2
    cluster_labels = np.array(cluster_labels)
    mds = manifold.MDS(target_dimension, max_iter=100, n_init=1)
    X = mds.fit_transform(df)

    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    ax1.set_xlim([-0.1, 1])

    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(df, cluster_labels)

    y_lower = 10
    num_elements = len(cluster_labels)
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them

        ith_cluster_silhouette_values = np.array([
            sample_silhouette_values[k] for k in range(num_elements)
            if cluster_labels[k] == i
        ])

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0,
                          ith_cluster_silhouette_values,
                          facecolor=color,
                          edgecolor=color,
                          alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    # ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(X[:, 0],
                X[:, 1],
                marker='.',
                s=30,
                lw=0,
                alpha=0.7,
                c=colors,
                edgecolor='k')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = {} and n_iterations = {}").format(
                      n_clusters, n_iterations + 1),
                 fontsize=14,
                 fontweight='bold')

    plt.show()
Example No. 9
def clusters(master, model_name, fld_save, D=2, use_bias=True, n_batch=1):

    n_sample = BATCH_SIZE * n_batch
    method = 'MDS'
    #method = 'tSNE'
    #method = 'isomap'

    latent_d = dict()
    colors = {
        'base_conv': 'y',
        'base_resp': 'r',
        'bias_conv': 'k',
        'bias_nonc': 'b',
    }

    print('building data...')
    d_inp_enc = master.dataset.feed_data('test',
                                         max_n=n_sample,
                                         check_src=True,
                                         mix_ratio=(0., 1.))['inp_enc']
    latent_d['base_conv'] = master.model_encoder['S2S'].predict(
        d_inp_enc['ctxt'])
    if use_bias and 'AE' in master.prefix:
        latent_d['bias_nonc'] = master.model_encoder['AE'].predict(
            d_inp_enc['nonc'])
    #if use_bias and 'bias_conv' in master.dataset.files['test']:
    #	d_inp_enc = master.dataset.feed_data('test', max_n=n_sample, check_src=True, mix_ratio=(1.,0.))['inp_enc']
    #	latent_d['bias_conv'] = master.model_encoder['S2S'].predict(d_inp_enc['ctxt'])
    #else:
    d_inp_enc = master.dataset.feed_data('test',
                                         max_n=n_sample,
                                         check_src=True,
                                         mix_ratio=(0., 0.))['inp_enc']
    if 'AE' in master.prefix:
        #latent_d['base_nonc'] = master.model_encoder['AE'].predict(d_inp_enc['nonc'])
        latent_d['base_resp'] = master.model_encoder['AE'].predict(
            d_inp_enc['resp'])

    labels = list(sorted(latent_d.keys()))
    fname_suffix = args.restore.split('/')[-1].replace('.npz', '')
    if use_bias:
        fname_suffix += '_wbias'
    n_labels = len(labels)
    latent = np.concatenate([latent_d[k] for k in labels], axis=0)
    print('latent.shape', latent.shape)

    print('plotting bit hist...')
    bins = np.linspace(-1, 1, 31)
    for k in latent_d:
        l = latent_d[k].ravel()
        freq, _, _ = plt.hist(l, bins=bins, color='w')
        plt.plot(bins[:-1], 100. * freq / sum(freq), colors[k] + '.-')
    plt.ylim([0, 50])
    plt.savefig(fld_save + '/hist_%s.png' % fname_suffix)
    plt.close()

    print('plotting dist mat...')
    d_norm = np.sqrt(latent.shape[1])
    f, ax = plt.subplots()
    cax = ax.imshow(dist_mat(latent) / d_norm, cmap='bwr')
    #ax.set_title(model_name)
    f.colorbar(cax)

    ticks = []
    ticklabels = []
    n_prev = 0
    for i in range(n_labels):
        ticks.append(n_prev + n_sample / 2)
        ticklabels.append(labels[i] + '\n')
        ticks.append(n_prev + n_sample)
        ticklabels.append('%i' % (n_sample * (i + 1)))
        n_prev = n_prev + n_sample
    ax.set_xticks(ticks)
    ax.set_xticklabels(ticklabels)
    ax.xaxis.tick_top()
    ax.set_yticks(ticks)
    ax.set_yticklabels([s.strip('\n') for s in ticklabels])

    plt.savefig(fld_save + '/dist_%s.png' % fname_suffix)
    plt.close()

    if method == 'tSNE':
        approx = manifold.TSNE(init='pca', verbose=1).fit_transform(latent)
    elif method == 'MDS':
        approx = manifold.MDS(D, verbose=1, max_iter=500,
                              n_init=1).fit_transform(latent)
    elif method == 'isomap':
        approx = manifold.Isomap().fit_transform(latent)
    else:
        raise ValueError

    f, ax = plt.subplots()
    for k in labels:
        ax.plot(np.nan, np.nan, colors[k] + '.', label=k)

    jj = list(range(approx.shape[0]))
    np.random.shuffle(jj)
    for j in jj:
        i_label = int(j / n_sample)
        ax.plot(approx[j, 0], approx[j, 1], colors[labels[i_label]] + '.')

    #plt.legend(loc='best')
    plt.title(model_name)
    #ax.set_xticks([])
    #ax.set_yticks([])
    plt.savefig(fld_save + '/%s_%s.png' % (method, fname_suffix))
    plt.show()
def plot_iris_mds():
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    # MDS

    fig = pylab.figure(figsize=(10, 4))

    ax = fig.add_subplot(121, projection='3d')
    # ax.set_axis_bgcolor('white')

    mds = manifold.MDS(n_components=3)
    Xtrans = mds.fit_transform(X)

    for cl, color, marker in zip(np.unique(y), colors, markers):
        ax.scatter(
            Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], Xtrans[y == cl][:, 2], c=color, marker=marker,
            edgecolor='black')
    pylab.title("MDS on Iris data set in 3 dimensions")
    ax.view_init(10, -15)

    mds = manifold.MDS(n_components=2)
    Xtrans = mds.fit_transform(X)

    ax = fig.add_subplot(122)
    for cl, color, marker in zip(np.unique(y), colors, markers):
        ax.scatter(
            Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], c=color, marker=marker, edgecolor='black')
    pylab.title("MDS on Iris data set in 2 dimensions")

    filename = "mds_demo_iris.png"
    pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")

    # PCA

    fig = pylab.figure(figsize=(10, 4))

    ax = fig.add_subplot(121, projection='3d')
    # ax.set_axis_bgcolor('white')

    pca = decomposition.PCA(n_components=3)
    Xtrans = pca.fit(X).transform(X)

    for cl, color, marker in zip(np.unique(y), colors, markers):
        ax.scatter(
            Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], Xtrans[y == cl][:, 2], c=color, marker=marker,
            edgecolor='black')
    pylab.title("PCA on Iris data set in 3 dimensions")
    ax.view_init(50, -35)

    pca = decomposition.PCA(n_components=2)
    Xtrans = pca.fit_transform(X)

    ax = fig.add_subplot(122)
    for cl, color, marker in zip(np.unique(y), colors, markers):
        ax.scatter(
            Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], c=color, marker=marker, edgecolor='black')
    pylab.title("PCA on Iris data set in 2 dimensions")

    filename = "pca_demo_iris.png"
    pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
Example No. 11
    def renderD3(self, enc_email_addr):

        try:
            email = base64.b64decode(enc_email_addr)
            domain_list = self.load(email)
            count = len(domain_list)

            K = 3

            # compute distance matrix for all domains
            similarity = []

            # compute distance between two domains
            def domain_similarity(s1, s2):

                if len(s1) > len(s2):
                    s1, s2 = s2, s1
                distances = range(len(s1) + 1)

                for i2, c2 in enumerate(s2):
                    distances_ = [i2 + 1]
                    for i1, c1 in enumerate(s1):
                        if c1 == c2:
                            distances_.append(distances[i1])
                        else:
                            distances_.append(1 + min((distances[i1],
                                                       distances[i1 + 1],
                                                       distances_[-1])))
                    distances = distances_
                return distances[-1]

            # clustering all points according to given centroid
            def cluster_points(X, mu):
                clusters = {}
                for x in X:
                    bestmukey = min([(i[0], np.linalg.norm(x - mu[i[0]])) \
                                     for i in enumerate(mu)], key=lambda t: t[1])[0]
                    try:
                        clusters[bestmukey].append(x)
                    except KeyError:
                        clusters[bestmukey] = [x]
                return clusters

            # relocate centroids
            def reevaluate_centers(mu, clusters):
                newmu = []
                keys = sorted(clusters.keys())
                for key in keys:
                    newmu.append(np.mean(clusters[key], axis=0))
                return newmu

            # check convergence of centroids
            def has_converged(mu, oldmu):
                return (set([tuple(a)
                             for a in mu]) == set([tuple(a) for a in oldmu]))

            # find stable centroids
            def find_centroids(X, k):
                # Initialize to K random centers
                oldmu = random.sample(X, k)
                mu = random.sample(X, k)
                while not has_converged(mu, oldmu):
                    oldmu = mu
                    # Assign all points in X to clusters
                    clusters = cluster_points(X, mu)
                    # Reevaluate centers
                    mu = reevaluate_centers(oldmu, clusters)
                return (mu, clusters)

            # Euclidean distance
            def Eu_distance(P1, P2):
                dist = np.sqrt(
                    pow((P1[0] - P2[0]), 2) + pow((P1[1] - P2[1]), 2))
                return dist

            # Find corresponding domain name for given coordinates
            def find_domain(coordinates):
                for m in range(0, count):
                    if coordinates[0] == coords[m][0] and coordinates[
                            1] == coords[m][1]:
                        return str(domain_list[m])

            def find_result(X, k):
                (M, C) = find_centroids(X, k)

                # change to integer coordinates
                for l in range(0, k):
                    for point_index in range(0, len(C[l])):
                        C[l][point_index] = [
                            int(C[l][point_index][0]),
                            int(C[l][point_index][1]),
                            find_domain(C[l][point_index])
                        ]

                # find actual center
                for i in range(0, k):
                    dis_array = []
                    for point in C[i]:
                        dis_array.append(Eu_distance(point, M[i]))
                    index = dis_array.index(min(dis_array))
                    # Store center
                    center_point = C[i].pop(index)
                    C[i].insert(0, center_point)
                    C[str(i)] = C.pop(i)

                return C

            for count_index1 in range(0, count):
                tmp = []
                for count_index2 in range(0, count):
                    if count_index1 == count_index2:
                        simi = 0
                    elif count_index1 < count_index2:
                        simi = domain_similarity(domain_list[count_index1],
                                                 domain_list[count_index2])
                    else:
                        simi = similarity[count_index2][count_index1]
                    tmp.append(simi)
                similarity.append(tmp)

            # scale the distance matrix
            adist = np.array(similarity)
            adist = adist * 10

            # compute coordinates matrix
            mds = manifold.MDS(n_components=2,
                               dissimilarity="precomputed",
                               random_state=6)
            results = mds.fit(adist)
            coords = results.embedding_

            output_data = find_result(coords, K)

            return {"data": json.dumps(output_data), "email": email}

        except:
            return {"data": "null", "email": "Something's wrong with the URL!"}
Example No. 12
    total_pt = []
    total_data = []
    for i in range(len(GPARAMS.Esoinn_setting.Model.learn_history_node)):
        total_pt.append([])
        total_pt[-1].append(len(total_data))
        total_data = total_data + list(
            GPARAMS.Esoinn_setting.Model.learn_history_node[i])
        total_pt[-1].append(len(total_data))
    total_pt.append([])
    total_pt[-1].append(len(total_data))
    total_data += list(GPARAMS.Esoinn_setting.Model.nodes)
    total_pt[-1].append(len(total_data))
    similarities = euclidean_distances(np.array(total_data))
    mds = manifold.MDS(n_components=2,
                       max_iter=500,
                       eps=1e-7,
                       dissimilarity="precomputed",
                       n_jobs=GPARAMS.Compute_setting.Ncoresperthreads)
    pos = mds.fit(similarities).embedding_
    total_2D_data = []
    for i in range(len(GPARAMS.Esoinn_setting.Model.learn_history_node)):
        total_2D_data.append([])
        total_2D_data[-1] = pos[total_pt[i][0]:total_pt[i][1]]
    total_2D_data.append([])
    total_2D_data[-1] = pos[total_pt[-1][0]:total_pt[-1][1]]
    with open("ESOI-Layer.History", 'wb') as file:
        pickle.dump(total_2D_data, file)
else:
    with open("ESOI-Layer.History", 'rb') as file:
        total_2D_data = pickle.load(file)
Example No. 13
        return 1


# create Input class instance and vectorize quantizer (note: vectorizing
# quantizer is default behavior but can be set to False if quantizer already vectorized)
data_class = Input(data=X,
                   is_categorical=True,
                   is_synchronized=True,
                   preproc=q)

# decide on number of dimensions to use (default is 2)
num_dim = 2

# instantiate another embedding class if desired
# e.g. sklearn.manifold.MDS
mds_emb = manifold.MDS(n_components=num_dim, dissimilarity="precomputed")

# create SmashEmbedding class to run methods (require 2 dimensions in embedding)
sec = SmashEmbedding(bin_path=bin_path,
                     input_class=data_class,
                     n_dim=num_dim,
                     embed_class=mds_emb)

# return distance matrix of input timeseries data (repeat calculation 3 times)
# NOTE: fits both default Sippl Embedding and user-defined custom embedding class
print(sec.fit(nr=3))

# return embedded coordinates using Sippl embedding (default) on distance matrix
print(sec.fit_transform(nr=3, embedder='default'))

Example No. 14
import matplotlib.pyplot as plt

import mpl_toolkits.mplot3d as plt3d

#dotpath='../dataset/total_graph.dot'
dotpath = '../dataset/game_of_thrones_consistent.dot'
similarities, G, nodes_index = gsm.get_similarity_matrix(dotpath)

seed = np.random.RandomState(seed=3)
print(nx.info(G))
#similarities,  G, nodes_index = GetSimlarityMatrix(dotpath)

mds = manifold.MDS(n_components=3,
                   max_iter=300,
                   eps=1e-9,
                   random_state=seed,
                   dissimilarity="precomputed",
                   n_jobs=1)
pos = mds.fit(similarities).embedding_

fig = plt.figure()
ax = plt.axes(projection='3d')

X, Y, Z = pos.T[0], pos.T[1], pos.T[2]

color = []

ax.scatter3D(X, Y, Z, c='r', cmap='Greens')

#for e in G.edges():
Example No. 15
def mds():
  print("MDS embedding is selected")
  embedder = manifold.MDS(n_components=n_components, n_init=1, max_iter=100)
  return embedder
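Usage would then be along the following lines, assuming the module-level n_components that mds() relies on is defined:

import numpy as np

X = np.random.rand(50, 8)      # illustrative data, (n_samples, n_features)
embedder = mds()
Y = embedder.fit_transform(X)
print(Y.shape)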
Example No. 16
    def __init__(self, dimensions=2, metric=True, clusters=2, **kwargs):
        self.graph_to_points = manifold.MDS(dimensions, metric=metric, dissimilarity='precomputed', **kwargs)
        # self.graph_to_points = manifold.TSNE()
        self.cluster_engine = GaussianMixture(clusters)
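The rest of the class isn't shown, so here is a hedged sketch of the same two steps used directly, with graph distances precomputed via networkx (the karate-club graph and two-cluster setting are illustrative assumptions):

import networkx as nx
import numpy as np
from sklearn import manifold
from sklearn.mixture import GaussianMixture

G = nx.karate_club_graph()
dist = np.asarray(nx.floyd_warshall_numpy(G))   # all-pairs shortest-path distances

points = manifold.MDS(n_components=2, metric=True,
                      dissimilarity='precomputed').fit_transform(dist)
labels = GaussianMixture(n_components=2).fit_predict(points)
print(labels)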
Example No. 17
X_lle = clf.fit_transform(X)
plot_embedding(X_lle, "LLE")

clf = manifold.LocallyLinearEmbedding(n_neighbors,
                                      n_components=level,
                                      method='modified')
X_mlle = clf.fit_transform(X)
plot_embedding(X_mlle, "Modified LLE")

clf = manifold.LocallyLinearEmbedding(n_neighbors,
                                      n_components=level,
                                      method='hessian')
X_hlle = clf.fit_transform(X)
plot_embedding(X_hlle, "LLE Hessian")

clf = manifold.MDS(n_components=level, n_init=20, max_iter=100)
X_mds = clf.fit_transform(X)
plot_embedding(X_mds, "MDS")

hasher = ensemble.RandomTreesEmbedding(n_estimators=100)
X_transformed = hasher.fit_transform(X)
pca = decomposition.TruncatedSVD(n_components=level)
X_reduced = pca.fit_transform(X_transformed)
plot_embedding(X_reduced, "Random forest")

embedder = manifold.SpectralEmbedding(n_components=level)
X_se = embedder.fit_transform(X)
plot_embedding(X_se, "Spectral embedding")
plotly_embedding(X_se, "Spectral embedding")

tsne = manifold.TSNE(n_components=level, init='pca', random_state=0)
Example No. 18
corrFrame = pd.DataFrame(data=corrmatrix,
                         columns=[
                             'Number', 'Team', 'Age', 'Height', 'Weight',
                             'College', 'Country', 'Draft Year', 'Draft Round',
                             'Draft Number', 'GP', 'PTS', 'REB', 'AST',
                             'NetRtg', 'OREB%', 'DREB%', 'USG%', 'TS%', 'AST%'
                         ])

correlationMatrixOfficial = corrFrame.to_numpy()  # .as_matrix() was removed in pandas 1.0

origData = origDataFrame.to_numpy()

correlationMatrixOfficial = abs(1 - correlationMatrixOfficial)

mds = manifold.MDS(dissimilarity="precomputed")

attrmds = mds.fit(correlationMatrixOfficial).embedding_

np.savetxt("AttributeMds.csv", attrmds, delimiter=',')

mds = manifold.MDS(dissimilarity="euclidean")

origmds = mds.fit(origData).embedding_

np.savetxt("EuclideanMds.csv", origmds, delimiter=',')

corrFrame.to_csv(path_or_buf='correlationMatrix.csv')

pcaPlotXY = [[0.0 for x in range(2)] for y in range(numpyArray.shape[0])]
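A note on the abs(1 - correlationMatrixOfficial) step above: for correlations r in [-1, 1], 1 - r already lies in [0, 2], so the abs() is a no-op. A common alternative that behaves like a proper distance is sqrt(2(1 - r)):

import numpy as np

# r: a square matrix of correlations in [-1, 1] (e.g. corrFrame.to_numpy())
r = np.array([[1.0, 0.3], [0.3, 1.0]])
dist = np.sqrt(2.0 * (1.0 - r))   # 0 when r == 1, maximal when r == -1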
pylab.scatter(x,y_test8[:n,1],marker='*',s=200,
              color='darkgreen',label='Real data 2')
pylab.plot(x,y_test825[:n,0],lw=2,
           color='steelblue',label='Kernel Ridge 1')
pylab.plot(x,y_test825[:n,1],lw=2,
           color='seagreen',label='Kernel Ridge 2')
pylab.xlabel('Observations'); pylab.ylabel('Targets') 
pylab.title('Kernel Ridge Regressor. Test Results. Toy Regression 2')
pylab.legend(loc=2,fontsize=10); pylab.show()

"""# Unsupervised Learning"""

usl=[mixture.GaussianMixture(n_components=4,n_init=4),
     mixture.BayesianGaussianMixture(n_components=4,n_init=4),
     manifold.Isomap(),manifold.LocallyLinearEmbedding(),
     manifold.SpectralEmbedding(),manifold.MDS(),manifold.TSNE()]
# Gaussian Mixture; Toy blobs
usl[0].fit(X_train9,y_train9); y_test91=usl[0].predict(X_test9)
usl[1].fit(X_train9,y_train9); y_test92=usl[1].predict(X_test9)

pylab.figure(figsize=(12,12))
pylab.scatter(X_test9[:,0],X_test9[:,1],c=y_test9,cmap=pylab.cm.tab10)
pylab.scatter(X_test9[:,0]+0.03,X_test9[:,1]+0.03,
              c=y_test91,alpha=0.4,cmap=pylab.cm.autumn)
pylab.scatter(X_test9[:,0]+0.06,X_test9[:,1]+0.06,
              c=y_test92,alpha=0.4,cmap=pylab.cm.winter)
pylab.scatter([1,-1,1,-1],[1,-1,-1,1],c='black',marker='*',s=150)
pylab.show()

"""# Neural Networks
supervised
Example No. 20
def plot_2D_distance_projection(rmsd_m, clusters_list, colors, logname):
    """
    DESCRIPTION
    This function will create a 2D distance projection graph with the MDS methods
    Args:
        rmsd_m (np.array) : rmsd matrix (between clusters)
        clusters_list (list of Cluster): list of Clusters
    Return:
        None
    """
    labels = range(1, len(clusters_list) + 1)
    # 1 - normalise the RMSD matrix to values between 0 and 1
    rmsd_norm = rmsd_m / np.max(rmsd_m)
    rmsd_norm = symmetrize_matrix(rmsd_norm)
    # 2 - create the MDS methods
    # mds = manifold.MDS(n_components=2, dissimilarity="euclidean", random_state=4)
    mds = manifold.MDS(n_components=2,
                       dissimilarity="precomputed")  # , random_state=2)

    # 3 - MDS projection
    rmsd_mds = mds.fit(rmsd_norm)
    # rmsd_mds = mds.fit(rmsd_m)

    # 4 - get X/Y coords
    coords = rmsd_mds.embedding_

    # 5 - get spread and normalise
    spreads = []
    for clust in clusters_list:
        spreads.append(clust.spread)
    spreads = np.array(spreads)
    #    spreads_norm = spreads / np.max(spreads)
    # minspread = np.min(spreads_norm)+0.05*np.min(spreads_norm)
    radii = np.pi * (25 * (spreads)**2)  # radii = 5 to 20
    x = coords[:, 0]
    y = coords[:, 1]

    # 6 - plot graph
    fig = plt.figure()
    ax = plt.subplot(111)

    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

    scatter = ax.scatter(x, y, s=radii, c=colors, alpha=0.5)
    for label, x, y in zip(labels, x, y):
        plt.annotate(label, xy=(x, y), ha='left', va='bottom', fontsize=8)

    # set the same axis for X and Y

    lims = []
    lims.extend(ax.get_xlim())
    lims.extend(ax.get_ylim())
    ax.set_ylim((min(lims), max(lims)))
    ax.set_xlim((min(lims), max(lims)))

    plt.title("Relative distance between clusters")
    plt.tick_params(
        axis='x',  # changes apply to the x-axis
        which='both',  # both major and minor ticks are affected
        bottom='off',  # ticks along the bottom edge are off
        top='off',  # ticks along the top edge are off
        labelbottom='off')  # labels along the bottom edge are off
    plt.tick_params(
        axis='y',  # changes apply to the y-axis
        which='both',  # both major and minor ticks are affected
        left="off",
        right="off",
        labelleft='off')  # labels along the bottom edge are off

    # 7 - circle bar
    max_size = max(radii)
    min_size = min(radii)
    min_color = colors[np.argmin(radii)]
    max_color = colors[np.argmax(radii)]

    # add transparency
    min_color[-1] = 0.5
    max_color[-1] = 0.5
    leg_min = plt.scatter([], [],
                          s=min_size,
                          edgecolor='black',
                          color=min_color)
    leg_max = plt.scatter([], [],
                          s=max_size,
                          edgecolor='black',
                          color=max_color)
    labels = ["{:.2f}".format(min(spreads)), "{:.2f}".format(max(spreads))]

    legend = ax.legend([leg_min, leg_max],
                       labels,
                       ncol=1,
                       frameon=False,
                       fontsize=8,
                       handlelength=2,
                       loc="upper right",
                       borderpad=1.8,
                       handletextpad=1,
                       scatterpoints=1,
                       bbox_to_anchor=(1.3, 0.9))
    legend.set_title('Spread radius', prop={"size": "small"})

    # Add Text for distance information
    min_rmsd = np.min(rmsd_m[np.nonzero(rmsd_m)])
    max_rmsd = np.max(rmsd_m[np.nonzero(rmsd_m)])
    text_distance = (
        "RMSD\n   min : {:.2f} $\\AA$\n   max : {:.2f} $\\AA$".format(
            min_rmsd, max_rmsd))

    # plt.gca().add_artist(legend1)
    ax.annotate(text_distance,
                xy=(1.05, 0.5),
                xycoords="axes fraction",
                fontsize="small")

    plt.savefig("{0}/{1}-dist.png".format(logname,
                                          logname.split(os.sep)[-1]),
                format="png",
                dpi=DPI,
                transparent=True)
    plt.close()
Example No. 21
algorithms = [
    decomposition.TruncatedSVD, manifold.MDS, manifold.Isomap,
    manifold.LocallyLinearEmbedding, manifold.TSNE
]

fname = sys.argv[1]
algorithm = int(sys.argv[2])
n_comps = int(sys.argv[3])

x = np.loadtxt(fname)

if algorithm == 0:
    model = decomposition.TruncatedSVD(n_components=n_comps)
    X = model.fit_transform(x)
elif algorithm == 1:
    model = manifold.MDS(n_components=n_comps)
    X = model.fit_transform(x)
elif algorithm == 2:
    model = manifold.Isomap(n_components=n_comps)
    X = model.fit_transform(x)
elif algorithm == 3:
    model = manifold.TSNE(n_components=n_comps)
    X = model.fit_transform(x)
elif algorithm == 4:
    n_points, input_size = x.shape
    som_size = int(np.sqrt(n_points) / 2)
    model = MiniSom(som_size,
                    som_size,
                    input_size,
                    sigma=0.9,
                    learning_rate=0.5)
Example No. 22
def k_means(weights, word):

    print('Start Kmeans:')
    true_k = 5
    # Set the parameter of K-means
    clf = KMeans(n_clusters=true_k, max_iter=500,
                 n_init=50)  #need to find a good way to set the K
    s = clf.fit(weights)
    print(s)

    # print centroid points
    print(clf.cluster_centers_)

    # Print clusters for each samples
    label = []
    print(clf.labels_)
    i = 1
    while i <= len(clf.labels_):
        print(i, clf.labels_[i - 1])
        label.append(clf.labels_[i - 1])
        i = i + 1

    # evaluate the number of clusters
    print(clf.inertia_)

    # Print top terms
    print("Top terms:")
    order_centroids = clf.cluster_centers_.argsort()[:, ::-1]
    for i in range(true_k):
        print("Cluster %d:" % i, )
        for ind in order_centroids[i, :10]:
            print(' %s' % word[ind], )
            print(clf.cluster_centers_[i][ind])   # weight of the term in this centroid
    print()

    #PCA
    #pca = PCA(n_components=3)             # Set the output dimension
    #newData = pca.fit_transform(weights)   # Put the data in
    #print (newData)
    #MDS
    mds = manifold.MDS(n_components=2, dissimilarity='euclidean')
    newData = mds.fit_transform(weights)

    #visualisation: collect the 2-D MDS coordinates of each cluster
    xs = [[] for _ in range(true_k)]
    ys = [[] for _ in range(true_k)]
    for i, lab in enumerate(clf.labels_):
        xs[lab].append(newData[i][0])
        ys[lab].append(newData[i][1])

    # produce the diagram
    plt.title('K-means Clustering with MDS', fontsize=20)
    plt.xlabel('Dimension 1', fontsize=15)
    plt.ylabel('Dimension 2', fontsize=15)

    for cluster_xs, cluster_ys, style in zip(xs, ys, ['or', 'og', 'ob', 'ok', 'oy']):
        plt.plot(cluster_xs, cluster_ys, style)
    #plt.savefig('k-means.png', dpi=500)
    plt.show()
    plt.close()

    return 1
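Since weights here is typically a tf-idf (or similar) document-term matrix, cosine distances are a common alternative to the Euclidean dissimilarity; a sketch of that variant:

from sklearn import manifold
from sklearn.metrics.pairwise import cosine_distances

mds = manifold.MDS(n_components=2, dissimilarity='precomputed')
newData = mds.fit_transform(cosine_distances(weights))   # weights as in k_means above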
#Operate on random 2.5% sample of headlines
seeder = 1918
random.seed(seeder)
nsamp = 6000
index = random.sample(range(0,len(headlines)),nsamp)
sample = [headlines[i] for i in index]

#Get transformed sparse matrix
sparse = tfidf.fit_transform(sample)

#Calculate matrix of (dis)similarities
similarities = euclidean_distances(sparse)

tic = timeit.default_timer()
#Project via multi-dimensional scaling
mds = manifold.MDS(n_components=2, max_iter=1000, eps=1e-6, random_state=seeder,
                   dissimilarity="precomputed", n_jobs=1)
project = mds.fit(similarities).embedding_

#Varimax rotation
pca = PCA(n_components=2)
project = pca.fit_transform(project)

toc = timeit.default_timer()
timer = '%.2f' %((toc - tic)/60)
logging.info("Time elapsed for MDS projection, %d sample size: %s mins", nsamp, timer)

test = []
for (i,samplehead,row) in zip(index,sample,project):
	w = [i,samplehead.encode('utf-8')]
	for dim in row:
		w.append(dim)
Example No. 24
def myfunction():
    randomData, stratifiedData, anotherX, targetForStrat, targetForRand, targetForOrg, attributeNames, latitude, longitude, stratLat, stratLong, numberInEachState, avArray = Task1.task1(
    )
    #randData_std, stratData_std, orgData_std, targetForStrat, targetForRand, targetForOrg, attributeNames = Task1.task1()
    # I standardize/center the data --> MAKE SURE IT CENTERS
    stratData_std = StandardScaler().fit_transform(stratifiedData)
    randData_std = StandardScaler().fit_transform(randomData)
    orgData_std = StandardScaler().fit_transform(anotherX)

    #pca = decomposition.PCA(n_components=3)
    pcaStrat = decomposition.PCA()
    pcaRand = decomposition.PCA()
    pcaOrg = decomposition.PCA()

    # I transform the data and get respective eigenvalues
    sklearn_pcaStrat = pcaStrat.fit_transform(stratData_std)
    sklearn_pcaRand = pcaRand.fit_transform(randData_std)
    sklearn_pcaOrg = pcaOrg.fit_transform(orgData_std)

    stratEigVal = pcaStrat.explained_variance_
    randEigVal = pcaRand.explained_variance_
    orgEigVal = pcaOrg.explained_variance_

    sumOfStratEig = 0
    sumOfRandEig = 0
    sumOfOrgEig = 0

    contSumOfStratEig = [None] * 10
    contSumOfRandEig = [None] * 10
    contSumOfOrgEig = [None] * 10

    #calculate sum of all eigenval
    for i in range(0, 10):
        sumOfStratEig = stratEigVal[i] + sumOfStratEig
        sumOfRandEig = randEigVal[i] + sumOfRandEig
        sumOfOrgEig = orgEigVal[i] + sumOfOrgEig

        contSumOfStratEig[i] = sumOfStratEig
        contSumOfRandEig[i] = sumOfRandEig
        contSumOfOrgEig[i] = sumOfOrgEig

    stratVarArray = [None] * 10
    randVarArray = [None] * 10
    orgVarArray = [None] * 10

    #calculate variance array
    for i in range(0, 10):
        stratVarArray[i] = stratEigVal[i] / sumOfStratEig
        randVarArray[i] = randEigVal[i] / sumOfRandEig
        orgVarArray[i] = orgEigVal[i] / sumOfOrgEig

    sumStratVar = 0
    sumRandVar = 0
    sumOrgVar = 0

    #get the sum of variances (total variance)
    for i in range(0, 10):
        sumStratVar = sumStratVar + stratVarArray[i]
        sumRandVar = sumRandVar + randVarArray[i]
        sumOrgVar = sumOrgVar + orgVarArray[i]

    tempStratVarSum = 0
    tempRandVarSum = 0
    tempOrgVarSum = 0

    stratIntrDimCount = 0
    randIntrDimCount = 0
    orgIntrDimCount = 0

    #get when 75% of the total variance occurred
    for i in range(0, 10):
        tempStratVarSum = tempStratVarSum + stratVarArray[i]
        tempRandVarSum = tempRandVarSum + randVarArray[i]
        tempOrgVarSum = tempOrgVarSum + orgVarArray[i]
        if ((tempStratVarSum > (sumStratVar * .75))
                and (stratIntrDimCount == 0)):
            stratIntrDimCount = i
        if ((tempRandVarSum > (sumRandVar * .75)) and (randIntrDimCount == 0)):
            randIntrDimCount = i
        if ((tempOrgVarSum > (sumOrgVar * .75)) and (orgIntrDimCount == 0)):
            orgIntrDimCount = i

    #calculate loading factors
    pcaStratNew = decomposition.PCA(n_components=stratIntrDimCount)
    pcaRandNew = decomposition.PCA(n_components=randIntrDimCount)
    pcaOrgNew = decomposition.PCA(n_components=orgIntrDimCount)

    sklearn_pcaStrat = pcaStratNew.fit_transform(stratData_std)
    sklearn_pcaRand = pcaRandNew.fit_transform(randData_std)
    sklearn_pcaOrg = pcaOrgNew.fit_transform(orgData_std)

    stratLoadFact = pcaStratNew.components_.T * np.sqrt(
        pcaStratNew.explained_variance_)
    randLoadFact = pcaRandNew.components_.T * np.sqrt(
        pcaRandNew.explained_variance_)
    orgLoadFact = pcaOrgNew.components_.T * np.sqrt(
        pcaOrgNew.explained_variance_)

    stratSumOfSquaredLoad = [[0 for i in range(0, 2)] for j in range(0, 10)]
    randSumOfSquaredLoad = [[0 for i in range(0, 2)] for j in range(0, 10)]
    orgSumOfSquaredLoad = [[0 for i in range(0, 2)] for j in range(0, 10)]

    #get attributes with highest PCA loading
    # index 0 accumulates the squared loadings; index 1 stores the attribute index
    for i in range(0, 10):
        for j in range(0, stratIntrDimCount):
            stratSumOfSquaredLoad[i][0] = stratSumOfSquaredLoad[i][0] + (
                stratLoadFact[i][j])**2
            stratSumOfSquaredLoad[i][1] = i

    for i in range(0, 10):
        for j in range(0, randIntrDimCount):
            randSumOfSquaredLoad[i][0] = randSumOfSquaredLoad[i][0] + (
                randLoadFact[i][j])**2
            randSumOfSquaredLoad[i][1] = i

    for i in range(0, 10):
        for j in range(0, orgIntrDimCount):
            orgSumOfSquaredLoad[i][0] = orgSumOfSquaredLoad[i][0] + (
                orgLoadFact[i][j])**2
            orgSumOfSquaredLoad[i][1] = i

    #I sort the arrays
    stratSumOfSquaredLoad.sort(key=lambda x: x[0])
    randSumOfSquaredLoad.sort(key=lambda x: x[0])
    orgSumOfSquaredLoad.sort(key=lambda x: x[0])

    #I get the highest 3 attributes
    stratThreeHighAttr = np.array(stratSumOfSquaredLoad[-3:])
    randThreeHighAttr = np.array(randSumOfSquaredLoad[-3:])
    orgThreeHighAttr = np.array(orgSumOfSquaredLoad[-3:])

    stratThreeHighAttrData = [[0 for i in range(0, 3)] for j in range(0, 250)]
    randThreeHighAttrData = [[0 for i in range(0, 3)] for j in range(0, 250)]
    orgThreeHighAttrData = [[0 for i in range(0, 3)] for j in range(0, 999)]

    #I get the data associated with the three highest attributes
    for j in range(0, 250):
        for i in range(0, 3):
            stratThreeHighAttrData[j][i] = stratData_std[j][int(
                stratThreeHighAttr[i][1])]
            randThreeHighAttrData[j][i] = randData_std[j][int(
                randThreeHighAttr[i][1])]

    for j in range(0, 999):
        for i in range(0, 3):
            orgThreeHighAttrData[j][i] = orgData_std[j][int(
                orgThreeHighAttr[i][1])]

    #names of the three highest attributes
    stratColumns = [None] * 3
    randColumns = [None] * 3
    orgColumns = [None] * 3

    for i in range(0, 3):
        stratColumns[i] = (attributeNames[int(stratThreeHighAttr[i][1])])
        randColumns[i] = (attributeNames[int(randThreeHighAttr[i][1])])
        orgColumns[i] = (attributeNames[int(orgThreeHighAttr[i][1])])

    strat3Data = pd.DataFrame(data=stratThreeHighAttrData,
                              columns=stratColumns)
    rand3Data = pd.DataFrame(data=randThreeHighAttrData, columns=randColumns)
    org3Data = pd.DataFrame(data=orgThreeHighAttrData, columns=orgColumns)

    targetForStrat2 = pd.DataFrame(data=targetForStrat, columns=['Target'])
    targetForRand2 = pd.DataFrame(data=targetForRand, columns=['Target'])
    targetForOrg2 = pd.DataFrame(data=targetForOrg, columns=['Target'])

    #create the array with data points for 3 attr and cluster associated with that
    strat3DataFinal = pd.concat([strat3Data, targetForStrat2[['Target']]],
                                axis=1)
    rand3DataFinal = pd.concat([rand3Data, targetForRand2[['Target']]], axis=1)
    org3DataFinal = pd.concat([org3Data, targetForOrg2[['Target']]], axis=1)

    #create an array with coordinates for 3 attr scatter plot
    bigStrat3Array = [[0 for i in range(0, 9)] for j in range(0, 250)]
    bigRand3Array = [[0 for i in range(0, 9)] for j in range(0, 250)]
    bigOrg3Array = [[0 for i in range(0, 9)] for j in range(0, 999)]

    for m in range(0, 250):
        count = 0
        for j in range(0, 3):
            for i in range(0, 3):
                bigStrat3Array[m][count] = ([
                    strat3DataFinal.values[m][i], strat3DataFinal.values[m][j]
                ])
                bigRand3Array[m][count] = ([
                    rand3DataFinal.values[m][i], rand3DataFinal.values[m][j]
                ])
                count = count + 1

    for m in range(0, 999):
        count = 0
        for j in range(0, 3):
            for i in range(0, 3):
                bigOrg3Array[m][count] = ([
                    org3DataFinal.values[m][i], org3DataFinal.values[m][j]
                ])
                count = count + 1

    #project onto the top two principal components for visualization
    pcaVisStrat = decomposition.PCA(n_components=2)
    pcaVisRand = decomposition.PCA(n_components=2)
    pcaVisOrg = decomposition.PCA(n_components=2)

    principalDFStrat = pd.DataFrame(
        data=pcaVisStrat.fit_transform(stratData_std),
        columns=['Principal Component 1', 'Principal Component 2'])
    principalDFRand = pd.DataFrame(
        data=pcaVisRand.fit_transform(randData_std),
        columns=['Principal Component 1', 'Principal Component 2'])
    principalDFOrg = pd.DataFrame(
        data=pcaVisOrg.fit_transform(orgData_std),
        columns=['Principal Component 1', 'Principal Component 2'])

    #the last column holds the cluster associated with each data point

    finalDFStrat = pd.concat([principalDFStrat, targetForStrat2[['Target']]],
                             axis=1)
    finalDFRand = pd.concat([principalDFRand, targetForRand2[['Target']]],
                            axis=1)
    finalDFOrg = pd.concat([principalDFOrg, targetForOrg2[['Target']]], axis=1)

    #mds
    mds_dataStrat = manifold.MDS(n_components=2, dissimilarity='precomputed')
    mds_dataRand = manifold.MDS(n_components=2, dissimilarity='precomputed')
    mds_dataOrg = manifold.MDS(n_components=2, dissimilarity='precomputed')

    #mds with euclidean
    stratSimEuc = pairwise_distances(stratData_std, metric='euclidean')
    randSimEuc = pairwise_distances(randData_std, metric='euclidean')
    orgSimEuc = pairwise_distances(orgData_std, metric='euclidean')

    stratDEuc = mds_dataStrat.fit_transform(stratSimEuc)
    randDEuc = mds_dataRand.fit_transform(randSimEuc)
    orgDEuc = mds_dataOrg.fit_transform(orgSimEuc)

    stratMDSdatEuc = pd.DataFrame(stratDEuc)
    randMDSdatEuc = pd.DataFrame(randDEuc)
    orgMDSdatEuc = pd.DataFrame(orgDEuc)

    finalMDSStratDataEuc = pd.concat(
        [stratMDSdatEuc, targetForStrat2[['Target']]], axis=1)
    finalMDSRandDataEuc = pd.concat(
        [randMDSdatEuc, targetForRand2[['Target']]], axis=1)
    finalMDSOrgDataEuc = pd.concat([orgMDSdatEuc, targetForOrg2[['Target']]],
                                   axis=1)

    #mds with correlation distance (1 - Pearson correlation between rows)
    stratSimCor = pairwise_distances(stratData_std, metric='correlation')
    randSimCor = pairwise_distances(randData_std, metric='correlation')
    orgSimCor = pairwise_distances(orgData_std, metric='correlation')

    stratDCor = mds_dataStrat.fit_transform(stratSimCor)
    randDCor = mds_dataRand.fit_transform(randSimCor)
    orgDCor = mds_dataOrg.fit_transform(orgSimCor)

    stratMDSdatCor = pd.DataFrame(stratDCor)
    randMDSdatCor = pd.DataFrame(randDCor)
    orgMDSdatCor = pd.DataFrame(orgDCor)

    finalMDSStratDataCor = pd.concat(
        [stratMDSdatCor, targetForStrat2[['Target']]], axis=1)
    finalMDSRandDataCor = pd.concat(
        [randMDSdatCor, targetForRand2[['Target']]], axis=1)
    finalMDSOrgDataCor = pd.concat([orgMDSdatCor, targetForOrg2[['Target']]],
                                   axis=1)

    #json data --> to export to front end
    data = {}
    # data["randData"] = randomData.tolist()
    # data["stratData"] = stratifiedData.tolist()
    # data["originalData"] = anotherX.tolist()

    data['stratEigVal'] = stratEigVal.tolist()
    data['randEigVal'] = randEigVal.tolist()
    data['orgEigVal'] = orgEigVal.tolist()

    data['stratLoadFact'] = stratLoadFact.tolist()
    data['randLoadFact'] = randLoadFact.tolist()
    data['orgLoadFact'] = orgLoadFact.tolist()

    data['stratSigNum'] = stratIntrDimCount
    data['randSigNum'] = randIntrDimCount
    data['orgSigNum'] = orgIntrDimCount

    data['sumOfStratEig'] = contSumOfStratEig
    data['sumOfRandEig'] = contSumOfRandEig
    data['sumOfOrgEig'] = contSumOfOrgEig

    # data['strat3HighAttr'] = stratThreeHighAttr.tolist()
    # data['rand3HighAttr'] = randThreeHighAttr.tolist()
    # data['org3HighAttr'] = orgThreeHighAttr.tolist()

    data['pca2StratValues'] = np.array(finalDFStrat).tolist()
    data['pca2RandValues'] = np.array(finalDFRand).tolist()
    data['pca2OrgValues'] = np.array(finalDFOrg).tolist()

    data['stratMDSDataEuc'] = np.array(finalMDSStratDataEuc).tolist()
    data['randMDSDataEuc'] = np.array(finalMDSRandDataEuc).tolist()
    data['orgMDSDataEuc'] = np.array(finalMDSOrgDataEuc).tolist()

    data['stratMDSDataCor'] = np.array(finalMDSStratDataCor).tolist()
    data['randMDSDataCor'] = np.array(finalMDSRandDataCor).tolist()
    data['orgMDSDataCor'] = np.array(finalMDSOrgDataCor).tolist()

    data['strat3LoadData'] = np.array(strat3DataFinal).tolist()
    data['rand3LoadData'] = np.array(rand3DataFinal).tolist()
    data['org3LoadData'] = np.array(org3DataFinal).tolist()

    data['strat3AttrNames'] = stratColumns
    data['rand3AttrNames'] = randColumns
    data['org3AttrNames'] = orgColumns

    data['bigStrat3Array'] = bigStrat3Array
    data['bigRand3Array'] = bigRand3Array
    data['bigOrg3Array'] = bigOrg3Array

    data['lat'] = latitude
    data['long'] = longitude
    data['stratLat'] = stratLat
    data['stratLong'] = stratLong

    data['numberInEachState'] = numberInEachState
    data['origData'] = anotherX.tolist()

    data['avArray'] = avArray

    json_data = json.dumps(data)

    return json_data
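
The attribute-ranking step above (squared loadings summed over the significant components) can be written more compactly with vectorized NumPy. A minimal sketch, assuming a data matrix X and a significant-component count k; top_attributes_by_loading and its defaults are illustrative, not part of the original code:

import numpy as np
from sklearn.decomposition import PCA

def top_attributes_by_loading(X, k, n_top=3):
    pca = PCA().fit(X)
    #loading factors: components scaled by the standard deviation they explain
    loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
    #score each attribute by its squared loadings on the first k components
    scores = (loadings[:, :k] ** 2).sum(axis=1)
    #indices of the n_top highest-scoring attributes, best first
    return np.argsort(scores)[::-1][:n_top]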
Exemplo n.º 25
0
 def features(self, sampleRate=0.05):
     path = self.path
     print("Importing Geographic Dataset")
     readGeoData = self._loadData(path,self.sample)
     
     userCoordinates = readGeoData[["id","latitude","longitude"]]
     del(readGeoData)
     
     print("Sanitizing Data")
     latitudeFilter = [self._isSanitized(lat) for lat in userCoordinates.latitude]
     longitudeFilter = [self._isSanitized(lon) for lon in userCoordinates.longitude]
     finalFilter = [lat and lon for lat, lon in zip(latitudeFilter, longitudeFilter)]
     sanitizedData = userCoordinates[finalFilter]
     
     print("Collecting Random Subsample")
     userCoordinateSample = sanitizedData.sample(int(len(sanitizedData)*sampleRate))
     userCoordinateSample = userCoordinateSample[["latitude","longitude"]]
     
     print("Generating Geographic Distance Matrix")
     #NumPy removed the np.float alias; use the builtin float
     transformedCoordinates = np.array(userCoordinateSample).astype(float)
     
     #geopy's geodesic expects (lat, lon) in degrees, so pass them unconverted
     geoDistanceMatrix = pdist(
         transformedCoordinates,
         lambda a, b: geodesic((a[0], a[1]), (b[0], b[1])).meters)
     del(transformedCoordinates)
     
     print("Executing Multidimensional Scaling Procedure")
     reshapedGeoDistMatrix = squareform(geoDistanceMatrix)
     del(geoDistanceMatrix)
     seed = np.random.RandomState(seed=3)
     
     mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=seed,
                        dissimilarity="precomputed", n_jobs=1)
     fittedMds = mds.fit(reshapedGeoDistMatrix)
     del(reshapedGeoDistMatrix)
     self.stress = fittedMds.stress_
     pos = fittedMds.embedding_
     
     print("Initiating Embedding Estimation for Entire Database")
     
     dataset = pd.DataFrame({
         "latitude": userCoordinateSample["latitude"],
         "longitude": userCoordinateSample["longitude"],
         "Y1": [element[0] for element in pos.tolist()],
         "Y2": [element[1] for element in pos.tolist()]
     })
     
     print("Training Model")
     training, validation, test = self._train_validate_test_split(dataset, train_percent=0.70, validate_percent=0.15)
     
     trainX = training[["latitude","longitude"]]
     trainY1 = training["Y1"]
     trainY2 = training["Y2"]
     
     validationX = validation[["latitude","longitude"]]
     validationY1 = validation["Y1"]
     validationY2 = validation["Y2"]
     
     testX = test[["latitude","longitude"]]
     testY1 = test["Y1"]
     testY2 = test["Y2"]
     
     print("Validating Model")
     n_neighbors = 5
     knn11 = neighbors.KNeighborsRegressor(n_neighbors, weights="distance")
     knn12 = neighbors.KNeighborsRegressor(n_neighbors, weights="distance")
     
     n_neighbors = 7
     knn21 = neighbors.KNeighborsRegressor(n_neighbors, weights="distance")
     knn22 = neighbors.KNeighborsRegressor(n_neighbors, weights="distance")
     
     n_neighbors = 11
     knn31 = neighbors.KNeighborsRegressor(n_neighbors, weights="distance")
     knn32 = neighbors.KNeighborsRegressor(n_neighbors, weights="distance")
     
     #Validation
     validationPredictedY11 = knn11.fit(trainX, trainY1).predict(validationX)
     validationPredictedY12 = knn12.fit(trainX, trainY2).predict(validationX)
     validationPredictedY21 = knn21.fit(trainX, trainY1).predict(validationX)
     validationPredictedY22 = knn22.fit(trainX, trainY2).predict(validationX)
     validationPredictedY31 = knn31.fit(trainX, trainY1).predict(validationX)
     validationPredictedY32 = knn32.fit(trainX, trainY2).predict(validationX)
     
     rSquared11 = self._rSquared(validationPredictedY11,validationY1)
     rSquared12 = self._rSquared(validationPredictedY12,validationY2)
     rSquared1 = np.mean([rSquared11, rSquared12])
     
     rSquared21 = self._rSquared(validationPredictedY21,validationY1)
     rSquared22 = self._rSquared(validationPredictedY22,validationY2)
     rSquared2 = np.mean([rSquared21, rSquared22])
     
     rSquared31 = self._rSquared(validationPredictedY31,validationY1)
     rSquared32 = self._rSquared(validationPredictedY32,validationY2)
     rSquared3 = np.mean([rSquared31, rSquared32])
     
     bestRSquared = max([rSquared1, rSquared2, rSquared3])
     if rSquared1 == bestRSquared:
         knn1 = knn11
         knn2 = knn12
         print("Best K=5")
         #print("Best R-squared: "+str(rSquared1))
     elif rSquared2 == bestRSquared:
         knn1 = knn21
         knn2 = knn22
         print("Best K=7")
         #print("Best R-squared: "+str(rSquared2))
     else:
         knn1 = knn31
         knn2 = knn32
         print("Best K=11")
         #print("Best R-squared: "+str(rSquared3))
     
     del(validationPredictedY11)
     del(validationPredictedY12)
     del(validationPredictedY21)
     del(validationPredictedY22)
     del(validationPredictedY31)
     del(validationPredictedY32)
     
     print("Testing Model")
     #Test
     testPredictedY1 = knn1.fit(trainX, trainY1).predict(testX)
     testPredictedY2 = knn2.fit(trainX, trainY2).predict(testX)
     finalRSquared1 = self._rSquared(testPredictedY1,testY1)
     finalRSquared2 = self._rSquared(testPredictedY2,testY2)
     finalRSquared = np.mean([finalRSquared1, finalRSquared2])
     print("Final R-Squared: "+str(finalRSquared))
     
     del(testPredictedY1)
     del(testPredictedY2)
     
     print("Deploying Model")
     #Deployment
     finalModel1 = knn1.fit(trainX, trainY1)
     finalModel2 = knn2.fit(trainX, trainY2)
     finalPos1 = finalModel1.predict(sanitizedData[['latitude','longitude']])
     finalPos2 = finalModel2.predict(sanitizedData[['latitude','longitude']])
     
     print("Normalizing Position Vectors")
     normalizedPos1 = (finalPos1-min(finalPos1))/(max(finalPos1)-min(finalPos1))
     normalizedPos2 = (finalPos2-min(finalPos2))/(max(finalPos2)-min(finalPos2))
     
     normalizedIdPos = pd.DataFrame({
         "id": [str(row) for row in sanitizedData['id']],
         "x": normalizedPos1,
         "y": normalizedPos2
     })

     #DataFrame.to_sparse() was removed in pandas 1.0; return the frame directly
     return normalizedIdPos
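
The pattern above (exact MDS on a small subsample, then a regressor to place every remaining point) keeps the O(n^2) distance matrix tractable. A minimal self-contained sketch of the same idea, assuming plain Euclidean distances in place of the geodesic metric used above; approximate_mds and its parameters are illustrative:

import numpy as np
from sklearn import manifold, neighbors
from sklearn.metrics import pairwise_distances

def approximate_mds(X, sample_size=500, n_neighbors=5, random_state=3):
    rng = np.random.RandomState(random_state)
    idx = rng.choice(len(X), size=min(sample_size, len(X)), replace=False)
    sample = X[idx]
    #exact MDS only on the subsample's distance matrix
    dists = pairwise_distances(sample)
    mds = manifold.MDS(n_components=2, dissimilarity="precomputed",
                       random_state=random_state)
    pos = mds.fit_transform(dists)
    #learn the mapping raw coordinates -> embedding, then apply it everywhere
    knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors,
                                        weights="distance")
    knn.fit(sample, pos)
    return knn.predict(X)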
Exemplo n.º 26
0
File: tSNE_2.py Project: mlnn/tSNE
t0 = time()
trans_data = manifold.Isomap(n_neighbors=n_neighbors, n_components=2)\
    .fit_transform(sphere_data).T
t1 = time()
print("%s: %.2g sec" % ('ISO', t1 - t0))

ax = fig.add_subplot(257)
plt.scatter(trans_data[0], trans_data[1], c=colors, cmap=plt.cm.rainbow)
plt.title("%s (%.2g sec)" % ('Isomap', t1 - t0))
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
plt.axis('tight')

# Perform Multi-dimensional scaling.
t0 = time()
mds = manifold.MDS(n_components=2, max_iter=100, n_init=1)
trans_data = mds.fit_transform(sphere_data).T
t1 = time()
print("MDS: %.2g sec" % (t1 - t0))

ax = fig.add_subplot(258)
plt.scatter(trans_data[0], trans_data[1], c=colors, cmap=plt.cm.rainbow)
plt.title("MDS (%.2g sec)" % (t1 - t0))
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
plt.axis('tight')

# Perform Spectral Embedding.
t0 = time()
se = manifold.SpectralEmbedding(n_components=2,
                                n_neighbors=n_neighbors)
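
The fragment breaks off here; following the pattern of the Isomap and MDS panels above, it would presumably continue by fitting and plotting the spectral embedding:

t0 = time()
trans_data = se.fit_transform(sphere_data).T
t1 = time()
print("Spectral Embedding: %.2g sec" % (t1 - t0))

ax = fig.add_subplot(259)
plt.scatter(trans_data[0], trans_data[1], c=colors, cmap=plt.cm.rainbow)
plt.title("Spectral Embedding (%.2g sec)" % (t1 - t0))
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
plt.axis('tight')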
Exemplo n.º 27
0
# LTSA embedding of the digits dataset
print("Computing LTSA embedding")
clf = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                      n_components=2,
                                      method='ltsa')
t0 = time()
X_ltsa = clf.fit_transform(X)
print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
plot_embedding(
    X_ltsa,
    "Local Tangent Space Alignment of the digits (time %.2fs)" % (time() - t0))

#----------------------------------------------------------------------
# MDS  embedding of the digits dataset
print("Computing MDS embedding")
clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
t0 = time()
X_mds = clf.fit_transform(X)
print("Done. Stress: %f" % clf.stress_)
plot_embedding(X_mds,
               "MDS embedding of the digits (time %.2fs)" % (time() - t0))

#----------------------------------------------------------------------
# Random Trees embedding of the digits dataset
print("Computing Totally Random Trees embedding")
hasher = ensemble.RandomTreesEmbedding(n_estimators=200,
                                       random_state=0,
                                       max_depth=5)
t0 = time()
X_transformed = hasher.fit_transform(X)
# RandomizedPCA was removed from scikit-learn; TruncatedSVD plays the same
# role here and accepts the hasher's sparse output
pca = decomposition.TruncatedSVD(n_components=2)
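
The example is cut off at this point; presumably it continues like the earlier embeddings, reducing the sparse tree embedding to 2-D and plotting it:

t0 = time()
X_reduced = pca.fit_transform(X_transformed)
plot_embedding(
    X_reduced,
    "Random forest embedding of the digits (time %.2fs)" % (time() - t0))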
Exemplo n.º 28
0
    def layout(self, layoutType='graph'):

        ###### domain information is needed for the DR-based layouts below
        ###### (nodeIndicesList and recomputePCA are assumed to be set elsewhere)
        domain = self.f[:, :-1]

        if layoutType == 'graph':
            # g = nx.Graph()
            g = Graph('ER', engine='neato')

            i = 0
            index_map = dict()
            inverse_index_map = dict()
            # for ext in self.seg.keys():
            for s in self.graph.keys():

                for ext in self.graph[s]:
                    index_map[ext] = i
                    inverse_index_map[i] = ext

                    if ext in self.previousPos.keys():
                        ppos = self.previousPos[ext]
                        # print 'previous position is provided for:', ext, ppos[0], ppos[1]
                        g.node('%d' % ext, pos="%f,%f" % (ppos[0], ppos[1]))
                    else:
                        # print "\n\n No previousPos provided for:", ext, '\n\n'
                        g.node('%d' % ext)

                    i += 1

            for s in self.graph.keys():
                index_map[s] = i
                inverse_index_map[i] = s
                if s in self.previousPos.keys():
                    ppos = self.previousPos[s]
                    # print 'previous position is provided for:', s, ppos[0], ppos[1]
                    g.node('%d' % s, pos="%f,%f" % (ppos[0], ppos[1]))
                else:
                    # print "\n\n No previousPos provided for:", ext, '\n\n'
                    g.node('%d' % s)
                i += 1

            for s in self.graph.keys():
                for ext in self.graph[s]:

                    g.edge(str(s),
                           str(ext),
                           len=str(linalg.norm(self.loc[s] - self.loc[ext])))
                    # g.add_edge(index_map[s], index_map[ext], weight=linalg.norm(f[s,:-1] - f[ext,:-1]))

            self.currentNodeSize = len(index_map)

            if self.currentNodeSize == self.previousNodeSize:
                return

            self.previousNodeSize = self.currentNodeSize
            ##print "============== Render Spine =============="

            g.format = 'plain'
            path = g.render('spine')

            #parse graphviz plain output: node lines are "node name x y ..."
            with open(path) as plain_file:
                for l in plain_file:
                    l = l.split()

                    if l and l[0] == 'node':
                        self.pos[int(l[1])] = [float(l[2]), float(l[3])]
                        #give position to all children
                        #only if l[1] is an extremum
                        #init the position of all the points to the extrema pos
                        ## FIXME update children extrema to the parent location
                        # print("extremaSet = ", self.extremaSet)
                        #if int(l[1]) in self.extremaSet:
                        #     self.pos.update(dict.fromkeys(self.extremaSet[int(l[1])], [float(l[2]),float(l[3])]))

        elif layoutType == 'PCA':
            if recomputePCA:
                self.pca = decomposition.PCA(n_components=2)
                self.pca.fit(domain)
            pos2D = self.pca.transform(domain).tolist()

            for index, val in enumerate(pos2D):
                self.pos[nodeIndicesList[index]] = val

        elif layoutType == 'MDS':
            mds = manifold.MDS(n_components=2, max_iter=100, n_init=1)
            pos2D = mds.fit_transform(domain).tolist()

            for index, val in enumerate(pos2D):
                self.pos[nodeIndicesList[index]] = val

        elif layoutType == 'tSNE':
            tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
            pos2D = tsne.fit_transform(domain).tolist()
            for index, val in enumerate(pos2D):
                self.pos[nodeIndicesList[index]] = val

        else:  #PCA is the default
            pca = decomposition.PCA(n_components=2)
            pos2D = pca.fit_transform(domain).tolist()
            for index, val in enumerate(pos2D):
                self.pos[nodeIndicesList[index]] = val

        #store position
        # print self.pos
        # print "compare self.pos vs self.previousPos"
        # for p in self.previousPos.keys():
        # if self.previousPos[p] != self.pos[p]:
        # print p, self.previousPos[p], self.pos[p]

        self.previousPos = self.pos

        #if only one point was laid out, clear the position cache
        if len(self.graph.keys()) <= 1:
            self.previousPos = dict()
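
For the 'graph' branch above, the neato round trip is the core trick: write the graph with desired edge lengths, render it to graphviz's plain text format, and read node positions back. A minimal standalone sketch (hypothetical two-node graph; the names are illustrative):

from graphviz import Graph

g = Graph('demo', engine='neato')
g.node('0', pos="0.0,0.0")
g.node('1')
g.edge('0', '1', len='2.5')  #desired edge length for neato

g.format = 'plain'
path = g.render('spine_demo')  #writes spine_demo.plain

pos = {}
with open(path) as plain_file:
    for line in plain_file:
        parts = line.split()
        #plain format: "node name x y width height label ..."
        if parts and parts[0] == 'node':
            pos[int(parts[1])] = [float(parts[2]), float(parts[3])]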
distance_matrix = np.array([
    [0,    587, 1212,  701, 1936,  604,  748, 2139, 2182,  543],
    [587,    0,  920,  940, 1745, 1188,  713, 1858, 1737,  597],
    [1212,  920,    0,  878,  831, 1726, 1631,  949, 1021, 1494],
    [701,  940,  878,    0, 1374,  968, 1420, 1645, 1891, 1220],
    [1936, 1745,  831, 1374,    0, 2339, 2451,  347,  959, 2300],
    [604, 1188, 1726,  968, 2339,    0, 1092, 2594, 2734,  923],
    [748,  713, 1631, 1420, 2451, 1092,    0, 2571, 2408,  205],
    [2139, 1858,  949, 1645,  347, 2594, 2571,    0,  678, 2442],
    [2182, 1737, 1021, 1891,  959, 2734, 2408,  678,    0, 2329],
    [543,  597, 1494, 1220, 2300,  923,  205, 2442, 2329,    0]])

# check to see that the distance structure has been entered correctly
print(distance_matrix)
print(type(distance_matrix))

# apply the multidimensional scaling algorithm and plot the map
mds_method = manifold.MDS(n_components = 2, random_state = 9999,\
    dissimilarity = 'precomputed')
mds_coordinates = mds_method.fit_transform(distance_matrix)

city_label = [
    'Atlanta', 'Chicago', 'Denver', 'Houston', 'Los Angeles', 'Miami',
    'New York', 'San Francisco', 'Seattle', 'Washington D.C.'
]

# plot mds solution in two dimensions using city labels
# defined by multidimensional scaling
plt.figure()
plt.scatter(mds_coordinates[:,0],mds_coordinates[:,1],\
    facecolors = 'none', edgecolors = 'none')  # points in white (invisible)
labels = city_label
for label, x, y in zip(labels, mds_coordinates[:, 0], mds_coordinates[:, 1]):
    plt.annotate(label, (x, y))  #draw each city name at its MDS position
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean, pdist, squareform
from matplotlib import pyplot as plt
from sklearn import manifold

data = pd.read_excel("matrix.xlsx")


def similarity_func(u, v):
    return 1 / (1 + euclidean(u, v))


sims = squareform(pdist(data[data.columns[1:]], similarity_func))
np.fill_diagonal(sims, 1)  #squareform leaves zeros on the diagonal; self-similarity is 1

#MDS with dissimilarity="precomputed" expects dissimilarities, not
#similarities, so convert (identical rows then get distance 0)
dissimilarities = pd.DataFrame(1 - sims)

mds = manifold.MDS(n_components=2,
                   max_iter=200,
                   eps=1e-9,
                   dissimilarity="precomputed",
                   n_jobs=1)
pos = mds.fit(dissimilarities).embedding_

plt.scatter(pos[:, 0], pos[:, 1], color='turquoise', s=111, lw=0, label='MDS')
plt.savefig('plot.png')
plt.close()
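
A quick sanity check for a layout like this is the fitted stress, which the earlier examples also print; lower stress means the 2-D positions reproduce the input dissimilarities more faithfully:

print("MDS stress: %f" % mds.stress_)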