Example #1
from sklearn.manifold import MDS


def mds(data, n_components=300):
    embedding = MDS(n_components=n_components)
    new_data = embedding.fit_transform(data)
    return new_data
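
A minimal usage sketch (not part of the original snippet), assuming NumPy and scikit-learn are available; the input is hypothetical random data, and the 300-component default only makes sense for high-dimensional input:

import numpy as np

rng = np.random.RandomState(0)
data = rng.rand(120, 400)  # hypothetical (n_samples, n_features) input
embedded = mds(data)
print(embedded.shape)  # (120, 300)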
Example #2
from applied_machine_learning.fundamentals_of_machine_learning.adspy_shared_utilities import plot_labelled_scatter
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import MDS
import pandas as pd
import matplotlib.pyplot as plt

fruits = pd.read_table('../resources/fruit_data_with_colors.txt')
feature_names_fruits = ['height', 'width', 'mass', 'color_score']
target_fruit_names = ['apple', 'mandarin', 'orange', 'lemon']
X_fruits = fruits[feature_names_fruits]
y_fruits = fruits['fruit_label']

# each feature should be centered (zero mean) and with unit variance
X_fruits_normalized = StandardScaler().fit(X_fruits).transform(X_fruits)

mds = MDS(n_components=2)

X_fruits_mds = mds.fit_transform(X_fruits_normalized)

plot_labelled_scatter(X_fruits_mds, y_fruits,
                      ['apple', 'mandarin', 'orange', 'lemon'])
plt.xlabel('First MDS feature')
plt.ylabel('Second MDS feature')
plt.title('Fruit sample dataset MDS')
Example #3
import numpy as np
from sklearn.manifold import MDS


def mds(k, X):
    MDSmodel = MDS(n_components=k)
    mdsresult = MDSmodel.fit_transform(X)
    np.savetxt("MDS_out.csv", mdsresult, delimiter=",")
    return None
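
A one-line usage sketch (hypothetical random input, not part of the original): this fits a 2-D embedding and writes it to MDS_out.csv in the working directory.

mds(2, np.random.rand(50, 8))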
Example #4
    def vizualize2d(self, n_frac=0.01, b_annotations=False):
        n_components = 2
        env = Environment()
        c = OpenCorpus()
        di_g = c.grammemes(mode=1)
        data = self.tokenz().sample(frac=n_frac)

        data = data.fillna(0)
        #print(data['idgram'].shape)
        #print(data.index.shape)
        tdf = pd.DataFrame(index=data.index)
        tdf['idgram'] = data['idgram']
        tdf['gram'] = data['gram']
        tdf['word'] = data['word']
        #print(tdf)

        drop_columns = [
            'word', 'gram', 's_suffix2', 's_suffix3', 's_prefix2', 's_prefix3',
            'n_token'
        ]  # , 'bgm_l_None'
        # drop_columns.extend(['bgm_l_%s' % (i) for i in range(1, env.bgm_columns_max()) if 'bgm_l_%s' % (i) not in bgm_columns])
        env.debug(
            1,
            ['POStagger', 'visualize2D',
             'Drop columns: %s' % (drop_columns)])
        data = data.drop(columns=drop_columns, axis=1)
        values = data.values
        X = values[:, 1:]
        y = values[:, 0]
        #print(data.head,X, y)
        #return 0

        #Scalers
        sc = StandardScaler()
        min_max_scaler = preprocessing.MinMaxScaler()
        max_abs_scaler = preprocessing.MaxAbsScaler()
        #X = sc.fit_transform(X)

        #PCA
        b_pca = False
        b_sne = True
        if b_pca:
            model = PCA(n_components=n_components)
        if b_sne:
            model = MDS(n_components=n_components)  # MDS used here; TSNE is the alternative
        X_new = model.fit_transform(X, y)
        if b_pca:
            print('PCA ratio', n_components, 'components',
                  model.explained_variance_ratio_)
        #X_new = sc.fit_transform(X_new)
        #X_new = preprocessing.scale(X_new)
        if b_pca:
            X_new = max_abs_scaler.fit_transform(X_new)
        #return 0

        #tdf = pd.DataFrame(data=X_new, columns=['PC1', 'PC2'], index=data.index)
        tdf['PC1'] = X_new[:, 0]
        tdf['PC2'] = X_new[:, 1]
        #finalDf = pd.concat([tdf, data[['idgram']]], axis=1)
        df_groups = tdf.groupby('idgram').count()
        #print(df_groups)
        #return 0
        tdf['counts'] = 0
        for index, serie in tdf.iterrows():
            n_idgram = tdf.at[index, 'idgram']
            tdf.at[index, 'counts'] = df_groups.loc[n_idgram, 'gram']
        tdf = tdf.sort_values(by=['counts'], ascending=False)
        #print(tdf)

        #Draw
        i = 0
        N = df_groups.shape[0]
        s_title = ''
        if b_pca:
            s_title = '2 component PCA. Accuracy %s' % (round(
                sum(float(i) for i in model.explained_variance_ratio_), 2))
        if b_sne:
            s_title = 'MDS'  # the b_sne branch actually fits MDS, not t-SNE

        #Plotly
        if False:  #Plotly
            py.sign_in('shashmaxus', 'AdfwTulrOoV3cSlbZT3B')
            c = [
                'hsl(' + str(h) + ',50%' + ',50%)'
                for h in np.linspace(0, 360, N)
            ]
            data_trace = []
            for index, row in df_groups.iterrows():
                #print(index)
                df_trace = tdf[tdf['idgram'] == index]
                #print(df_trace)
                g_trace = go.Scatter(
                    x=df_trace['PC1'].values,
                    y=df_trace['PC2'].values,
                    name=df_trace['gram'].values[0],
                    mode='markers',  #'markers+text'
                    marker=dict(
                        size=8,
                        color=i,  #c[i]
                        opacity=0.8,
                        colorscale='Viridis'),
                    text=df_trace['word'],
                    textfont=dict(family='sans serif', size=12))
                data_trace.append(g_trace)
                i += 1
            layout = go.Layout(
                title=s_title,
                xaxis=dict(
                    title=('Component 1. Contribution %s' %
                           (round(model.explained_variance_ratio_[0], 2)))),
                yaxis=dict(
                    title=('Component 2. Contribution %s' %
                           (round(model.explained_variance_ratio_[1], 2)))))
            fig2 = go.Figure(data=data_trace, layout=layout)
            py.image.save_as(fig2,
                             filename='c:/prj/mlivos_data/temp/Words2.png')

        #Bokeh
        if True:
            palette = d3['Category20'][len(tdf['gram'].unique())]
            #palette = all_palettes['Category20'][len(tdf['gram'].unique())]
            #palette = Viridis256[len(tdf['gram'].unique())]
            #palette = Viridis256
            color_map = CategoricalColorMapper(factors=tdf['gram'].unique(),
                                               palette=palette)
            #print(mapper)
            fig = figure(title=s_title, toolbar_location=None)
            source = ColumnDataSource(tdf[['gram', 'PC1', 'PC2']])
            fig.scatter(x='PC1',
                        y='PC2',
                        size=12,
                        color={
                            'field': 'gram',
                            'transform': color_map
                        },
                        legend_field='gram',
                        source=source)
            show(fig)
            export_png(fig, filename="c:/prj/mlivos_data/temp/PCA.png")
        return 0
Example #5
from sklearn.manifold import MDS


def mds(data, dim=3):
    embedding = MDS(n_components=dim)
    result = embedding.fit_transform(data)
    return result
Example #6
data = data.drop('Legendary', axis=1)

accuracy = []
X = data.iloc[:, :]
y = labels
X = np.array(X)
print("X shape: ", X.shape)

transformer = FactorAnalysis(n_components=2, random_state=0)
X_fa = transformer.fit_transform(X)

fig = plt.figure(figsize=(10, 10))
plt.scatter(X_fa[:, 0], X_fa[:, 1])
plt.show()

embedding = MDS(n_components=2)
X_mds = embedding.fit_transform(X)

fig = plt.figure(figsize=(10, 10))
plt.scatter(X_mds[:, 0], X_mds[:, 1])
plt.show()

x_train, x_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=5)
x_train_fa, x_test_fa, y_train_fa, y_test_fa = train_test_split(X_fa,
                                                                y,
                                                                test_size=0.33,
                                                                random_state=5)
x_train_mds, x_test_mds, y_train_mds, y_test_mds = train_test_split(
    X_mds, y, test_size=0.33, random_state=5)
Example #7
def rank_graph():
    MDS()
    # two components as we're plotting points in a two-dimensional plane
    # "precomputed" because we provide a distance matrix
    # we will also specify `random_state` so the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

    xs, ys = pos[:, 0], pos[:, 1]
    print()
    print()

    #set up colors per clusters using a dict
    cluster_colors = {
        0: '#1b9e77',
        1: '#d95f02',
        2: '#7570b3',
        3: '#e7298a',
        4: '#66a61e'
    }
    #set up cluster names using a dict
    cluster_names = {0: 'Top', 1: 'Bottom', 2: '3', 3: '4', 4: '5'}
    #some ipython magic to show the matplotlib plots inline
    get_ipython().run_line_magic('matplotlib', 'inline')
    #create data frame that has the result of the MDS plus the cluster numbers and titles
    df = pd.DataFrame(dict(x=xs, y=ys, title=titles, rank=ranks))
    #group by cluster
    groups = df.groupby('rank')
    # set up plot
    fig, ax = plt.subplots(figsize=(17, 9))  # set size
    ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling
    #iterate through groups to layer the plot
    #note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
    for name, group in groups:
        ax.plot(group.x,
                group.y,
                marker='o',
                linestyle='',
                ms=12,
                label=cluster_names[name],
                color=cluster_colors[name],
                mec='none')
        ax.set_aspect('auto')
        ax.tick_params(
            axis='x',  # changes apply to the x-axis
            which='both',  # both major and minor ticks are affected
            bottom=False,  # ticks along the bottom edge are off
            top=False,  # ticks along the top edge are off
            labelbottom=False)
        ax.tick_params(
            axis='y',  # changes apply to the y-axis
            which='both',  # both major and minor ticks are affected
            left=False,  # ticks along the left edge are off
            right=False,  # ticks along the right edge are off
            labelleft=False)

    ax.legend(numpoints=1)  #show legend with only 1 point

    #add label in x,y position with the label as the film title
    for i in range(len(df)):
        ax.text(df.loc[i]['x'], df.loc[i]['y'], df.loc[i]['title'], size=8)
    plt.show()  #show the plot
Example #8
                for i in range(np.sum(nbpc[:c1]), np.sum(nbpc[:c1 + 1])):
                    for j in range(np.sum(nbpc[:c2]), np.sum(nbpc[:c2 + 1])):
                        if rng.rand() <= P[c1, c2]:
                            C[i, j] = 1

    return C + C.T


n = 100
nc = 3
ratio = np.array([.5, .3, .2])
P = np.array(0.6 * np.eye(3) + 0.05 * np.ones((3, 3)))
C1 = get_sbm(n, nc, ratio, P)

# get 2d position for nodes
x1 = MDS(dissimilarity='precomputed', random_state=0).fit_transform(1 - C1)


def plot_graph(x, C, color='C0', s=None):
    for j in range(C.shape[0]):
        for i in range(j):
            if C[i, j] > 0:
                pl.plot([x[i, 0], x[j, 0]], [x[i, 1], x[j, 1]],
                        alpha=0.2,
                        color='k')
    pl.scatter(x[:, 0],
               x[:, 1],
               c=color,
               s=s,
               zorder=10,
               edgecolors='k')
Example #9
# Calculate JSD matrix without shock
ddmat = jsd_matrix(odat, "dayhour")

# Calculate JSD matrix with shock
dmat = jsd_matrix(dis, "dayhour")

# Check matrix
print(dmat)

np.save(
    '/home/server/pi/homes/woodilla/Projects/Anomalous-Detection-Browning-Simulation/data/jsd_mat.npy',
    dmat)

#------------------------------------------------------------------------
# Metric-MDS 5-dimensions without shock
nmds = MDS(n_components=5, metric=True, dissimilarity='precomputed')
nmds_dat = nmds.fit_transform(ddmat)

ndat = pd.DataFrame({"x": nmds_dat[:, 0], "y": nmds_dat[:, 1]})

ndat['x2'] = ndat['x'].shift(-1)
ndat['y2'] = ndat['y'].shift(-1)

ndat['distance'] = np.sqrt((ndat['x2'] - ndat['x'])**2 +
                           (ndat['y2'] - ndat['y'])**2)

# Calculate speed
ndat['speed'] = ndat['distance'] / 1

# Plot speed
plt = sns.scatterplot(x=range(len(ndat)), y=ndat['speed'], edgecolor="black")
Example #10
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.xticks(np.arange(-4, 6, 1))
plt.savefig("static/images/ScatterPlot_Stratified.png", dpi=300)

print "Plotting PCA Scatter Plot Random"
plt.figure(3, figsize=(12, 6))
ax = plt.subplot(111, facecolor='lightgray')
ax.plot(x2, y2, 'o', markersize=8, color='blue', alpha=0.5)
plt.title('PCA Scatter Plot Random')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.xticks(np.arange(-4, 6, 1))
plt.savefig("static/images/ScatterPlot_Random.png", dpi=300)

mds = MDS(n_components=2, dissimilarity="euclidean")
results = mds.fit(X)
coordsX = results.embedding_

print "Plotting MDS Scatter Plot Euclidean - Stratified"
plt.figure(4, figsize=(12, 6))
ax = plt.subplot(111, facecolor='lightgray')
plt.scatter(coordsX[:, 0], coordsX[:, 1], marker='o', color='blue')
plt.title('MDS Scatter Plot Euclidean - Stratified')
plt.savefig("static/images/MDS_Stratified_Euclidean.png", dpi=300)

results = mds.fit(Y)
coordsY = results.embedding_

print "Plotting MDS Scatter Plot Euclidean - Random"
plt.figure(5, figsize=(12, 6))
Example #11
def NMDS_Plot(df):

    outFile = sys.argv[1] + '.NMDS.pdf'
    Treatment = []
    Samples = list(df.Samples)
    Treatment = [s[:2] for s in Samples]

    #	for index, row in df.iterrows():
    #		Treatment.append(row['Samples'][:2])

    df['Treatment'] = Treatment
    tcount = list(df.groupby('Treatment').count().Samples)
    t = list(df.Treatment.unique())
    tt = list(zip(t, tcount))

    print(t)
    print(tcount)

    del df['Samples']
    del df['Treatment']

    m = ['o', 'v', '^', 's', 'p', 'P', '*', 'X', 'd', 'x']

    c = [
        '#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99', '#e31a1c',
        '#fdbf6f', '#ff7f00', '#cab2d6', '#6a3d9a'
    ]

    seed = np.random.RandomState(seed=3)
    nmds = MDS(n_components=2,
               metric=False,
               max_iter=10000,
               eps=1e-12,
               dissimilarity="precomputed",
               random_state=seed,
               n_jobs=1,
               n_init=10000)

    npos = nmds.fit_transform(df)

    distances = []

    for i in range(len(npos) - 1):
        j = i + 1
        distances.append(
            np.sqrt((npos[j, 0] - npos[i, 0])**2 +
                    (npos[j, 1] - npos[i, 1])**2))

    stress = np.sqrt(nmds.stress_ / sum(distances)**2)
    stext = 'Stress = %f' % (stress)

    df2 = pd.DataFrame({'Sample': Samples, 'X': npos[:, 0], 'Y': npos[:, 1]})
    df_file = sys.argv[1] + '_NMDS_DataFrame.tsv'
    df2.to_csv(df_file, sep='\t')

    a = 0
    for i, j in enumerate(tt):
        b = a + j[1]
        plt.scatter(npos[a:b, 0],
                    npos[a:b, 1],
                    c=c[i],
                    marker=m[i],
                    label=j[0])
        a += j[1]

    plt.subplots_adjust(right=0.7)
    ax = plt.gca()
    plt.axis('equal')
    plt.title('NMDS plot of Mash Distance')
    plt.legend(frameon=False, bbox_to_anchor=(1.04, 1.05), loc="upper left")
    plt.text(1,
             .010,
             stext,
             fontsize=10,
             color='#737373',
             horizontalalignment='right',
             transform=ax.transAxes)
    plt.savefig(outFile)
    plt.close()
Example #12
def plot_clusters(num_clusters,
                  feature_matrix,
                  cluster_data,
                  book_data,
                  plot_size=(16, 8)):
    # generate random color for clusters
    def generate_random_color():
        color = '#%06x' % random.randint(0, 0xFFFFFF)
        return color

    # define markers for clusters
    markers = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd']
    # build cosine distance matrix
    cosine_distance = 1 - cosine_similarity(feature_matrix)
    # dimensionality reduction using MDS
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    # get coordinates of clusters in new low-dimensional space
    plot_positions = mds.fit_transform(cosine_distance)
    x_pos, y_pos = plot_positions[:, 0], plot_positions[:, 1]
    # build cluster plotting data
    cluster_color_map = {}
    cluster_name_map = {}
    for cluster_num, cluster_details in cluster_data.items():
        # assign cluster features to unique label
        cluster_color_map[cluster_num] = generate_random_color()
        cluster_name_map[cluster_num] = ', '.join(
            cluster_details['key_features'][:5]).strip()
    # map each unique cluster label with its coordinates and books
    cluster_plot_frame = pd.DataFrame({
        'x': x_pos,
        'y': y_pos,
        'label': book_data['Cluster'].values.tolist(),
        'title': book_data['title'].values.tolist()
    })
    grouped_plot_frame = cluster_plot_frame.groupby('label')
    # set plot figure size and axes
    fig, ax = plt.subplots(figsize=plot_size)
    ax.margins(0.05)
    # plot each cluster using co-ordinates and book titles
    for cluster_num, cluster_frame in grouped_plot_frame:
        marker = markers[cluster_num] if cluster_num < len(markers) \
            else np.random.choice(markers, size=1)[0]
        ax.plot(cluster_frame['x'],
                cluster_frame['y'],
                marker=marker,
                linestyle='',
                ms=12,
                label=cluster_name_map[cluster_num],
                color=cluster_color_map[cluster_num],
                mec='none')
        ax.set_aspect('auto')
        ax.tick_params(axis='x',
                       which='both',
                       bottom=False,
                       top=False,
                       labelbottom=False)
        ax.tick_params(axis='y',
                       which='both',
                       left=False,
                       right=False,
                       labelleft=False)
    fontP = FontProperties()
    fontP.set_size('small')
    ax.legend(loc='upper center',
              bbox_to_anchor=(0.5, -0.01),
              fancybox=True,
              shadow=True,
              ncol=5,
              numpoints=1,
              prop=fontP)
    # add labels as the film titles
    for index in range(len(cluster_plot_frame)):
        ax.text(cluster_plot_frame.iloc[index]['x'],
                cluster_plot_frame.iloc[index]['y'],
                cluster_plot_frame.iloc[index]['title'],
                size=8)
        # show the plot
    plt.show()
Example #13
            for member in temp:
                member.append(1)
        reslist.extend(temp)
    return reslist


# Warning!
# This script generates the embedding for the action vectors. An action is a [10, 11] vector:
# 10 frames by 11 key controls (1 = pressed, 0 = not pressed).
# sklearn's multidimensional scaling is used to turn [10, 15] into [10, 1000], so the
# action-vector embedding serves as the output embedding.
# Because MDS is stochastic, do not run this script unless you want to retrain from scratch.
if __name__ == "__main__":
    opVec = genOnehot(11)
    opDict = {}
    for i in range(len(opVec)):
        opDict[i] = list(opVec[i])
    print(opDict)
    embedding = MDS(n_components=1000)
    opVec_embedding = embedding.fit_transform(opVec)
    op_embedding_Dict = {}
    for i in range(len(opVec_embedding)):
        op_embedding_Dict[i] = list(opVec_embedding[i])
    print(op_embedding_Dict)
    op_embedding_Dict_js = json.dumps(op_embedding_Dict, indent=1)
    opDict_js = json.dumps(opDict, indent=1)
    f = open('../config/embedding.txt', 'w')
    f.write(op_embedding_Dict_js)
    f.close()
    f = open('../config/opDict.txt', 'w')
    f.write(opDict_js)
    f.close()
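
As the warning above notes, MDS is stochastic; a possible variant (an assumption, not in the original script) is to pin the seed so the embedding is reproducible:

    embedding = MDS(n_components=1000, random_state=0)  # fixed seed, same result on every run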
Example #14
drawer.FinishDrawing()
svg = drawer.GetDrawingText()
display(SVG(svg.replace("svg:", "")))


# ## Different lenses
# Here I am going to try to do a multidimensional scaling analysis of the distance data to reveal the most important dimensions. This requires a bit of thinking on what the MDS actually outputs, and how it relates to the physical features of chemical space.
# 
# We must use a nonlinear MDS because $ 1 - T_C $ is not necessarily positive semidefinite.

# In[22]:


from sklearn.manifold import MDS
print(sq_distance_matrix.shape)
transformed_data = MDS(n_components=2, dissimilarity="precomputed", metric=False).fit_transform(sq_distance_matrix)
transformed_data
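

# A short illustration (not from the original notebook) of the claim above: classical
# metric MDS assumes the double-centered matrix B = -0.5 * J @ (D**2) @ J is positive
# semidefinite, and for a dissimilarity like 1 - T_C it can have negative eigenvalues.
import numpy as np

D = sq_distance_matrix  # the precomputed dissimilarity matrix from above
n = D.shape[0]
J = np.eye(n) - np.ones((n, n)) / n  # centering matrix
B = -0.5 * J @ (D ** 2) @ J  # classical-MDS Gram matrix
print(np.linalg.eigvalsh(B).min())  # a negative value confirms the matrix is not PSD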


# In[25]:


plt.scatter(transformed_data[:, 0], transformed_data[:, 1])


# In[24]:


get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
plt.imshow(sq_distance_matrix, zorder=2, cmap='Blues', interpolation='nearest')
Example #15
print(Dmat)

################################################
# real Dmat
t2 = timeit.default_timer()
Dmat_real = np.zeros((len(S), len(S)))
for i in range(k):
    for j in range(i + 1, k):
        Dmat_real[i, j] = np.linalg.norm(
            np.matmul(w, S[i]) - np.matmul(w, S[j]), ord=1)
        Dmat_real[j, i] = Dmat_real[i, j]
t3 = timeit.default_timer()
################################################
# MDS
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
mds = MDS(n_components=2, dissimilarity='precomputed')
pos = mds.fit(Dmat).embedding_
plt.figure()
plt.scatter(pos[:, 0], pos[:, 1], color='black', s=10, lw=0)
plt.title("Estimated, time=%s" % (t1 - t0))
plt.show()

mds = MDS(n_components=2, dissimilarity='precomputed')
pos = mds.fit(Dmat_real).embedding_
plt.figure()
plt.scatter(pos[:, 0], pos[:, 1], color='black', s=10, lw=0)
plt.title("True, time=%s" % (t3 - t2))
plt.show()
Example #16
def main_function(num_clusters, retokenize, recluster, corpusdir, dataset_path, n_words, minibatch, num_processes):
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')

    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

    stemmer = SnowballStemmer("english")

    dataset_name, file_place = initialize_output_location(dataset_path)
    trailer_text = dataset_name + "_k=" + str(num_clusters)
    
    print("\nAll outputs generated will be in \"~\\cluster-datalake-outputs\\" + dataset_name + "--output\"")

    #=========1=========2=========3=========4=========5=========6=======

    # tokenize and cluster    
    fnames, dataset = to_retokenize(retokenize, corpusdir, dataset_path, num_processes)
    tfidf_matrix = np.load(os.path.join(file_place, "tfidf_matrix_" + dataset_name + ".npy")).item()
    to_recluster(num_clusters, retokenize, recluster, tfidf_matrix, dataset_path, minibatch)
    
    # load in existing saved files
    km = joblib.load(os.path.join(file_place, 'doc_cluster_' + trailer_text + '.pkl'))
    vocab_frame = pd.read_pickle(os.path.join(file_place, "vocab_frame_" + dataset_name + ".pkl"))
    terms = np.load(os.path.join(file_place, "terms_" + dataset_name + ".npy")).tolist()
    dist = np.load(os.path.join(file_place, "distance_matrix_" + dataset_name + ".npy"))
    print("\nLoaded in existing dependencies...\n")

    clusters = km.labels_.tolist()

    # get the actual number of clusters in the dataframe
    distinct_cluster_labels = []
    for label in clusters:
        if label not in distinct_cluster_labels:
            distinct_cluster_labels.append(label)
    
    # create a dictionary "db" of filenames, contents, and clusters
    db = {'filename': fnames, 'content': dataset, 'cluster': clusters}
    # convert "db" to a pandas dataframe
    frame = pd.DataFrame(db, index=[clusters], columns=['filename','cluster'])
    # print the number of files in each cluster
    #print("Number of files in each cluster: ")
    #print(frame['cluster'].value_counts())

    #=========1=========2=========3=========4=========5=========6=======
   
    # open file writer for result output
    fwriter = open(os.path.join(file_place, "doc_clusters_" + trailer_text + ".txt"), "w")
    fwriter.write("Clusters from text files in: " + corpusdir)

    fwriter.write("\nTop terms per cluster: \n\n")
    print("Top terms per cluster: \n")

    #sort cluster centers by proximity to centroid
    order_centroids = km.cluster_centers_.argsort()[:, ::-1] 
   
    all_cluster_words = {}
    # for each cluster

    ''' terms contains all the feature labels of the clustering
        vocab_frame contains all the tokens mapped to their stemmed counterparts
        you're finding the token version of the stem you get
        the stem you get is at the ind position of the terms list
        ind is from order_centroids
        order_centroids is a sorted array with num_clusters rows and len(terms) features
        order_centroids[i] is the coordinates of cluster i
        order_centroids[i,:] is the coordinates from ALL features for cluster i'''
         
    distinct_cluster_labels = sorted(distinct_cluster_labels)
    for i in distinct_cluster_labels:
        fwriter.write("Cluster " + str(i) + " words: ")
        print("Cluster %d words:" % i, end='')
        cluster_words = [] 
        seen = []
        
        # print the first "n_words" words in a cluster
        for ind in order_centroids[i, : n_words]:
            
            print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0], end=",")
            fwriter.write(vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0].rstrip('\n') + ", ")
            cluster_words.append(vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0])
        print()
        fwriter.write("\n")
         
        all_cluster_words.update({i:cluster_words})

        # print out the filenames in the cluster
        print("Cluster %d filenames:" % i, end='')
        fwriter.write("Cluster " + str(i) + " filenames: ")
        for filename in frame.loc[i]['filename'].values.tolist():
            print(' %s,' % filename, end='')
            fwriter.write(filename.rstrip('\n') + ", ")
        print("\n")
        fwriter.write("\n\n") 

    fwriter.close()
    print("Output written to \"doc_clusters_" + trailer_text + ".txt\"")

    #=========1=========2=========3=========4=========5=========6========
    
     
    if not os.path.isfile(os.path.join(file_place, "mds_pos_" + trailer_text + ".npy")):
        retokenize = "1"
    
    if retokenize == "1":
        # multidimensional scaling: convert distance matrix into 3-dimensions
        mds = MDS(n_components=3, dissimilarity="precomputed", random_state=1)
        print("\nFitting the distance matrix into 3 dimensions...")
        pos_save = mds.fit_transform(dist)  # shape (n_samples, n_components)
        np.save(os.path.join(file_place, "mds_pos_" + trailer_text + ".npy"), pos_save)
    
    position_array = np.load(os.path.join(file_place, "mds_pos_" + trailer_text + ".npy"))
    print("Loaded existing MDS fit.")
    pos = position_array
    xs, ys, zs = pos[:, 0], pos[:, 1], pos[:, 2]

    # set up plot
    fig = plt.figure(figsize=(17,9))
    ax = Axes3D(fig)

    # create data frame with MDS results, cluster numbers, and filenames
    df = pd.DataFrame(dict(x=xs, y=ys, z=zs, label=clusters, filename=fnames)) 
    
    # group by cluster
    groups = df.groupby('label')

    # for each cluster, plot the files in that cluster
    print("\n\nPlotting scatterplot of cluster points...")
    for name, group in tqdm(groups):
        # color = ('#%06X' % random.randint(0,256**3-1))
        color = np.random.rand(3,)
        for t in range(group.shape[0]):
            ax.scatter(group.x.iloc[t], group.y.iloc[t], group.z.iloc[t], 
                c=color, marker='o')
            ax.set_aspect('auto')

    plt.savefig(os.path.join(file_place, "3D_document_cluster_" + trailer_text + ".svg"), dpi=300)
    print("Scatter plot written to \"3D_document_cluster_" + trailer_text + ".svg\"")
      
       
    return frame, all_cluster_words, distinct_cluster_labels
Example #17
# Kmeans++
km = KMeans(n_clusters=29,
            init='k-means++',
            max_iter=300,
            n_init=1,
            verbose=0,
            random_state=3425)
km.fit(tfidf_matrix)
labels = km.labels_
clusters = labels.tolist()

# Calculating the distance measure derived from cosine similarity
distance = 1 - cosine_similarity(tfidf_matrix)

# Dimensionality reduction using Multidimensional scaling (MDS)
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(distance)
xs, ys = pos[:, 0], pos[:, 1]

# Saving cluster visualization after mutidimensional scaling
for x, y in zip(xs, ys):
    plt.scatter(x, y)

# Creating dataframe containing reduced dimensions, identified labels and text data for plotting KMeans output
result = pd.DataFrame(dict(label=clusters, data=text, x=xs, y=ys))
result.to_csv(os.path.join(outfile, 'kmeans_clustered_DFN.csv'), sep=';')

#List of cluster
listcluster = result.groupby('label').size()
Example #18
# Multidimensional Scaling
from sklearn.manifold import MDS


# Parameter settings
n_components = 2
n_init = 12
max_iter = 1200
metric = True
n_jobs = 4
random_state = 2018


# Create the MDS instance
mds = MDS(n_components=n_components, n_init=n_init, max_iter=max_iter,
          metric=metric, n_jobs=n_jobs, random_state=random_state)


# Run MDS
X_train_mds = mds.fit_transform(X_train.loc[0:1000, :])


# Convert to a DataFrame
X_train_mds = pd.DataFrame(data=X_train_mds, index=train_index[0:1001])


# Display the plot
scatterPlot(X_train_mds, y_train, "Multidimensional Scaling")


# 3.8 LLE (Locally Linear Embedding) ------------------------------------------------------------
Example #19
import numpy as np
from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
from pathlib import Path
from matplotlib import pyplot as plt, rcParams

root = Path('doc_matrices')
paths = list(root.glob('**/*.npy'))

mds = MDS(eps=0.0001, max_iter=3000, n_jobs=3, metric=False)
tsne = TSNE(n_components=2,
            init='random',
            method='exact',
            perplexity=7,
            early_exaggeration=100,
            n_iter=3500)

projectors = [tsne]
y = '#F0BE41'
b = '#5383EC'
r = '#D85040'
g = '#58A55C'
w = '#D8DCD6'
colours = [
    g, r, r, r, y, b, g, r, y, y, r, r, b, y, w, y, y, r, r, r, b, y, w, b
]

plt.close('all')
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(3.33, 3.33 * 0.8), dpi=220)

rcParams['font.sans-serif'] = [
Example #20
def cluster_run(path):

    df = pd.DataFrame(columns=['File_Name', 'Content'])
    companies = []
    indX = []
    os.chdir(path)
    for file in glob.glob("*.pdf"):

        raw = parser.from_file(file)
        text = raw['content']
        text = text.replace('\n', ' ')
        text = text.replace('\t', ' ')
        text = text.replace('\r', ' ')
        text = text.replace('\xa0', ' ')
        text = text.lower()
        companies.append(file)
        df1 = {'File_Name': file, 'Content': text[0:30000]}
        df = pd.concat([df, pd.DataFrame([df1])], ignore_index=True)

    for i in range(0, len(companies)):
        indX.append(i)

    df.to_csv('pdf_Files_Details1.csv', encoding='utf-8', index=False)

    df = pd.read_csv("pdf_Files_Details1.csv")

    text = []
    for i in df['Content']:
        text.append(BeautifulSoup(i, 'html.parser').getText())

    stopwords = nltk.corpus.stopwords.words('english')

    stemmer = SnowballStemmer("english")

    def tokenize_and_stem(text):
        # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
        tokens = [
            word for sent in nltk.sent_tokenize(text)
            for word in nltk.word_tokenize(sent)
        ]
        filtered_tokens = []
        # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
        for token in tokens:
            if re.search('[a-zA-Z]', token):
                filtered_tokens.append(token)
        stems = [stemmer.stem(t) for t in filtered_tokens]
        return stems

    def tokenize_only(text):
        # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
        tokens = [
            word.lower() for sent in nltk.sent_tokenize(text)
            for word in nltk.word_tokenize(sent)
        ]
        filtered_tokens = []
        # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
        for token in tokens:
            if re.search('[a-zA-Z]', token):
                filtered_tokens.append(token)
        return filtered_tokens

    totalvocab_stemmed = []
    totalvocab_tokenized = []
    for q in text:
        allwords_stemmed = tokenize_and_stem(q)
        totalvocab_stemmed.extend(allwords_stemmed)

        allwords_tokenized = tokenize_only(q)
        totalvocab_tokenized.extend(allwords_tokenized)

    vocab_frame = pd.DataFrame({'words': totalvocab_tokenized},
                               index=totalvocab_stemmed)

    words = vocab_frame['words']
    words = words.tolist()

    tfidf_vectorizer = TfidfVectorizer(max_df=0.8,
                                       max_features=200000,
                                       min_df=0.2,
                                       stop_words='english',
                                       use_idf=True,
                                       tokenizer=tokenize_and_stem,
                                       ngram_range=(1, 3))

    tfidf_matrix = tfidf_vectorizer.fit_transform(text)

    terms = tfidf_vectorizer.get_feature_names()
    dist = 1 - cosine_similarity(tfidf_matrix)

    #K-Means clustering

    num_clusters = 4
    km = KMeans(n_clusters=num_clusters)

    km.fit(tfidf_matrix)

    clusters = km.labels_.tolist()

    joblib.dump(km, 'doc_cluster.pkl')
    km = joblib.load('doc_cluster.pkl')
    clusters = km.labels_.tolist()

    findata = {
        'companies': companies,
        'index': indX,
        'text': text,
        'cluster': clusters
    }

    frame = pd.DataFrame(findata,
                         index=[clusters],
                         columns=['index', 'text', 'cluster', 'companies'])

    frame = frame.sort_values(by='index')

    MDS()

    # two components as we're plotting points in a two-dimensional plane
    # "precomputed" because we provide a distance matrix
    # we will also specify `random_state` so the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

    xs, ys = pos[:, 0], pos[:, 1]

    #strip any proper nouns (NNP) or plural proper nouns (NNPS) from a text

    def strip_proppers_POS(text):
        tagged = pos_tag(text.split())  #use NLTK's part of speech tagger
        non_propernouns = [
            word for word, pos in tagged if pos != 'NNP' and pos != 'NNPS'
        ]
        return non_propernouns

    #set up colors per clusters using a dict
    cluster_colors = {
        0: '#1b9e77',
        1: '#d95f02',
        2: '#7570b3',
        3: '#e7298a',
        4: '#66a61e'
    }

    #set up cluster names using a dict
    cluster_names = {
        0: 'Cluster 0',
        1: 'Cluster 1',
        2: 'Cluster 2',
        3: 'Cluster 3'
    }

    #create data frame that has the result of the MDS plus the cluster numbers and titles
    data = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=companies))

    #group by cluster
    groups = data.groupby('label')

    # set up plot
    fig, ax = plt.subplots(figsize=(17, 9))  # set size
    ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling

    #iterate through groups to layer the plot
    #note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
    for name, group in groups:
        ax.plot(group.x,
                group.y,
                marker='o',
                linestyle='',
                ms=12,
                label=cluster_names[name],
                color=cluster_colors[name],
                mec='none')
        ax.set_aspect('auto')
        ax.tick_params(
            axis='x',          # changes apply to the x-axis
            which='both',      # both major and minor ticks are affected
            bottom=False,      # ticks along the bottom edge are off
            top=False,         # ticks along the top edge are off
            labelbottom=False)
        ax.tick_params(
            axis='y',          # changes apply to the y-axis
            which='both',      # both major and minor ticks are affected
            left=False,        # ticks along the left edge are off
            right=False,       # ticks along the right edge are off
            labelleft=False)

    ax.legend(numpoints=1)  #show legend with only 1 point

    #add label in x,y position with the label as the film title
    for i in range(len(data)):
        ax.text(data.iloc[i]['x'],
                data.iloc[i]['y'],
                data.iloc[i]['title'],
                size=8)

    p = os.path.abspath(
        r'C:\Users\Rohan.Gupta.USNIIT-TECH\Downloads\LeaseModel\templates\plot.html'
    )

    mpld3.save_html(fig, p)
Example #21
def clust(num_clusters, tfidf_matrix, dist, titles, ranks, sum_all):
    km = KMeans(n_clusters=num_clusters)
    km.fit(tfidf_matrix)
    clusters = km.labels_.tolist()
    films = {
        'title': titles,
        'rank': ranks,
        'synopsis': sum_all,
        'cluster': clusters
    }
    frame = pd.DataFrame(films,
                         index=[clusters],
                         columns=['title', 'rank', 'cluster'])
    print(frame['cluster'].value_counts())
    print("\n")
    grouped = frame['rank'].groupby(
        frame['cluster'])  #groupby cluster for aggregation purposes
    print(grouped.mean())

    print("Top terms per cluster:")
    print()
    #sort cluster centers by proximity to centroid
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    for i in range(num_clusters):
        print("Cluster %d words:" % i, end='')

        for ind in order_centroids[i, :6]:  #replace 6 with n words per cluster
            try:
                print(' %s' % vocab_frame.loc[terms[ind].split(
                    ' ')].values.tolist()[0][0].encode('utf-8', 'ignore'),
                      end=',')
            except:
                pass
        print()  #add whitespace
        print()  #add whitespace

#       print("Cluster %d titles:" % i, end='')
#       for title in frame.loc[i]['title'].values.tolist():
#           print(' %s,' % title, end='')
#       print() #add whitespace
#       print() #add whitespace

    MDS()
    # two components as we're plotting points in a two-dimensional plane
    # "precomputed" because we provide a distance matrix
    # we will also specify `random_state` so the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

    xs, ys = pos[:, 0], pos[:, 1]
    print()
    print()

    #set up colors per clusters using a dict
    cluster_colors = {
        0: '#1b9e77',
        1: '#d95f02',
        2: '#7570b3',
        3: '#e7298a',
        4: '#66a61e'
    }
    #set up cluster names using a dict
    cluster_names = {0: '1', 1: '2', 2: '3', 3: '4', 4: '5'}
    #some ipython magic to show the matplotlib plots inline
    get_ipython().run_line_magic('matplotlib', 'inline')
    #create data frame that has the result of the MDS plus the cluster numbers and titles
    df = pd.DataFrame(
        dict(x=xs, y=ys, label=clusters, title=titles, rank=ranks))
    #group by cluster
    groups = df.groupby('label')
    # set up plot
    fig, ax = plt.subplots(figsize=(17, 9))  # set size
    ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling
    #iterate through groups to layer the plot
    #note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
    for name, group in groups:
        ax.plot(group.x,
                group.y,
                marker='o',
                linestyle='',
                ms=12,
                label=cluster_names[name],
                color=cluster_colors[name],
                mec='none')
        ax.set_aspect('auto')
        ax.tick_params(
            axis='x',  # changes apply to the x-axis
            which='both',  # both major and minor ticks are affected
            bottom='off',  # ticks along the bottom edge are off
            top='off',  # ticks along the top edge are off
            labelbottom='off')
        ax.tick_params(
            axis='y',  # changes apply to the y-axis
            which='both',  # both major and minor ticks are affected
            left='off',  # ticks along the bottom edge are off
            top='off',  # ticks along the top edge are off
            labelleft='off')

    ax.legend(numpoints=1)  #show legend with only 1 point

    #add label in x,y position with the label as the film title
    for i in range(len(df)):
        ax.text(df.loc[i]['x'], df.loc[i]['y'], df.loc[i]['title'], size=8)
    plt.show()  #show the plot
Example #22
    'SpectralEmbedding', 'PCA'
]
doc_top = np.load('doc_top.npy')
top_word = np.load('top_word.npy')
fig_doc = plt.figure(1)
fig_doc.suptitle('Manifold Learning for document in NIPS(after LDA)')
fig_word = plt.figure(2)
fig_word.suptitle('Manifold Learning for word in NIPS(after LDA)')
for i, method in enumerate(methods):
    print('{} starts......'.format(method))
    if method in LLE_dict.keys():
        manifold = LocallyLinearEmbedding(n_neighbors=6,
                                          method=LLE_dict[method],
                                          eigen_solver='dense')
    elif method == 'MDS':
        manifold = MDS(n_init=1, max_iter=100)
    elif method == 'PCA':
        manifold = PCA(n_components=2)
    else:
        exec('manifold={}(n_neighbors=6)'.format(method))
    doc_2D = manifold.fit_transform(doc_top)
    np.save('doc_2d_{}'.format(method), doc_2D)
    word_2D = manifold.fit_transform(top_word.T)
    np.save('word_2d_{}'.format(method), word_2D)
    ax = fig_doc.add_subplot(241 + i)
    ax.scatter(doc_2D[:, 0], doc_2D[:, 1], s=1)
    ax.set_title(method)
    ax.xaxis.set_major_formatter(NullFormatter())
    ax.yaxis.set_major_formatter(NullFormatter())
    ax.axis('tight')
    ax = fig_word.add_subplot(241 + i)
Example #23
    def __plot_samples__(self, dfs, fold):
        """
        :type dfs: List[pandas DataFrame]      # [training df, testing df]
        :type fold: int
        :rtype: None
        """

        mds = MDS(n_components=2,
                  max_iter=3000,
                  eps=1e-9,
                  dissimilarity='euclidean',
                  n_jobs=-1)
        tsne = TSNE(n_components=2)

        # change label to color index
        #   author 1 train (0 = light blue), author 1 test (1 = dark blue)
        #   author 2 train (2 = light green), author 2 test (3 = dark green)
        df0_copy = dfs[0].copy()
        df0_copy.loc[(df0_copy.label == 1).values, 'label'] = 0
        df0_copy.loc[(df0_copy.label == -1).values, 'label'] = 2

        df1_copy = dfs[1].copy()
        df1_copy.loc[(df1_copy.label == 1).values, 'label'] = 1
        df1_copy.loc[(df1_copy.label == -1).values, 'label'] = 3

        # DataFrame.append was removed from pandas; concatenate the two copies instead
        df_all = pd.concat([df0_copy, df1_copy])

        legend = {
            0: 'Author 1 Training Sample',
            1: 'Author 1 Test Sample',
            2: 'Author 2 Training Sample',
            3: 'Author 2 Test Sample'
        }

        # fit on training data
        pos_lst = [('Multi-Dimensional Scaling (MDS)',
                    mds.fit(df_all.drop('label', axis=1)).embedding_),
                   ('t-Distributed Stochastic Neighbor Embedding (TSNE)',
                    tsne.fit(df_all.drop('label', axis=1)).embedding_)]

        # plot
        colors = sns.color_palette('Paired', 4)
        fig = plt.figure(figsize=(16, 7))

        for k, (title, pos) in enumerate(pos_lst, 1):

            ## fig.add_subplot() works in ipython notebook but creates a
            ## mysterious 3rd axes in python...
            # ax = fig.add_subplot(1,2,k)

            ax = plt.subplot(1, 2, k)
            ax.set_title(title)

            for i in range(len(colors)):
                samples = pos[(df_all.label == i).values, :]
                ax.scatter(samples[:, 0],
                           samples[:, 1],
                           c=colors[i],
                           edgecolor='none',
                           label=legend[i])
            ax.legend()


        plt.savefig('../figs/' + \
                   self.__PG_STATS_TBL__[self.__PG_STATS_TBL__.find("_")+1:] + \
                   'fold' + str(fold) + '.png',
                   dpi=300, transparent=True)

        plt.close(fig)
Example #24
def tweet_clusters():
    docs = read_tweet_docs()
    cv = TfidfVectorizer(stop_words=tfidf_stop_words,
                         ngram_range=(1, 2),
                         min_df=0.1,
                         max_df=0.7)
    tfidf_matrix = cv.fit_transform(docs)
    terms = cv.get_feature_names()

    dist = 1 - sk_cos_sim(tfidf_matrix)

    num_clusters = 10
    km = KMeans(n_clusters=num_clusters)
    km.fit(tfidf_matrix)
    clusters = km.labels_.tolist()
    print(clusters)
    senators = {'name': twitterhandles, 'cluster': clusters}
    frame = pd.DataFrame(senators,
                         index=[clusters],
                         columns=['name', 'cluster'])
    print(frame.head())

    print("Top terms per cluster:")
    print()
    #sort cluster centers by proximity to centroid
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    cluster_array = []
    for i in range(num_clusters):
        cluster = {"words": [], "senators": []}
        #print("Cluster %d words:" % i, end='')

        for ind in order_centroids[
                i, :10]:  # replace 10 with n words per cluster
            #print(' %s' % terms[ind], end=',')
            cluster["words"].append(terms[ind])
        print()  #add whitespace
        print()  #add whitespace

        #print("Cluster %d names:" % i, end='')
        for title in frame.loc[i]['name'].values.tolist():
            #print(' %s,' % title, end='')
            cluster["senators"].append(title)

        cluster_array.append(cluster)
        #print(json.dumps(cluster))
        print()  #add whitespace
        print()  #add whitespace

    print()
    print()
    print(json.dumps(cluster_array))
    cluster_file = open("cluster_file.json", "w")
    cluster_file.write(json.dumps(cluster_array))
    cluster_file.close()

    #set up colors per clusters using a dict
    cluster_colors = {
        0: '#1b9e77',
        1: '#d95f02',
        2: '#7570b3',
        3: '#e7298a',
        4: '#66a61e'
    }

    #set up cluster names using a dict
    cluster_names = {
        0: 'Family, home, war',
        1: 'Police, killed, murders',
        2: 'Father, New York, brothers',
        3: 'Dance, singing, love',
        4: 'Killed, soldiers, captain'
    }

    MDS()

    # two components as we're plotting points in a two-dimensional plane
    # "precomputed" because we provide a distance matrix
    # we will also specify `random_state` so the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

    xs, ys = pos[:, 0], pos[:, 1]
    print()
    print()
    #create data frame that has the result of the MDS plus the cluster numbers and titles
    df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=twitterhandles))

    #group by cluster
    groups = df.groupby('label')

    # set up plot
    fig, ax = plt.subplots(figsize=(17, 9))  # set size
    ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling

    #iterate through groups to layer the plot
    #note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
    for name, group in groups:
        #ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
        #        label=cluster_names[name], color=cluster_colors[name],
        #        mec='none')
        ax.set_aspect('auto')
        ax.tick_params(
            axis='x',          # changes apply to the x-axis
            which='both',      # both major and minor ticks are affected
            bottom=False,      # ticks along the bottom edge are off
            top=False,         # ticks along the top edge are off
            labelbottom=False)
        ax.tick_params(
            axis='y',          # changes apply to the y-axis
            which='both',      # both major and minor ticks are affected
            left=False,        # ticks along the left edge are off
            right=False,       # ticks along the right edge are off
            labelleft=False)

    ax.legend(numpoints=1)  #show legend with only 1 point

    #add label in x,y position with the label as the film title
    for i in range(len(df)):
        ax.text(df.loc[i]['x'], df.loc[i]['y'], df.loc[i]['title'], size=8)
Example #25
plt.show()

# calculated distance matrix

from sklearn.metrics import pairwise_distances
D = pairwise_distances(X)
print(D.shape)

plt.imshow(D, zorder=2, cmap='Blues', interpolation='nearest')
plt.colorbar()
plt.show()

# MDS: distance matrix ---- coordinate representation

from sklearn.manifold import MDS
model = MDS(n_components=2, dissimilarity='precomputed', random_state=1)
out = model.fit_transform(D)
plt.scatter(out[:, 0], out[:, 1], **colorize)
plt.axis('equal')
plt.show()

print('#---------------------------------#')
print('    MDS as Manifold learning       ')
print('#---------------------------------#')
print("\n")


def random_projection(X, dimension=3, rseed=42):
    assert dimension >= X.shape[1]
    rng = np.random.RandomState(rseed)
    C = rng.randn(dimension, dimension)
Example #26
                ],
                [
                    0.583399634410048, 0.419747266788418, 1, 0.568688414138699,
                    0.522467346811204, 0.397952306807963
                ],
                [
                    0.592856152221665, 0.412688190059770, 0.568688414138699, 1,
                    0.502786232321456, 0.399008249004968
                ],
                [
                    0.539470828139999, 0.386710678519393, 0.522467346811204,
                    0.502786232321456, 1, 0.351377435042073
                ],
                [
                    0.387598235354629, 0.432336134790601, 0.397952306807963,
                    0.399008249004968, 0.351377435042073, 1
                ]])

# dissimilarity is 1 minus similarity
dissimilarities = 1 - S

# compute the embedding
coord = MDS(dissimilarity='precomputed').fit_transform(dissimilarities)

plt.scatter(coord[:, 0], coord[:, 1])

# Label the points
for i in range(coord.shape[0]):
    plt.annotate(str(i), (coord[i, :]))

plt.show()
Example #27
def dimension_reduction(data, index, method='tsne', label=None, plot=False):
    n_components = 2

    # All of these dimensionality-reduction methods are distance-based, so the feature distances must be standardized first
    scaler = StandardScaler().fit(data)
    data = scaler.transform(data)

    # In most cases t-SNE is used to display high-dimensional data in two dimensions. Different methods
    # use different feature mappings to compute a different X via fit_transform; here min-max scaling applies.
    if method == 'tsne':
        model = TSNE(n_components=n_components,
                     perplexity=20,
                     early_exaggeration=100.0,
                     method='exact',
                     learning_rate=100,
                     n_iter=1000,
                     random_state=250,
                     verbose=2)
        X = model.fit_transform(data)  # X has two columns, after clustering + normalization
    if method == 'isomap':
        model = Isomap(n_components=n_components, n_neighbors=20)
        X = model.fit_transform(data)
    if method == 'MDS':
        model = MDS(n_components=n_components,
                    verbose=2,
                    n_init=1,
                    max_iter=500)
        X = model.fit_transform(data)
    if method == 'tsne_v2':
        X = tsne(data, 2, 44, 50.0)

    data_len = len(X)  # length of X
    print(data_len)  # data_len = 1653
    print(X)  # 2-D array of shape (1653, 2)
    if plot:
        fig, ax = plt.subplots()  # create the figure/axes; number of subplots not fixed yet
        # plt.subplot(2, 1, 1)  # a 2-row, 1-column panel, taking the first (clockwise numbering)
        # plt.plot(x1, y1, 'yo-')  # draw and color the line
        #        plt.scatter(X[label==0,0],X[label==0,1],c='darkblue',alpha=0.25,marker='^')
        #        plt.scatter(X[label==1,0],X[label==1,1],c='darkred',alpha=0.75,marker='x')
        #        plt.scatter(X[label==2,0],X[label==2,1],c='green',alpha=0.25,marker='o')
        #        plt.xlim([np.min(X[label==0,0]),np.max(X[label==0,0])])
        #        plt.ylim([np.min(X[label==0,1]),np.max(X[label==0,1])])
        ax.scatter(X[label == 0, 0],
                   X[label == 0, 1],
                   c='darkblue',
                   alpha=0.25,
                   marker='^')
        ax.scatter(X[label == 1, 0],
                   X[label == 1, 1],
                   c='darkred',
                   alpha=0.75,
                   marker='x')
        ax.scatter(X[label == 2, 0],
                   X[label == 2, 1],
                   c='green',
                   alpha=0.25,
                   marker='o')
        ax.set_xlim([np.min(X[label == 0, 0]), np.max(X[label == 0, 0])])
        ax.set_ylim([np.min(X[label == 0, 1]), np.max(X[label == 0, 1])])
        idxList = []
        nameList = []
        for i, ind in enumerate(index):
            if not ((-20 < X[ind, 0] < 20) and (-20 < X[ind, 1] < 20)):
                print(ind)
                idxList.append(ind)
                nameList.append(name[ind])
                # plt.annotate('This is awesome!', xy=(76, 0.75),
                ax.annotate(str(ind), xy=(X[ind, 0], X[ind, 1]))
                #
                # ax.annotate(str(ind), X[ind, 0], X[ind, 1])
        print(idxList)
        print(nameList)
        plt.show()
        outPut = {'Index': idxList, 'Video_Name': nameList}
        print(outPut)
        output_Archive = pd.DataFrame(outPut)
        output_Archive.to_csv('output_Archive.csv')

    return X
Example #28
    def __init__(self, dimensionality=2500, seed=None):
        rnd_state = np.random.RandomState(seed=seed)
        self.mds = MDS(n_components=dimensionality,
                       n_jobs=-1,
                       random_state=rnd_state,
                       dissimilarity="precomputed")
Example #29
def plotPCA(df, true_k, clusters, X, english=False):
    # Plot in 2d (despite the function name, this uses MDS on cosine distances)
    dist = 1 - cosine_similarity(X)

    MDS()
    # two components as we're plotting points in a two-dimensional plane
    # "precomputed" because we provide a distance matrix
    # we will also specify `random_state` so the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

    xs, ys = pos[:, 0], pos[:, 1]

    import matplotlib.cm as cm

    # set up colors per clusters using a dict
    cluster_colors = cm.rainbow(np.linspace(0, 1, true_k))

    # set up cluster names using a dict
    # cluster_names = {i: 'i' for i in range(true_k)}

    # create data frame that has the result of the MDS plus the cluster
    # numbers and titles
    df2 = pd.DataFrame(
        dict(x=xs, y=ys, label=clusters, title=df[0], title2=df[2]))

    # group by cluster
    groups = df2.groupby('label')

    pd.set_option('display.max_rows', len(df2))
    # print(df2.sort_values(by='label')[['label', 'title', 'title2']])

    filename = './labels.%s.csv' % ('en' if english else 'es')

    df2.sort_values(by='label')[['label', 'title', 'title2']].to_csv(filename)
    pd.reset_option('display.max_rows')

    # set up plot
    fig, ax = plt.subplots(figsize=(25, 25))  # set size
    ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling

    # iterate through groups to layer the plot
    # note that I use the cluster_name and cluster_color dicts with the 'name'
    # lookup to return the appropriate color/label
    for name, group in groups:
        ax.plot(group.x,
                group.y,
                marker='o',
                linestyle='',
                ms=12,
                color=cluster_colors[name],
                mec='none')
        ax.set_aspect('auto')
        ax.tick_params(
            axis='x',  # changes apply to the x-axis
            which='both',  # both major and minor ticks are affected
            bottom=False,  # ticks along the bottom edge are off
            top=False,  # ticks along the top edge are off
            labelbottom=False)
        ax.tick_params(
            axis='y',  # changes apply to the y-axis
            which='both',  # both major and minor ticks are affected
            left=False,  # ticks along the left edge are off
            right=False,  # ticks along the right edge are off
            labelleft=False)

    # ax.legend(numpoints=1)  #show legend with only 1 point

    # add label in x,y position with the label as the film title

    for i in range(len(df2)):
        ax.text(df2.iloc[i]['x'], df2.iloc[i]['y'], df2.iloc[i]['title'], size=4)

    # plt.show() # show the plot
    # plt.savefig('test.pdf', format='pdf')  # , dpi=600)
    # plt.savefig('test.eps', format='eps')  # , dpi=600)
    # plt.savefig('clusters_small_noaxes.png')  # , dpi=600)
    plt.close()

    class TopToolbar(mpld3.plugins.PluginBase):
        """Plugin for moving toolbar to top of figure"""

        JAVASCRIPT = """
        mpld3.register_plugin("toptoolbar", TopToolbar);
        TopToolbar.prototype = Object.create(mpld3.Plugin.prototype);
        TopToolbar.prototype.constructor = TopToolbar;
        function TopToolbar(fig, props){
            mpld3.Plugin.call(this, fig, props);
        };

        TopToolbar.prototype.draw = function(){
          // the toolbar svg doesn't exist
          // yet, so first draw it
          this.fig.toolbar.draw();

          // then change the y position to be
          // at the top of the figure
          this.fig.toolbar.toolbar.attr("x", 150);
          this.fig.toolbar.toolbar.attr("y", 400);

          // then remove the draw function,
          // so that it is not called again
          this.fig.toolbar.draw = function() {}
        }
        """

        def __init__(self):
            self.dict_ = {"type": "toptoolbar"}

    # create data frame that has the result of the MDS plus the cluster
    # numbers and titles
    df3 = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=df[0]))

    # group by cluster
    groups = df3.groupby('label')

    # define custom css to format the font and to remove the axis labeling
    css = """
    text.mpld3-text, div.mpld3-tooltip {
      font-family:Arial, Helvetica, sans-serif;
    }

    g.mpld3-xaxis, g.mpld3-yaxis {
    display: none; }

    svg.mpld3-figure {
    margin-left: 200px;}
    """

    # Plot
    fig, ax = plt.subplots(figsize=(25, 25))  # set plot size
    ax.margins(0.03)  # Optional, just adds 5% padding to the autoscaling

    # iterate through groups to layer the plot
    # note that I use the cluster_name and cluster_color dicts with the 'name'
    # lookup to return the appropriate color/label
    for name, group in groups:
        points = ax.plot(group.x,
                         group.y,
                         marker='o',
                         linestyle='',
                         ms=18,
                         mec='none',
                         color=cluster_colors[name])
        ax.set_aspect('auto')
        labels = [i for i in group.title]

        # set tooltip using points, labels and the already defined 'css'
        tooltip = mpld3.plugins.PointHTMLTooltip(points[0],
                                                 labels,
                                                 voffset=10,
                                                 hoffset=10,
                                                 css=css)
        # connect tooltip to fig
        mpld3.plugins.connect(fig, tooltip, TopToolbar())

        # set tick marks as blank
        ax.axes.get_xaxis().set_ticks([])
        ax.axes.get_yaxis().set_ticks([])

        # set axis as blank
        ax.axes.get_xaxis().set_visible(False)
        ax.axes.get_yaxis().set_visible(False)

    ax.legend(numpoints=1)  # show legend with only one dot

    mpld3.display()  # show the plot

    # uncomment the below to export to html
    html = mpld3.fig_to_html(fig)
    name = 'name.%s.html' % ('en' if english else 'es')
    mpld3.save_html(fig, name)
Example #30
from sklearn.manifold import MDS


def get_embedding(dm, dim):

    # Return embedding space
    return MDS(n_components=dim, metric=True,
               dissimilarity="precomputed").fit_transform(dm)