Exemplo n.º 1
0
def test_init_ndarray_precomputed():
    """Fit TSNE with an ndarray ``init`` on a precomputed distance matrix.

    Intended as a regression check that ``_fit`` emits no FutureWarning for
    this combination; the test body itself only requires ``fit`` to complete.
    """
    n_samples = 100
    start_embedding = np.zeros((n_samples, 2))
    distances = np.zeros((n_samples, n_samples))
    estimator = TSNE(init=start_embedding,
                     metric="precomputed",
                     square_distances=True)
    estimator.fit(distances)
Exemplo n.º 2
0
def pcAnalysis(X, Xtest, w=None, ncomp=2, useTSNE=False):
    """
    Scatter-plot train and test samples in a shared low-dimensional space.

    With ``useTSNE=False`` a TruncatedSVD is fitted on ``X`` and then used to
    project the stacked test+train rows; the explained-variance figures are
    printed. With ``useTSNE=True`` the stacked rows are embedded together,
    because t-SNE (assuming ``TSNE`` is scikit-learn's) has no ``transform``
    method and no ``explained_variance_ratio_`` attribute.

    Parameters
    ----------
    X : pandas.DataFrame
        Training samples.
    Xtest : pandas.DataFrame
        Test samples; ``len(Xtest.index)`` is used to split the projection.
    w : unused
        Kept for backward compatibility.
    ncomp : int
        Number of components to compute; the first two are plotted.
    useTSNE : bool
        Select t-SNE instead of TruncatedSVD.
    """
    n_test = len(Xtest.index)
    X_all = pd.concat([Xtest, X])

    if useTSNE:
        print("TSNE analysis for train/test")
        reducer = TSNE(n_components=ncomp)
        print(reducer)
        # t-SNE cannot project unseen rows, so train and test are embedded
        # in one joint run instead of fit(X) followed by transform(X_all).
        X_r = reducer.fit_transform(X_all.values)
    else:
        print("PC analysis for train/test")
        reducer = TruncatedSVD(n_components=ncomp)
        print(reducer)
        reducer.fit(X)
        X_r = reducer.transform(X_all.values)

    # Test rows come first in X_all, train rows follow.
    plt.scatter(X_r[n_test:, 0],
                X_r[n_test:, 1],
                c='r',
                label="train",
                alpha=0.5)
    plt.scatter(X_r[:n_test, 0],
                X_r[:n_test, 1],
                c='g',
                label="test",
                alpha=0.5)

    if not useTSNE:
        # Only the SVD exposes explained-variance information.
        ratios = reducer.explained_variance_ratio_
        print("Total variance: {}".format(np.sum(ratios)))
        print("Explained variance: {}".format(ratios))

    plt.legend()
    plt.show()
Exemplo n.º 3
0
def tsne_plot(gwbow, corp_dic):
    """
    Embed ``gwbow`` in 2-D with t-SNE and draw a labelled scatter plot.

    The embedding rows are joined with ``corp_dic`` (turned into a Series and
    aligned on the row index) and with ``EdinetcodeDlInfo.csv`` to look up the
    label text shown next to every point.

    Parameters
    ----------
    gwbow : array-like
        Vectors to embed, one row per item.
    corp_dic : dict
        Mapping whose values are matched against the 'EDINETコード' column;
        presumably keyed by row position -- verify against the caller.
    """
    corp_sr = pd.Series(corp_dic)
    # Reduce the high-dimensional vectors to two dimensions with t-SNE.
    tsne_model = TSNE(n_components=2, random_state=0, verbose=2)
    # Disable scientific notation so numbers always print as decimals.
    np.set_printoptions(suppress=True)
    tsne_model.fit(gwbow)

    # Only the rows in [skip, limit) are plotted.
    skip = 0
    limit = 4100
    plain_tsne = pd.DataFrame(tsne_model.embedding_[skip:limit, :2],
                              columns=["x", "y"])
    plain_tsne['corp_name'] = corp_sr

    df_edinetcode = pd.read_csv('EdinetcodeDlInfo.csv',
                                encoding='cp932',
                                header=1,
                                index_col=0)
    df_merge = pd.merge(plain_tsne,
                        df_edinetcode,
                        left_on='corp_name',
                        right_on='EDINETコード')
    df_tsne = df_merge[['x', 'y', '提出者名']].copy()
    ax = df_tsne.plot.scatter(x="x", y="y", figsize=(10, 10), s=30)

    # Label every point. Columns are read by name: positional integer access
    # on the label-indexed rows returned by iterrows() is deprecated in pandas.
    for x, y, name in zip(df_tsne['x'], df_tsne['y'], df_tsne['提出者名']):
        ax.annotate(name, xy=(x, y), size=15)
Exemplo n.º 4
0
    def project_tsne(self, projection_attrs):
        """
        Project ``self._A_matrix`` to two dimensions with Barnes-Hut t-SNE.

        Parameters
        ----------
        projection_attrs : dict or None
            Optional settings. Recognised keys are ``'pca'`` (truthy to
            pre-reduce with PCA), ``'perplexity'`` and ``'theta'`` (passed as
            the t-SNE ``angle``). A missing or falsy entry falls back to the
            standard configuration (no PCA, perplexity 30.0, theta 0.5);
            previously such an entry left the variable unbound and crashed.

        Returns
        -------
        The first two columns of the fitted ``embedding_``.
        """
        attrs = projection_attrs or {}
        use_pca = attrs.get('pca') or False
        perplexity = attrs.get('perplexity') or 30.0
        theta = attrs.get('theta') or 0.5

        data = self._A_matrix
        if use_pca and data.shape[0] > 50:
            # PCA is fitted on the transposed matrix and the transposed
            # component matrix is what gets embedded.
            reducer = PCA(n_components=50)
            reducer.fit(data.T)
            data = reducer.components_[0:50, :].T

        tsne = TSNE(n_components=2,
                    perplexity=perplexity,
                    method='barnes_hut',
                    angle=theta,
                    learning_rate=1000)
        tsne.fit(data)
        tsne_proj = tsne.embedding_[:, 0:2]

        return (tsne_proj)
Exemplo n.º 5
0
class TSNE( BaseDR ):
    '''
    t-SNE dimensionality reduction, delegating to scikit-learn's TSNE.

    Only fit() and fit_transform() do real work; transform(),
    inverse_transform() and project() return None.
    '''

    def __init__( self, n_components=2, perplexity=30.0 ):
        # Name the class explicitly: super( self.__class__, self ) recurses
        # forever as soon as this class is subclassed.
        super( TSNE, self ).__init__( n_components, Alg.TSNE, True )

        # Imported under an alias so that the local name does not shadow this
        # class inside __init__ (which would break the super() call above).
        from sklearn.manifold import TSNE as SklearnTSNE
        self.tsne = SklearnTSNE( n_components=n_components, perplexity=perplexity )

    def fit( self, X ):
        '''Fit the wrapped estimator on X and return that estimator.'''
        self.tsne.fit( X )
        return self.tsne

    def transform( self, X ):
        '''Not supported; always returns None.'''
        return None

    def fit_transform( self, X ):
        '''Fit the wrapped estimator on X and return the embedding.'''
        return self.tsne.fit_transform( X )

    def inverse_transform( self, A ):
        '''Not supported; always returns None.'''
        return None

    def project( self, X ):
        '''Not supported; always returns None.'''
        return None
def performDimensionalityReduction(context_vector, n_component, perplexity):
    '''
        Applies TSNE on the feature vectors of all instances of each word type
        and keeps one fitted model per word type.

        context_vector is read as
        {word_type: {data_type: {instance: {'Sense': [...], 'Feature_Vector': ...}}}}.

        Returns (feature_vector_data, word_type_model) where
        feature_vector_data is
        {word_type: {(instance, sense, data_type): embedded vector as list}}
        and word_type_model is {word_type: fitted TSNE}.

        The embedded vectors are returned for every word type. Previously the
        raw, un-embedded vectors of the last word type were returned, and an
        empty context_vector raised a NameError.
    '''
    feature_vector_data = defaultdict(dict)
    word_type_model     = {}

    # .items() instead of .iteritems() so the loop runs on Python 2 and 3.
    for word_type, word_type_data in context_vector.items():
        feature_vector_word_type = OrderedDict()

        # Collect all feature vectors for the given word type
        for data_type, instance_details in word_type_data.items():
            for instance, context_details in instance_details.items():

                # Training data has the sense ids while test data has ['<UNKNOWN>']
                for sense in context_details.get('Sense'):
                    feature_vector_word_type[(instance, sense, data_type)] = context_details["Feature_Vector"]

        # Applying TSNE on all the feature vectors
        feature_vector_array = np.array(list(feature_vector_word_type.values()))
        model = TSNE(n_components=n_component, random_state=0, perplexity=perplexity, metric="cosine")
        model.fit(feature_vector_array)

        # Storing the fitted model alongside the embedded vectors
        word_type_model[word_type] = model

        # Embedding rows are in the insertion order of the OrderedDict keys
        for key, embedded in zip(feature_vector_word_type, model.embedding_):
            feature_vector_data[word_type][key] = list(embedded)

    return feature_vector_data, word_type_model
Exemplo n.º 7
0
def perform_tSNE_analys(n_samples=10e10,
                        n_variables=10000,
                        data_type='psi',
                        filter_tissues=True,
                        n_dimensions=2,
                        perplexity=30,
                        learning_rate=200,
                        n_iter=1000):
    """Run t-SNE on the PSI/TPM values and plot the embedding by tissue.

    The data are loaded with ``read_psi_and_recover_tissue`` and passed
    unsplit through ``generate_sets``. The embedding columns are named
    ``'1D'``, ``'2D'``, ...; the label of each sample is the column holding
    the row maximum of ``y_train`` (presumably one-hot tissue labels -- verify
    against ``generate_sets``). Nothing is returned; the result is handed to
    ``plot_by_group``.

    ``n_variables`` is now forwarded to the loader; it used to be ignored in
    favour of a hard-coded 10000, which is still the default.
    """
    data, labels = read_psi_and_recover_tissue(n_samples=n_samples,
                                               n_variables=n_variables,
                                               data_type=data_type,
                                               filter_tissues=filter_tissues)
    X_train, y_train = generate_sets(data, labels, do_not_split=True)
    tsne = TSNE(n_components=n_dimensions,
                perplexity=perplexity,
                early_exaggeration=12.0,
                learning_rate=learning_rate,
                n_iter=n_iter,
                n_iter_without_progress=300,
                min_grad_norm=1e-07,
                metric='euclidean',
                init='random',
                verbose=1,
                random_state=None)
    tsne.fit(X_train.values)
    results = pandas.DataFrame(
        tsne.embedding_,
        columns=[str(x) + 'D' for x in range(1, n_dimensions + 1)],
        index=y_train.index)
    # idxmax(1) yields an unnamed Series, so its column is called 0 after
    # the concat and is renamed to 'Tissue' below.
    results = pandas.concat([results, y_train.idxmax(1)], axis=1)
    results = results.rename(columns={0: 'Tissue'})
    plot_by_group(results.groupby('Tissue'), '1D', '2D')
Exemplo n.º 8
0
def tSNE_method(axes, user_xaxis, user_yaxis, clusters):
    """
    Embed ``iris_df.data`` with t-SNE, cluster the embedding with K-means and
    draw the clusters on ``axes[1][0]``.

    ``user_xaxis`` / ``user_yaxis`` select the embedding columns to plot and
    ``clusters`` is the number of K-means clusters. Returns a message string
    containing the predicted labels.
    """
    # Reduce the data with a default-configured t-SNE.
    embedding = TSNE().fit_transform(iris_df.data)

    # Cluster the reduced data and predict a label for every sample.
    kmeans = KMeans(n_clusters=clusters)
    kmeans.fit(embedding)
    all_predictions = kmeans.predict(embedding)

    # Plot the two requested embedding columns, coloured by cluster.
    panel = axes[1][0]
    panel.scatter(embedding[:, user_xaxis],
                  embedding[:, user_yaxis],
                  c=all_predictions)
    panel.set_xlabel('Метод К-середніх зі зменш. розм.')

    report = 'Передбачені міткі (Метод К-cередніх зі зменш. розм.):\n {}'
    return report.format(all_predictions)
Exemplo n.º 9
0
def k_means(data_set, output_file, png_file, t_labels, score_file, set_name):
    """
    Cluster ``data_set`` into four groups with K-means and report the result.

    The labelled data are printed and written to ``output_file`` (Excel), the
    Fowlkes-Mallows and adjusted Rand scores against ``t_labels`` are appended
    to ``score_file``, and a t-SNE scatter plot with one marker style per
    cluster is saved to ``png_file``.
    """
    label_column = u'聚类类别'

    clusterer = cluster.KMeans(n_clusters=4,
                               max_iter=100,
                               n_jobs=4,
                               init="k-means++")
    clusterer.fit(data_set)
    p_labels = list(clusterer.labels_)

    # Append the cluster label as an extra column and export the table.
    label_series = pd.Series(clusterer.labels_, index=data_set.index)
    r = pd.concat([data_set, label_series], axis=1)
    r.columns = list(data_set.columns) + [label_column]
    print(r)
    r.to_excel(output_file)

    with open(score_file, "a") as sf:
        sf.write("By k-means, the f-m_score of " + set_name + " is: " +
                 str(metrics.fowlkes_mallows_score(t_labels, p_labels)) + "\n")
        sf.write("By k-means, the rand_score of " + set_name + " is: " +
                 str(metrics.adjusted_rand_score(t_labels, p_labels)) + "\n")

    # 2-D embedding used only for the picture.
    embedder = TSNE()
    embedder.fit(data_set)
    coords = pd.DataFrame(embedder.embedding_, index=data_set.index)

    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    # One matplotlib format string per cluster label.
    for label, marker in ((0, 'r.'), (1, 'go'), (2, 'b*'), (3, 'o')):
        members = coords[r[label_column] == label]
        plt.plot(members[0], members[1], marker)
    plt.savefig(png_file)
    plt.clf()
def S_cached_kl (perp, X=np.array([])):
	"""Fit t-SNE on X with perplexity ``perp`` and return a penalised score.

	The score is ``2 * KL divergence + log(n) * perp / n`` where ``n`` is the
	number of rows of X. The wall-clock time of the fit is printed.
	"""
	model = TSNE(perplexity=perp, random_state=42)
	started = time.perf_counter()
	model.fit(X)
	finished = time.perf_counter()
	print('Last t-SNE took {} seconds'.format(finished-started))
	n_samples = X.shape[0]
	penalty = math.log(n_samples)*perp/n_samples
	return 2*model.kl_divergence_ + penalty
Exemplo n.º 11
0
def test_init_ndarray_precomputed():
    """Fit TSNE with an ndarray ``init`` on a precomputed distance matrix.

    Intended as a regression check that ``_fit`` emits no FutureWarning for
    this combination; the test body itself only requires ``fit`` to complete.
    """
    n_samples = 100
    settings = {
        "init": np.zeros((n_samples, 2)),
        "metric": "precomputed",
        "learning_rate": 50.0,
    }
    distances = np.zeros((n_samples, n_samples))
    TSNE(**settings).fit(distances)
Exemplo n.º 12
0
 def Tsne(self,):
     """
     Reduce the data set in ``self.data_set_name`` with t-SNE and save it.

     The CSV is read without header or index and transposed before being
     embedded into ``self.components`` dimensions. The result is written to
     ``self.Dred_data`` without header or index. Always returns 0.
     """
     data_set = pd.read_csv(self.data_set_name, header=None, index_col=None)
     data_set = data_set.T
     tsne = TSNE(n_components=self.components)
     # fit_transform() fits the model itself; the separate fit() that used to
     # precede it only ran the whole optimisation a second time.
     embedded = tsne.fit_transform(data_set)
     print("Generate Dre_data.csv." )
     pd.DataFrame(embedded).to_csv(self.Dred_data, header=False, index=False)
     return 0
Exemplo n.º 13
0
def drawing(word_vector, word_dict):
    """
    Plot a 2-D t-SNE embedding of the first 1000 rows of ``word_vector``.

    Every point is annotated with ``word_dict[idx]``, where ``idx`` is the row
    position, and the figure is shown.
    """
    tsne = TSNE(n_components=2)
    tsne.fit(word_vector[0:1000, :])
    word_embedding = tsne.embedding_
    # print() call form works on both Python 2 and Python 3.
    print(word_embedding.shape)
    plt.figure()
    for idx, (x, y) in enumerate(word_embedding):
        plt.plot(x, y, 'o-', color='#ef4136')
        plt.text(x, y, word_dict[idx], color='black', ha='left')
    plt.show()
Exemplo n.º 14
0
class Tsne:
    """
    This transformer transforms all vectors in an [EmbeddingSet][whatlies.embeddingset.EmbeddingSet]
    by means of tsne. This implementation uses [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE).

    Important:
        TSNE does not allow you to train a transformation and re-use it. It must retrain every time it sees data.
        You may also notice that it is relatively slow. This unfortunately is a fact of life.

    Arguments:
        n_components: the number of components to create/add
        kwargs: keyword arguments passed to the Tsne implementation, includes things like `perplexity` [link](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE)

    Usage:

    ```python
    from whatlies.language import SpacyLanguage
    from whatlies.transformers import Tsne

    words = ["prince", "princess", "nurse", "doctor", "banker", "man", "woman",
             "cousin", "niece", "king", "queen", "dude", "guy", "gal", "fire",
             "dog", "cat", "mouse", "red", "blue", "green", "yellow", "water",
             "person", "family", "brother", "sister"]

    lang = SpacyLanguage("en_core_web_md")
    emb = lang[words]

    emb.transform(Tsne(3)).plot_interactive_matrix('tsne_0', 'tsne_1', 'tsne_2')
    ```
    """
    def __init__(self, n_components=2, **kwargs):
        self.is_fitted = False
        self.n_components = n_components
        self.kwargs = kwargs
        self.tfm = TSNE(n_components=n_components, **kwargs)

    def __call__(self, embset):
        # transform() always refits the estimator, so calling fit() first
        # would only run the slow optimisation twice for the same result.
        return self.transform(embset)

    def fit(self, embset):
        """Fit the underlying estimator on the vectors of `embset`."""
        names, X = embset_to_X(embset=embset)
        self.tfm.fit(X)
        self.is_fitted = True
        return self

    def transform(self, embset):
        """
        Refit on `embset` and return a new EmbeddingSet with the embedded
        vectors plus one unit vector per t-SNE axis named `tsne_<i>`.
        """
        names, X = embset_to_X(embset=embset)
        new_vecs = self.tfm.fit_transform(X)
        self.is_fitted = True
        names_out = names + [f"tsne_{i}" for i in range(self.n_components)]
        vectors_out = np.concatenate([new_vecs, np.eye(self.n_components)])
        new_dict = new_embedding_dict(names_out, vectors_out, embset)
        return EmbeddingSet(new_dict,
                            name=f"{embset.name}.tsne_{self.n_components}()")
Exemplo n.º 15
0
def viz_js_stations(df, manifold='MDS'):
    """
    Embed a precomputed station distance matrix in 2-D and show it with Bokeh.

    Parameters
    ----------
    df : pandas.DataFrame
        Distance matrix; ``df.values`` is embedded and ``df.index`` supplies
        the station keys (each must be convertible with ``int``).
    manifold : {'MDS', 'TSNE'}
        Embedding method. The axis labels carry this name; they used to say
        'MDS' even when t-SNE was selected.

    Returns
    -------
    The Bokeh figure, after it has been passed to ``show``.

    Raises
    ------
    ValueError
        If ``manifold`` is neither 'TSNE' nor 'MDS'.
    """
    if manifold == 'TSNE':
        model = TSNE(metric='precomputed')
    elif manifold == 'MDS':
        model = MDS(dissimilarity='precomputed')
    else:
        raise ValueError('Unknown manifold method: {}'.format(manifold))
    model.fit(df.values)

    p = figure()
    p = _axis_adjust(p)

    # Two groups of highlighted stations; sets give O(1) membership tests.
    special_stations_1 = {
        789, 625, 248, 658, 404, 719, 785, 252, 111, 191, 307
    }
    special_stations_2 = {
        433, 393, 392, 361, 331, 214, 215, 193, 154, 140, 66, 41, 12
    }
    all_special = special_stations_1 | special_stations_2
    palette = brewer['PRGn'][11]

    def marker_size(station):
        # Highlighted stations are drawn twice as large.
        return 20 if int(station) in all_special else 10

    def marker_colour(station):
        key = int(station)
        if key in special_stations_1:
            return palette[10]
        if key in special_stations_2:
            return palette[0]
        return palette[5]

    sizes = [marker_size(station) for station in df.index]
    colours = [marker_colour(station) for station in df.index]

    source = ColumnDataSource({
        'x': model.embedding_[:, 0],
        'y': model.embedding_[:, 1],
        'station_key': df.index,
        'sizes': sizes,
        'colours': colours
    })

    p.circle(x='x',
             y='y',
             source=source,
             fill_color='colours',
             line_color=brewer['PRGn'][7][0],
             size='sizes',
             fill_alpha=0.6)
    p.add_tools(HoverTool(tooltips=[('station', '@station_key')]))
    p.xaxis.axis_label = '{} Embedded Coordinate 1'.format(manifold)
    p.yaxis.axis_label = '{} Embedded Coordinate 2'.format(manifold)
    p.yaxis.major_tick_line_color = None
    p.xaxis.major_tick_line_color = None
    show(p)

    return p
Exemplo n.º 16
0
def tsne(G, vectors):
    """
    Compute 2-D t-SNE positions for the nodes of ``G``.

    Parameters
    ----------
    G : graph object exposing ``nodes``
    vectors : dict
        Node vectors. The i-th value is assigned to the i-th node of
        ``list(G.nodes)``, so both are assumed to share the same order --
        verify against the caller.

    Returns
    -------
    dict mapping each node to its 2-D coordinate row.
    """
    vector_list = list(vectors.values())
    nodes = list(G.nodes)
    # fit_transform() fits the model itself; the extra fit() that used to
    # precede it only doubled the running time.
    newX = TSNE(n_components=2).fit_transform(vector_list)
    pos = {}
    for i, coords in enumerate(newX):
        pos[nodes[i]] = coords
    return pos
Exemplo n.º 17
0
def tSNE_tackle(train_X, n_components):
    """
    Embed ``train_X`` into ``n_components`` dimensions with t-SNE.

    Returns the embedded array. The model is fitted exactly once:
    ``fit_transform`` already fits, so the separate ``fit`` call that used to
    precede it only repeated the optimisation.
    """
    tsne = TSNE(n_components=n_components, verbose=1)
    X_new = tsne.fit_transform(train_X)
    return X_new
Exemplo n.º 18
0
def plot_data(args, seq, original_seq=None):
    """
    Plot a sequence of points, optionally after PCA and/or t-SNE reduction.

    Parameters
    ----------
    args : namespace
        Reads ``delta`` (plot a histogram of consecutive-step distances and
        return), ``pca`` (number of PCA components, falsy to skip), ``tsne``
        (truthy to embed in 2-D), ``strip`` (pair ``(n, m)``: draw segments of
        ``n`` points every ``m`` points) and ``save`` (output path; falsy to
        show the figure instead).
    seq : ndarray
        Sequence to plot, one point per row.
    original_seq : ndarray, optional
        Reference data. When given, PCA is fitted on it and applied to
        ``seq``; for t-SNE both are embedded jointly.
    """
    if args.delta:
        plt.figure()
        dist = np.sum((seq[1:, ...] - seq[:-1, ...])**2, axis=1)**0.5
        plt.hist(dist)
        if args.save:
            plt.savefig(args.save, dpi=120)
        else:
            plt.show()
        return

    if args.pca:
        pca = PCA(n_components=args.pca)
        if original_seq is None:
            seq = pca.fit_transform(seq)
        else:
            original_seq = pca.fit_transform(original_seq)
            seq = pca.transform(seq)

    if args.tsne:
        tsne = TSNE(n_components=2, perplexity=30.0, n_iter=2000, verbose=2)
        if original_seq is None:
            seq = tsne.fit_transform(seq)
        else:
            # t-SNE has no transform() for unseen points, so the reference
            # data and the sequence are embedded together and only the rows
            # belonging to the sequence are kept.
            joint = tsne.fit_transform(np.vstack([original_seq, seq]))
            seq = joint[len(original_seq):]

    if seq.shape[1] == 2:
        plt.figure()
        x, y = zip(*seq[:, :])
        color_list = cm.get_cmap(name="viridis")
        # Colour encodes the position along the sequence.
        last = seq.shape[0] - 1
        if args.strip:
            n, m = tuple(args.strip)
            for i in range(0, last, m):
                plt.plot(x[i:(i + n)],
                         y[i:(i + n)],
                         '-',
                         color=color_list(i / last))
        else:
            for i in range(last):
                plt.plot(x[i:(i + 2)],
                         y[i:(i + 2)],
                         '.',
                         color=color_list(i / last))
        plt.axis('equal')
        if args.save:
            plt.savefig(args.save, dpi=120)
        else:
            plt.show()
    else:
        print("Cannot plot sequence: data is of size {}".format(seq.shape))
Exemplo n.º 19
0
def viz_js_stations_two(df, manifold='TSNE'):
    """
    Embed a precomputed two-city station distance matrix in 2-D with Bokeh.

    Parameters
    ----------
    df : pandas.DataFrame
        Distance matrix with a two-level index: level 0 is used as station
        key, level 1 as city ('London' or 'Taipei' are colour-mapped).
    manifold : {'TSNE', 'MDS'}
        Embedding method. The axis labels carry this name; they used to say
        'MDS' even though t-SNE is the default.

    Returns
    -------
    The Bokeh figure (not shown by this function).

    Raises
    ------
    ValueError
        If ``manifold`` is neither 'TSNE' nor 'MDS'.
    """
    if manifold == 'TSNE':
        model = TSNE(metric='precomputed')
    elif manifold == 'MDS':
        model = MDS(dissimilarity='precomputed')
    else:
        raise ValueError('Unknown manifold method: {}'.format(manifold))
    model.fit(df.values)

    p = figure()
    p = _axis_adjust(p)
    source = ColumnDataSource({
        'x': model.embedding_[:, 0],
        'y': model.embedding_[:, 1],
        'station_key': df.index.get_level_values(0),
        'city': df.index.get_level_values(1)
    })
    city_palette = [brewer['PRGn'][7][0], brewer['PRGn'][7][6]]
    city_names = ['London', 'Taipei']
    p.circle(
        x='x',
        y='y',
        source=source,
        fill_color=factor_cmap('city', city_palette, city_names),
        line_color=factor_cmap('city', city_palette, city_names),
        size=10.0,
        fill_alpha=0.6)

    # The label set is built but deliberately not attached to the figure;
    # enable it with p.add_layout(labels).
    labels = LabelSet(x='x',
                      y='y',
                      text='station_key',
                      level='glyph',
                      x_offset=5,
                      y_offset=5,
                      source=source,
                      render_mode='canvas',
                      text_font_size='6px')
    p.add_tools(HoverTool(tooltips=[('station', '@station_key')]))
    p.xaxis.axis_label = '{} Embedded Coordinate 1'.format(manifold)
    p.yaxis.axis_label = '{} Embedded Coordinate 2'.format(manifold)
    p.yaxis.major_tick_line_color = None
    p.xaxis.major_tick_line_color = None

    return p
Exemplo n.º 20
0
    def _fit_embedding(self,
                       method='tsne',
                       n_components=2,
                       random_state=1,
                       verbose=2,
                       n_neighbors=15,
                       min_dist=0.1,
                       **kwargs):
        """
        Embed ``self.dist_matrix`` and store the result on the instance.

        parameters
        -----------------
        method: {'tsne', 'umap', 'mds'}, algorithm used for the embedding
        n_neighbors, min_dist: only used by 'umap'
        kwargs: extra parameters for the corresponding algorithm. 'metric'
            (default 'precomputed') is forwarded to tsne/umap and dropped for
            mds; 'dissimilarity' (default 'precomputed') is used by mds.

        Sets ``self.df_embedding`` (columns 'x', 'y' joined with
        ``self.bitsinfo`` indexed by 'IDs', plus a 'Channels' copy of
        'Subtypes') and ``self.embedded`` (the fitted estimator).
        NOTE(review): the frame is built with exactly two columns, so
        n_components other than 2 will fail at that step.

        Raises ValueError for an unknown ``method`` (previously an
        UnboundLocalError further down).
        """
        dist_matrix = self.dist_matrix
        metric = kwargs.pop('metric', 'precomputed')

        if method == 'tsne':
            embedded = TSNE(n_components=n_components,
                            random_state=random_state,
                            metric=metric,
                            verbose=verbose,
                            **kwargs)
        elif method == 'umap':
            embedded = UMAP(n_components=n_components,
                            n_neighbors=n_neighbors,
                            min_dist=min_dist,
                            verbose=verbose,
                            random_state=random_state,
                            metric=metric,
                            **kwargs)
        elif method == 'mds':
            # 'metric' was already removed from kwargs above; for MDS the
            # keyword means metric-vs-non-metric scaling and is fixed to True.
            dissimilarity = kwargs.pop('dissimilarity', 'precomputed')
            embedded = MDS(metric=True,
                           n_components=n_components,
                           verbose=verbose,
                           dissimilarity=dissimilarity,
                           random_state=random_state,
                           **kwargs)
        else:
            raise ValueError(
                "Unknown embedding method: {!r}; expected 'tsne', 'umap' "
                "or 'mds'".format(method))

        embedded = embedded.fit(dist_matrix)

        df = pd.DataFrame(embedded.embedding_, index=self.flist, columns=['x', 'y'])
        typemap = self.bitsinfo.set_index('IDs')
        df = df.join(typemap)
        df['Channels'] = df['Subtypes']
        self.df_embedding = df
        self.embedded = embedded
Exemplo n.º 21
0
def labtest_TSNE(PID):
    """
    Return the 2-D t-SNE embedding of the scaled lab tests of the given
    patients.

    ``PID`` is an iterable of keys into the module-level ``patients`` mapping;
    each patient's ``'tests'`` entry is one input row.
    """
    scaled = pp.scale([patients[pid]['tests'] for pid in PID])
    settings = dict(n_components=2,
                    perplexity=30.0,
                    learning_rate=1000.0,
                    n_iter=1000,
                    n_iter_without_progress=30,
                    min_grad_norm=1e-07,
                    angle=0.5)
    model = TSNE(**settings)
    model.fit(scaled)
    return model.embedding_
Exemplo n.º 22
0
def test_reduction_to_one_component():
    """t-SNE should allow reduction to one component (issue #4154)."""
    rng = check_random_state(0)
    estimator = TSNE(n_components=1)
    samples = rng.randn(5, 2)
    estimator.fit(samples)
    # Every coordinate of the 1-D embedding must be a finite number.
    assert np.isfinite(estimator.embedding_).all()
Exemplo n.º 23
0
    def _fit_embedding(self,
                       dist_matrix,
                       method='umap',
                       n_components=2,
                       random_state=32,
                       verbose=2,
                       n_neighbors=15,
                       min_dist=0.1,
                       **kwargs):
        """
        Fit an embedding estimator on a distance matrix and return it.

        parameters
        -----------------
        dist_matrix: distance matrix to fit
        method: {'tsne', 'umap', 'mds'}, algorithm used for the embedding
        n_neighbors, min_dist: only used by 'umap'
        kwargs: extra parameters for the corresponding algorithm. 'metric'
            (default 'precomputed') is forwarded to tsne/umap and dropped for
            mds; 'dissimilarity' (default 'precomputed') is used by mds.

        returns
        -----------------
        the fitted estimator

        Raises ValueError for an unknown ``method`` (previously an
        UnboundLocalError further down).
        """
        metric = kwargs.pop('metric', 'precomputed')

        if method == 'tsne':
            embedded = TSNE(n_components=n_components,
                            random_state=random_state,
                            metric=metric,
                            verbose=verbose,
                            **kwargs)
        elif method == 'umap':
            embedded = UMAP(n_components=n_components,
                            n_neighbors=n_neighbors,
                            min_dist=min_dist,
                            verbose=verbose,
                            random_state=random_state,
                            metric=metric,
                            **kwargs)
        elif method == 'mds':
            # 'metric' was already removed from kwargs above; for MDS the
            # keyword means metric-vs-non-metric scaling and is fixed to True.
            dissimilarity = kwargs.pop('dissimilarity', 'precomputed')
            embedded = MDS(metric=True,
                           n_components=n_components,
                           verbose=verbose,
                           dissimilarity=dissimilarity,
                           random_state=random_state,
                           **kwargs)
        else:
            raise ValueError(
                "Unknown embedding method: {!r}; expected 'tsne', 'umap' "
                "or 'mds'".format(method))

        return embedded.fit(dist_matrix)
Exemplo n.º 24
0
    def embed(self, M):
        """Compute a t-SNE embedding of a precomputed distance matrix.

        Parameters
        ----------
        M : :obj:`ndarray`
            Distance matrix to embed.

        Returns
        -------
        :obj:`ndarray`
            The embedding with ``self.num_components`` columns.

        """
        model = TSNE(metric="precomputed", n_components=self.num_components)
        return model.fit(M).embedding_
Exemplo n.º 25
0
def train_tsne(training_size=2000,
               metric='cosine',
               n_components=3,
               perplexity=100,
               angle=.12):
    """
    Fit t-SNE on LSA topic vectors of a training subset of the tweets.

    Tweets are loaded from ``tweets.csv.gz`` under ``BIGDATA_PATH``, rows with
    ``isbot < 0`` are dropped, each text is turned into a TF-IDF weighted LSA
    vector with the pickled ``LsiModel``, and rows with missing topic values
    are removed. t-SNE is fitted on the precomputed pairwise distances
    (``positive_distances`` with ``metric``) of the training rows.

    Parameters
    ----------
    training_size : int or float
        Number of training rows, or the training fraction when below 1.
    metric : str
        Distance metric handed to ``positive_distances``.
    n_components, perplexity, angle
        Forwarded to ``TSNE``.

    Returns
    -------
    (tsne, X, Xtest, y, ytest): the fitted estimator and the train/test
    feature frames and ``isbot`` targets.
    """
    # ``pd.np`` was deprecated in pandas 1.0 and removed in 2.0, so numpy is
    # imported directly instead of ``np = pd.np``.
    import numpy as np

    tweets = read_csv(os.path.join(BIGDATA_PATH, 'tweets.csv.gz'))
    tweets = tweets[tweets.isbot >= 0]
    gc.collect()  # reclaim RAM released above

    # Coarse integer label derived from the isbot score; kept for parity with
    # the original code although it is not part of the return value.
    labels = tweets.isbot.apply(lambda x: int(x * 2))

    lsa = LsiModel.load(
        os.path.join(BIGDATA_PATH, 'lsa_tweets_5589798_2003588x200.pkl'))
    tfidf = TfidfModel(id2word=lsa.id2word, dictionary=lsa.id2word)
    bows = np.array([lsa.id2word.doc2bow(txt.split()) for txt in tweets.text])

    X = pd.DataFrame(
        [pd.Series(dict(v)) for v in tqdm(lsa[tfidf[bows]], total=len(bows))],
        index=tweets.index)

    # Keep only rows for which every topic value is present.
    mask = ~X.isnull().any(axis=1)
    mask.index = tweets.index

    X = X[mask]
    y = tweets.isbot[mask]
    labels = labels[mask]

    # train_test_split wants the held-out fraction.
    test_size = 1.0 - training_size if training_size < 1 else float(
        len(X) - training_size) / len(X)
    Xindex, Xindex_test, yindex, yindex_test = train_test_split(
        X.index.values, y.index.values, test_size=test_size)
    X, Xtest, y, ytest = X.loc[Xindex], X.loc[Xindex_test], y.loc[
        yindex], y.loc[yindex_test]

    labels = labels.loc[yindex]

    tsne = TSNE(metric='precomputed',
                n_components=n_components,
                angle=angle,
                perplexity=perplexity)
    tsne = tsne.fit(positive_distances(X.values, metric=metric))

    return tsne, X, Xtest, y, ytest
Exemplo n.º 26
0
def dimension_reduction_TSNE(arr0, n_components=2):
    """
    Embed ``arr0`` with t-SNE (fixed seed 0).

    Returns
    -------
    (label, kl_divergence): the embedding as a nested list and the final
    Kullback-Leibler divergence of the fit.

    As a side effect numpy printing is switched to non-scientific notation.
    """
    matrix = np.array(arr0)
    t_sne = TSNE(n_components=n_components, random_state=0)
    np.set_printoptions(suppress=True)
    # One fit is enough: the embedding of that fit is read from embedding_
    # instead of re-running the identical seeded optimisation through
    # fit_transform().
    result = t_sne.fit(matrix)
    kl_divergence = result.kl_divergence_
    label = result.embedding_.tolist()
    return label, kl_divergence
Exemplo n.º 27
0
class TSNERepresentation(Representation):
    """Representation that min-max scales its input and embeds it with TSNE."""

    @staticmethod
    def default_config():
        """Return the base configuration extended with the TSNE parameters."""
        default_config = Representation.default_config()

        # parameters
        default_config.parameters = Dict()
        default_config.parameters.perplexity = 30.0
        default_config.parameters.init = "random"
        default_config.parameters.random_state = None

        return default_config

    def __init__(self, n_features=28 * 28, n_latents=10, config=None, **kwargs):
        # A fresh dict per instance: the former ``config={}`` default was one
        # object shared by every call.
        config = {} if config is None else config
        Representation.__init__(self, config=config, **kwargs)

        # input size (flatten)
        self.n_features = n_features
        # latent size
        self.n_latents = n_latents
        # feature range
        self.feature_range = (0.0, 1.0)

        self.algorithm = TSNE(n_components=self.n_latents)
        self.update_algorithm_parameters()

    def _normalize(self, X):
        """Scale X with the stored (min, max) feature range."""
        low, high = self.feature_range
        span = np.asarray(high - low, dtype=float)
        # Constant features have max == min; dividing by 1 instead of 0 maps
        # them to 0 rather than producing NaN.
        span = np.where(span == 0, 1.0, span)
        return (X - low) / span

    def fit(self, X_train, update_range=True):
        '''
        Fit the embedding on the scaled training data.

        X_train: array-like (n_samples, n_features)
        update_range: when True, recompute the per-feature (min, max) used
            for scaling from X_train
        '''
        X_train = np.nan_to_num(X_train)
        if update_range:
            self.feature_range = (X_train.min(axis=0), X_train.max(axis=0))  # save (min, max) for normalization
        X_train = self._normalize(X_train)
        self.algorithm.fit(X_train)

    def calc_embedding(self, x):
        """Scale x with the stored feature range and embed it.

        NOTE(review): scikit-learn's TSNE has no ``transform`` method, so this
        call fails unless ``TSNE`` here is an implementation that provides
        one -- confirm which TSNE is imported.
        """
        x = self._normalize(x)
        x = self.algorithm.transform(x)
        return x

    def update_algorithm_parameters(self):
        """Push ``self.config.parameters`` (plus verbose=False) to the estimator."""
        self.algorithm.set_params(**self.config.parameters, verbose=False)
Exemplo n.º 28
0
def TSNE(X_train, y_train=None, X_test=None, n=100):
    """
    Embed the training data (and optionally test data) with t-SNE.

    Parameters
    ----------
    X_train : array-like, shape (n_samples, n_features)
    y_train : ignored by t-SNE; accepted for interface compatibility
    X_test : array-like, optional
    n : int
        Number of embedding dimensions.

    Returns
    -------
    The embedded training data, or the tuple ``(train, test)`` when
    ``X_test`` is given.

    t-SNE cannot project unseen samples, so train and test rows are embedded
    in one joint run and split afterwards. The previous body referenced the
    undefined names ``train`` and ``pca`` and a non-existent ``transform``.
    """
    import numpy as np
    from sklearn.manifold import TSNE as SklearnTSNE

    # Barnes-Hut only supports fewer than 4 output dimensions; larger
    # embeddings need the exact method.
    method = 'barnes_hut' if n < 4 else 'exact'
    mod = SklearnTSNE(n_components=n, method=method)

    if X_test is None:
        return mod.fit_transform(X_train, y_train)

    n_train = len(X_train)
    embedded = mod.fit_transform(np.vstack([X_train, X_test]))
    return embedded[:n_train], embedded[n_train:]
def input_stats(df=None):
    '''Interactively cluster a dataframe and return per-cluster means.

    The numeric columns are standardised and embedded in 2-D with t-SNE.
    K-means is then run on the embedding for every k in a range typed in by
    the user; inertia and silhouette score are plotted so the user can pick
    the final k, which is read from stdin as well.

    Returns the transposed table of column means per k-means label.
    '''
    df_num = df.select_dtypes('number')
    ds = StandardScaler().fit_transform(df_num)
    data_scaled = pd.DataFrame(ds,columns=df_num.columns)
    print('Numeric shape of dataframe is: ',df_num.shape)

    tsne = TSNE()
    tsne.fit(data_scaled)
    te = tsne.embedding_
    tsne_df = pd.DataFrame(te,columns=['e1','e2'])

    s,e = int(input('k range start:')), int(input('k range end:'))

    krange = range(s,e+1)
    inertia =[]
    silo = []
    for k in krange:
        kmodel = KMeans(k)
        k_labs = kmodel.fit_predict(tsne_df)
        inertia.append(kmodel.inertia_)
        silo.append(silhouette_score(tsne_df,k_labs))

    print('Be advice! You will have to choose k from below two graphs for next process!')

    # x/y are passed by keyword: recent seaborn releases no longer accept
    # them positionally.
    sns.lineplot(x=list(krange), y=inertia)
    plt.title('k value and inertia')
    plt.show()

    sns.lineplot(x=list(krange), y=silo)
    plt.title('k value and silhouette score')
    plt.show()

    dfcopy = df.copy()
    k = int(input('input optimal k:'))
    km = KMeans(k)
    k_labs = km.fit_predict(tsne_df)
    dfcopy['kmeans_labels'] = k_labs

    # numeric_only=True keeps the historical behaviour of skipping
    # non-numeric columns; pandas 2 raises on them by default.
    return (dfcopy.groupby('kmeans_labels').mean(numeric_only=True).T)
Exemplo n.º 30
0
def compute_cluster_color(nodes, vectors, k):
    """
    Cluster the node vectors with k-means and map every node to a colour.

    Parameters
    ----------
    nodes : iterable of node ids (iterated more than once)
    vectors : dict mapping node id to its vector
    k : number of clusters passed to ``kmeans``

    Returns
    -------
    (clusters, color_list): the result of ``kmeans`` and, for every node in
    ``nodes``, ``COLOR_MAP[clusters[node]]``.
    """
    vector_list = list(vectors.values())
    # fit_transform() fits the model itself; the extra fit() that used to
    # precede it only doubled the running time.
    newX = TSNE(n_components=3).fit_transform(vector_list)
    temp_nodes = list(nodes)
    # 3-D embedding per node. The active clustering below works on the raw
    # vectors; this mapping is the input for alternative clusterers
    # (mean-shift, DBSCAN) that are currently disabled.
    temp_vectors = {temp_nodes[i]: row for i, row in enumerate(newX)}
    clusters = kmeans(vectors, K=k)
    color_list = [COLOR_MAP[clusters[node]] for node in nodes]
    return clusters, color_list
Exemplo n.º 31
0
 def data_embedding(self, type='TSNE'):
     '''
     Fit ``self.distance_matrix`` into a two-dimensional embedded space
     using the TSNE or MDS model and return the point positions.

     type: 'TSNE' or 'MDS'; any other value raises ValueError (it used to
         fail later with an UnboundLocalError).
     '''
     if type == 'TSNE':
         model = TSNE(n_components=2, metric='precomputed')
     elif type == 'MDS':
         model = MDS(n_components=2, max_iter=3000, eps=1e-9,
                     dissimilarity="precomputed", n_jobs=1)
     else:
         raise ValueError(
             "Unknown embedding type: {!r}; expected 'TSNE' or 'MDS'".format(type))
     # position of points in embedding space
     pos = model.fit(self.distance_matrix).embedding_
     return pos
Exemplo n.º 32
0
def topic_classification_gensim_fit(filename_2, topic_number, top_idf_number, lda_model, common_dictionary):
    """Accumulate the topic weights of a document set and plot a t-SNE view.

    Args:
        filename_2: passed to ``process_doc``; also used in the printed
            header line.
        topic_number: number of topics; sizes both the accumulated
            distribution and each per-document topic vector.
        top_idf_number: passed through to ``process_doc``.
        lda_model: object indexable by a bag-of-words document, yielding
            ``(topic_id, weight)`` pairs.
        common_dictionary: object providing ``doc2bow(text)``.

    Returns:
        numpy array of length ``topic_number``: the summed topic weights
        over all documents, divided by their Euclidean norm.
    """
    topic_1 = [0.00 for n in range(topic_number)]
    common_texts = process_doc(filename_2, top_idf_number)
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

    Y = []
    for unseen_doc in common_corpus:
        # Size the per-document vector by topic_number rather than a fixed
        # 35, which overflowed for larger models and padded smaller ones.
        y = np.zeros(topic_number)
        for topic_id, weight in lda_model[unseen_doc]:
            topic_1[topic_id] = topic_1[topic_id] + weight
            y[topic_id] = weight
        Y.append(y)
    Y = np.array(Y)

    tsne = TSNE(n_components=2)
    tsne.fit(Y)
    #print(tsne.embedding_)
    plt.plot(tsne.embedding_[:,0],tsne.embedding_[:,1])
    plt.show()

    topic_1 = np.array(topic_1)/np.linalg.norm(topic_1)
    print(filename_2 + " word distribution:")
    print(topic_1)
    return topic_1
# Report the shapes of the train/test arrays used for the rest of the script.
print("prefilter_train: ", prefilter_train.shape)
print("prefilter_test: ", prefilter_test.shape)

# NOTE(review): `pca` and `plotScatter` are helpers defined outside this
# fragment; the plot title suggests a 2-D projection -- confirm.
print("Performing PCA")
X_pca = pca(prefilter_train)
plotScatter(X_pca, y_train, title="6_PCA reduction (2d) of auto-encoded data (%dd)" % prefilter_train.shape[1])

# t-SNE is run on the first 1000 rows only, with a fixed random_state and
# PCA initialisation; the labels are sliced to the same 1000 rows.
print("Performing TSNE")
model = TSNE(n_components=2, random_state=0, init="pca")
toPlot = model.fit_transform(prefilter_train[:1000])
plotTSNE(toPlot, y_train[:1000], nb_classes, "7_t-SNE embedding of auto-encoded data ")


print("Classifying and comparing")
# Classify results from Autoencoder
print("Building classical fully connected layer for classification")
# `model` is rebound here: from this point on it names the Sequential
# network, not the TSNE estimator created above.
# NOTE(review): the positional Dense(input_dim, output_dim) form and the
# nb_epoch / show_accuracy keywords look like an early Keras API -- confirm
# against the installed version.
model = Sequential()
model.add(Dense(prefilter_train.shape[1], nb_classes, activation=activation))

model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(prefilter_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch, show_accuracy=False, verbose=0, validation_data=(prefilter_test, Y_test))

score = model.evaluate(prefilter_test, Y_test, verbose=0, show_accuracy=True)
print('\nscore:', score)

# Relative change in percent against `classical_score` (defined outside this
# fragment); index 0 is printed as loss and index 1 as accuracy.
print('Loss change:', 100*(score[0] - classical_score[0])/classical_score[0], '%')
print('Accuracy change:', 100*(score[1] - classical_score[1])/classical_score[1], '%')

Exemplo n.º 34
0
    def __plot_samples__(self, dfs, fold):
        """
        Plot 2-D MDS and t-SNE embeddings of the training and testing
        samples side by side and save the figure as a PNG under ../figs/.

        The file name is built from the part of self.__PG_STATS_TBL__ after
        its first underscore, followed by 'fold<fold>.png'.

        :type dfs: List[pandas DataFrame]      # [training df, testing df]
        :type fold: int
        :rtype: None
        """

        mds  = MDS(n_components=2, max_iter=3000, eps=1e-9, dissimilarity='euclidean', n_jobs=-1)
        tsne = TSNE(n_components=2)

        # change label to color index
        #   author 1 train (0 = light blue), author 1 test (1 = dark blue)
        #   author 2 train (2 = light green), author 2 test (3 = dark green)
        df0_copy = dfs[0].copy()
        df0_copy.loc[(df0_copy.label ==  1).values, 'label'] = 0
        df0_copy.loc[(df0_copy.label == -1).values, 'label'] = 2

        df1_copy = dfs[1].copy()
        df1_copy.loc[(df1_copy.label ==  1).values, 'label'] = 1
        df1_copy.loc[(df1_copy.label == -1).values, 'label'] = 3

        # pd.concat replaces the repeated DataFrame.append calls, which are
        # no longer available in current pandas; row order (training rows
        # first, then testing rows) is unchanged.
        df_all = pd.concat([df0_copy, df1_copy])

        legend = {0: 'Author 1 Training Sample',
                  1: 'Author 1 Test Sample',
                  2: 'Author 2 Training Sample' ,
                  3: 'Author 2 Test Sample' }

        # Both models are fitted on the combined training + testing samples
        # with the label column removed; build that matrix once.
        features = df_all.drop('label', axis=1)
        pos_lst = [('Multi-Dimensional Scaling (MDS)',
                    mds.fit(features).embedding_),

                   ('t-Distributed Stochastic Neighbor Embedding (TSNE)',
                    tsne.fit(features).embedding_)]

        # plot
        colors = sns.color_palette('Paired', 4)
        fig = plt.figure(figsize=(16,7))

        # No plt.hold() calls: the function is gone from current matplotlib,
        # and successive scatter() calls on the same axes accumulate anyway.
        for k, (title, pos) in enumerate(pos_lst, 1):

            ## fig.add_subplot() works in ipython notebook but creates a
            ## mysterious 3rd axes in python...
            # ax = fig.add_subplot(1,2,k)

            ax = plt.subplot(1,2,k)
            ax.set_title(title)

            # range() works on both Python 2 and 3 (xrange is Python 2 only).
            for i in range(len(colors)):
                samples = pos[(df_all.label == i).values, :]
                ax.scatter(samples[:,0], samples[:,1],
                           c=colors[i], edgecolor='none',
                           label=legend[i])
            ax.legend()

        plt.savefig('../figs/' +
                    self.__PG_STATS_TBL__[self.__PG_STATS_TBL__.find("_")+1:] +
                    'fold' + str(fold) + '.png',
                    dpi=300, transparent=True)

        plt.close(fig)
Exemplo n.º 35
0
    img = imread(df.local_path.loc[i])
    
    if img.shape[0] < 200 or img.shape[1] < 200:
        df.drop(i)
    else:
		img_gray = color.rgb2gray(img)
		fd = hog(img_gray, orientations=9, pixels_per_cell=(8, 8),cells_per_block=(4, 4))
		vector_list.append(fd)
		print i, len(fd), df.local_path.loc[i]
    
# Stack the collected descriptors into one matrix, one row per descriptor.
X = np.vstack(vector_list)

from sklearn.manifold import TSNE as tsne

# NOTE: `tsne` is rebound from the class alias to the estimator instance.
tsne = tsne(n_components=2)
# fit_transform() fits and returns the embedding in one pass; the separate
# fit() call that used to precede it only repeated the whole optimisation.
subspace_tsne = pd.DataFrame(tsne.fit_transform(X),columns=["x","y"])

num_bins = 64

# Bin each embedding coordinate into num_bins equal-width integer cells.
subspace_tsne['grid_x'] = pd.cut(subspace_tsne['x'],num_bins,labels=False)
subspace_tsne['grid_y'] = pd.cut(subspace_tsne['y'],num_bins,labels=False)

# NOTE(review): this assignment aligns on index labels. Descriptors are only
# appended for images that pass the size check, so the rows of X may not line
# up with the first len(subspace_tsne) rows of df -- verify the pairing.
subspace_tsne['local_path'] = df.local_path[:len(subspace_tsne)]

# I should save the dataframe here, so later maybe I can use full images

thumb_side = 128

from PIL import Image
Exemplo n.º 36
0
import matplotlib.pyplot as plt # matplotlib 1.4.3 
from sklearn.manifold import TSNE # scikit-learn 0.17
import pandas # pandas 0.16.2

# Load the comma-separated table
data = pandas.read_csv("data.csv", sep=",")

# Embed the transposed table, so every column of the CSV becomes one point
model = TSNE(
    n_components=2,
    perplexity=10,
    verbose=2,
    method='barnes_hut',
    init='pca',
    n_iter=1000,
)
model.fit(data.values.T)

# Scatter the embedded points and label each one with its column name
hFig, hAx = plt.subplots()
xs = model.embedding_[:, 0]
ys = model.embedding_[:, 1]
hAx.scatter(xs, ys, 20, color="grey")
for txt, x, y in zip(data.keys(), xs, ys):
    hAx.annotate(txt, (x, y))