def get_clusters():
    # Sweep cluster counts and kernels: kernel PCA on the similarity matrix,
    # then kernel k-means on the components, saving each labelling to CSV.
    kernel = ['linear', 'cosine', 'sigmoid', 'polynomial']
    for ii in np.arange(32, 37, 1):
        for ij in kernel:
            pca = kPCA(k=ii, kernel=ij).fit(np.array(simscore))
            pca = pca.components_.T
            km = kkMeans(k=ii, kernel=ij, gamma=1).fit_predict(pca)
            cluster_labels = km.clusters
            os.makedirs(os.path.join(path, 'labels'), exist_ok=True)
            pd.DataFrame(cluster_labels).to_csv(
                os.path.join(path, f'labels/labels_{ii}_{ij}.csv'))
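
# A minimal usage sketch (an assumption, not part of the original snippet):
# `simscore` and `path` are assumed to be defined at module level, as in the
# surrounding snippets; the file name pattern is taken from get_clusters.
get_clusters()
labels_32_linear = pd.read_csv(
    os.path.join(path, 'labels/labels_32_linear.csv'), index_col=0)
print(labels_32_linear.shape)   # one row per document, one label column
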
Example No. 2
def update_figure(make_selection, g_m, knl, drop, yaxis, clust):
    #    data_places = data[(data.year_edited >= make_selection[0]) & (data.year_edited <= make_selection[1])]
    ts = pd.read_csv(os.path.join(path,
                                  f'tsne/tsne_{int(clust)}.csv')).iloc[:, 1:]
    ts = ts.sort_values(by=['year'])
    data_places = ts[(ts.year >= make_selection[0])
                     & (ts.year <= make_selection[1])]
    if g_m == 'Cluster':
        if drop:
            # One trace per selected year, filtered from the t-SNE table.
            traces = []
            for val in drop:
                sel = data_places.loc[data_places['year'] == int(val)]
                hover = list(zip(sel['pdf_names'].apply(lambda x: x.split('.')[0]),
                                 sel['year'], sel['language'],
                                 sel['authors'], sel['title']))
                traces.append(go.Scattergl(
                    x=np.array(sel['0']),
                    y=np.array(sel['1']),
                    text=hover,
                    customdata=hover,
                    mode='markers',
                    opacity=0.6,
                    marker={'size': 15,
                            'line': {'width': 0.5, 'color': 'white'}},
                    name=val,
                ))

            return {
                'data':
                traces,
                'layout':
                go.Layout(xaxis={'title': 'tsne-2'},
                          yaxis={
                              'type': 'linear' if yaxis == 'Linear' else 'log',
                              'title': 'tsne-1'
                          },
                          margin={
                              'l': 40,
                              'b': 40,
                              't': 10,
                              'r': 10
                          },
                          legend={
                              'x': 1,
                              'y': 1
                          },
                          hovermode='closest')
            }
        else:
            pca = kPCA(k=int(clust), kernel=knl).fit(np.array(simscore))
            pca = pca.components_.T
            km = kkMeans(k=int(clust), kernel=knl, gamma=1).fit_predict(pca)
            cluster_labels = km.clusters
            ts = pd.read_csv(os.path.join(
                path, f'tsne/tsne_{int(clust)}.csv')).iloc[:, 1:]
            ts = ts[(ts.year >= make_selection[0])
                    & (ts.year <= make_selection[1])]
            # Hover text computed once; points are coloured by their cluster
            # label (computed on the full similarity matrix).
            hover = list(zip(ts['pdf_names'].apply(lambda x: x.split('.')[0]),
                             ts['year'], ts['language'],
                             ts['authors'], ts['title']))
            traces = go.Scattergl(
                x=np.array(ts)[:, 0],
                y=np.array(ts)[:, 1],
                text=hover,
                customdata=hover,
                mode='markers',
                opacity=0.7,
                marker={'size': 15,
                        'color': cluster_labels,
                        'colorscale': 'Viridis',
                        'line': {'width': 0.5, 'color': 'white'}},
            )

            return {
                'data': [traces],
                'layout':
                go.Layout(height=600,
                          xaxis={'title': 'tsne-2'},
                          yaxis={
                              'type': 'linear' if yaxis == 'Linear' else 'log',
                              'title': 'tsne-1'
                          },
                          margin={
                              'l': 40,
                              'b': 40,
                              't': 10,
                              'r': 10
                          },
                          legend={
                              'x': 1,
                              'y': 1
                          },
                          hovermode='closest')
            }
    else:
        # Similarity network: one node per document, an edge wherever the
        # similarity score is non-zero.
        ss = np.array(simscore)
        m, n = ss.shape
        G = nx.Graph()
        G.add_nodes_from(range(m))
        for i in range(m):
            for j in range(n):
                if ss[i, j] != 0 and i != j:
                    G.add_edge(i, j)
        E = list(G.edges)
        pos = nx.fruchterman_reingold_layout(G)
        Xv = [pos[k][0] for k in range(m)]
        Yv = [pos[k][1] for k in range(m)]
        Xed = []
        Yed = []
        for edge in E:
            Xed += [pos[edge[0]][0], pos[edge[1]][0], None]
            Yed += [pos[edge[0]][1], pos[edge[1]][1], None]

        etrace = go.Scattergl(x=Xed,
                              y=Yed,
                              mode='lines',
                              line=dict(color='rgb(210,210,210)', width=.5),
                              hoverinfo='none')

        vtrace = go.Scattergl(
            x=Xv,
            y=Yv,
            mode='markers',
            name='net',
            marker=dict(symbol='circle-dot',
                        size=5,
                        color='#6959CD',
                        line=dict(color='rgb(50,50,50)', width=0.5)),
            #                       text = labels,
            hoverinfo='text')

        return {
            'data': [etrace, vtrace],
            'layout':
            go.Layout(height=600,
                      xaxis={'title': 'year'},
                      yaxis={
                          'type': 'linear' if yaxis == 'Linear' else 'log',
                          'title': 'Similarity score'
                      },
                      margin={
                          'l': 40,
                          'b': 40,
                          't': 10,
                          'r': 10
                      },
                      legend={
                          'x': 1,
                          'y': 1
                      },
                      hovermode='closest')
        }
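
# A wiring sketch (an assumption, not the original app code): update_figure
# looks like a Dash callback, so it would be registered roughly like this.
# The component ids are hypothetical placeholders, and a Dash `app` object
# (e.g. app = dash.Dash(__name__)) is assumed to exist.
from dash.dependencies import Input, Output

app.callback(
    Output('tsne-graph', 'figure'),
    [Input('year-slider', 'value'),       # make_selection: (start_year, end_year)
     Input('graph-mode', 'value'),        # g_m: 'Cluster' or network view
     Input('kernel-dropdown', 'value'),   # knl: kernel for kPCA / kkMeans
     Input('year-dropdown', 'value'),     # drop: list of selected years
     Input('yaxis-scale', 'value'),       # yaxis: 'Linear' or 'Log'
     Input('cluster-count', 'value')],    # clust: number of clusters
)(update_figure)
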
Example No. 3
        'variance': []
    },
    'etakernel': {
        'time': [],
        'variance': []
    },
    'laplace': {
        'time': [],
        'variance': []
    }
}

# Time kernel PCA for every (dataset, kernel) pair and record the projected
# data, the runtime, and the explained variance.
for p, q in data_name.items():
    for ii in kernels:
        start = time.time()
        kpca = kPCA(k=2, kernel=ii).fit(q)
        elapsed = time.time() - start
        kernel_outcome[ii][p] = kpca.fit_transform()
        kernel_outcome[ii]['time'].append(elapsed)
        kernel_outcome[ii]['variance'].append(kpca.explained_variance)
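
# A small inspection sketch (an assumption, not in the original snippet):
# gather the recorded runtimes into a DataFrame, one row per dataset and one
# column per kernel, for a quick side-by-side comparison. Assumes pandas is
# imported as pd, as in the other snippets.
timing_summary = pd.DataFrame({k: kernel_outcome[k]['time'] for k in kernels},
                              index=list(data_name.keys()))
print(timing_summary)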

#pca = kPCA(k = 2, kernel = kernels[0]).fit(dfclass)
#newX = pca.fit_transform()
#plt.scatter(newX[:, 0], newX[:, 1], c = yclass,  s = 3)

#%% Visualize dataset

fig, ax = plt.subplots(5,
                       1,
                       figsize=(2, 7),
                       gridspec_kw=dict(hspace=0.01, wspace=0.01))
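
# Sketch of how these axes might be populated (an assumption; the original
# snippet is cut off at this point): one scatter per kernel, using the 2-D
# embeddings stored in kernel_outcome for the first dataset in data_name,
# and assuming kPCA.fit_transform returns an (n_samples, 2) array.
first_dataset = list(data_name)[0]
for axis, kern in zip(ax, kernels):
    emb = np.asarray(kernel_outcome[kern][first_dataset])
    axis.scatter(emb[:, 0], emb[:, 1], s=3)
    axis.set_title(kern, fontsize=6)
plt.show()
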
Example No. 4
    def fit_predict(self, ds_x=None, ds_y=None, dt_x=None,
                    dt_y=None, d=None, type=None, m_kernel=None):
        '''Domain Adaptation using Subspace Alignment.

        :param ds_x: source data, NxD
        :param ds_y: source labels, Nx1
        :param dt_x: target data, NxD
        :param dt_y: target labels, Nx1
        :param d: number of principal components (default 2)
        :param type: if falsy, use kernel PCA; otherwise use linear PCA
        :param m_kernel: kernel for kernel PCA (default 'linear')
        '''
        if ds_x is None:
            raise ValueError('Source input data is required')
        self.ds_x = ds_x
        if ds_y is None:
            raise ValueError('Source input labels are required')
        self.ds_y = ds_y.ravel()

        if dt_x is None:
            raise ValueError('Target input data is required')
        self.dt_x = dt_x
        if dt_y is None:
            raise ValueError('Target input labels are required')
        self.dt_y = dt_y.ravel()
        self.d = 2 if d is None else d
        self.m_kernel = m_kernel if m_kernel else 'linear'
        #ignore warning when scaling data using MinMaxScaler
        warnings.filterwarnings('ignore', category=DataConversionWarning)
        #find PCA for Source domain after scaling
        X_w = MinMaxScaler().fit_transform(self.ds_x).astype(
            float)  #scale source data
        if not type:
            X_s = kPCA(k=self.d, kernel=self.m_kernel).fit(X_w.T)  #perform PCA
        else:
            X_s = PCA(k=self.d).fit(X_w)
        X_s = X_s.components_.T  #get components

        #PCA for target domain after scaling
        X_d = MinMaxScaler().fit_transform(self.dt_x).astype(
            float)  #scale target data
        if not type:
            X_t = kPCA(k=self.d, kernel=self.m_kernel).fit(X_d.T)  #perform PCA
        else:
            X_t = PCA(k=self.d).fit(X_d)
        self.X_t = X_t.components_.T  #get components
        #compute source and target projections using subspace alignment matrix
        self.X_a = X_s.dot(X_s.T.dot(self.X_t))
        self.S_a = self.ds_x.dot(self.X_a)  #source projection
        self.T_a = self.dt_x.dot(self.X_t)  #target projection
        print('>>>> Done with subspace alignment and data projection >>>>')
        # Classification: fit a 1-NN classifier on S_a and predict on T_a.
        print('*' * 40)
        print('Initializing 1-Nearest Neighbour classifier')
        self.classifier = KNeighborsClassifier(n_neighbors=1)
        self.classifier.fit(self.S_a, self.ds_y)
        print('>>>> Done fitting source domain >>>>')
        self.ypred = self.classifier.predict(self.T_a)
        self.accuracy = EvalC.accuary_multiclass(self.dt_y, self.ypred)
        print(f'Accuracy: {self.accuracy}')
        return self
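
# Hypothetical usage sketch: the enclosing class is not shown in this snippet,
# so `SubspaceAlignmentClassifier` below is an assumed placeholder name, and
# the synthetic arrays only make the call signature concrete. Assumes numpy is
# imported as np.
rng = np.random.default_rng(0)
Xs, ys = rng.normal(size=(100, 20)), rng.integers(0, 3, size=(100, 1))
Xt, yt = rng.normal(loc=0.5, size=(80, 20)), rng.integers(0, 3, size=(80, 1))

sa = SubspaceAlignmentClassifier()
sa.fit_predict(ds_x=Xs, ds_y=ys, dt_x=Xt, dt_y=yt, d=2, m_kernel='linear')
print(sa.accuracy)   # 1-NN accuracy on the aligned target domain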