def set_ca(X):
    """Fit a two-component correspondence analysis on ``X``.

    The estimator is built with a fixed ``random_state`` (42), so repeated
    calls on the same table use the same seed.

    Args:
        X: Table handed unchanged to ``prince.CA.fit``.

    Returns:
        Whatever ``prince.CA.fit`` returns for ``X``.
    """
    settings = dict(
        n_components=2,
        n_iter=3,
        copy=True,
        check_input=True,
        engine='auto',
        random_state=42,
    )
    return prince.CA(**settings).fit(X)
def CorrespondanceAnalysis(df, theta=1, dimensions=3, coordinates='top',
                           all_nodes=False):
    """Run a correspondence analysis on a bipartite edge list.

    The edge list is pivoted into an entity x coordinate 0/1 matrix, the
    model is fitted on the rows of that matrix that are not duplicates of an
    earlier row, and row and column coordinates are stacked into one frame.

    Args:
        df: Edge list, one row per edge, holding the ``'top'`` and
            ``'bottom'`` columns (or ``coordinates`` and ``'top'`` when a
            custom ``coordinates`` name is given). The caller's frame is not
            modified.
        theta: Minimum degree (number of edges) an entity node needs in order
            to be kept.
        dimensions: Number of components requested from ``prince.CA``.
        coordinates: Column whose values become the matrix columns. When it
            is ``'top'`` the entities are taken from ``'bottom'``; for any
            other value the entities are taken from ``'top'``.
        all_nodes: When true, row coordinates are computed for every row of
            the matrix; otherwise only for the de-duplicated rows used for
            fitting.

    Returns:
        tuple: ``(output, info)`` where ``output`` is the row coordinates
        followed by the column coordinates and ``info`` is a dict with the
        key ``'explained_inertia'``.

    Raises:
        ValueError: Propagated from ``DataFrame.pivot`` when the filtered
            edge list contains the same (entity, coordinate) pair twice.
    """
    # Distinguish the coordinate side of the graph from the entity side.
    entities = 'bottom' if coordinates == 'top' else 'top'

    # Degree of every entity node.
    degrees = df.groupby(entities).count()[coordinates].sort_values(
        ascending=False)

    # Keep only the edges whose entity reaches the degree threshold.
    # ``assign`` returns a new frame, so the unit weight is added without
    # writing into a slice of the caller's data (no SettingWithCopyWarning).
    kept_entities = degrees[degrees >= theta].index.values
    df = df[df[entities].isin(kept_entities)].assign(w=1)

    # Dense pivot of the adjacency matrix.
    # NOTE: this will fail for large inputs (long-tailed degree
    # distributions with theta=1); a chunked pivot was prototyped earlier.
    M = df[[entities, coordinates, 'w']].pivot(
        index=entities, columns=coordinates, values='w')
    M = M.fillna(0)

    # Core sub-graph used for fitting: the first occurrence of every
    # distinct row pattern.
    M_mask = ~M.duplicated()
    core = M[M_mask]

    # The seed is drawn from NumPy's global generator, so results vary
    # between calls unless the caller seeds ``np.random`` beforehand.
    ca = prince.CA(n_components=dimensions, n_iter=4, copy=True,
                   check_input=True, engine='auto',
                   random_state=np.random.randint(1, 100))
    ca = ca.fit(core)

    row_coords = ca.row_coordinates(M if all_nodes else core)
    col_coords = ca.column_coordinates(core)

    output = pd.concat([row_coords, col_coords], axis=0)
    info = {
        'explained_inertia': ca.explained_inertia_,
    }
    return output, info
def fitModelAndDraw(config, source, data, __title__, brand, feature1, feature2):
    """Fit a 2-component correspondence analysis, export its data and plot.

    Side effects: writes a CSV with the coordinates and frequencies of every
    category and saves the correspondence plot as a PNG, both below
    ``config['PATHS']['BASEDIR']``.

    Args:
        config: Mapping providing ``config['PATHS']['BASEDIR']``.
        source: Prefix used in both output file names.
        data: Frame holding the ``feature1`` and ``feature2`` columns.
        __title__: Accepted for interface compatibility; not used here.
        brand: Name embedded in both output file names.
        feature1: Column whose categories are labelled ``"Themes"``.
        feature2: Column whose categories are labelled ``"Emotions"``.

    Returns:
        None
    """
    # Local import: only ``pd.concat`` is needed, and the surrounding module
    # is not known to import pandas under this name.
    import pandas as pd

    ca = pr.CA(n_components=2, n_iter=10, copy=True, check_input=True,
               engine='auto', random_state=42)
    ca_data = getStructureDF(data=data, feature1=feature1, feature2=feature2)
    caObj = ca.fit(ca_data)

    __ax__ = caObj.plot_coordinates(X=ca_data, ax=None, figsize=(20, 12),
                                    x_component=0, y_component=1,
                                    show_row_labels=True,
                                    show_col_labels=True)
    __ax__.axis("off")
    __ax__.axhline(False)
    __ax__.axvline(False)

    def bubblePlotData(data_ca):
        """Combine category coordinates with their raw frequencies."""
        cols_cnt = ["groups", "count"]
        cols = ["groups", "X", "Y", "group_flag"]
        cols_order = ["group_flag", "groups", "X", "Y"]

        theme_coord = caObj.row_coordinates(data_ca).reset_index()
        theme_coord["group_flag"] = "Themes"
        theme_coord.columns = cols

        emotion_coord = caObj.column_coordinates(data_ca).reset_index()
        emotion_coord["group_flag"] = "Emotions"
        emotion_coord.columns = cols

        # DataFrame.append was removed in pandas 2.0; pd.concat stacks the
        # frames the same way (original indexes are kept).
        coord_data = pd.concat([theme_coord, emotion_coord])
        coord_data = coord_data[cols_order]

        freq_dist_theme = data[feature1].value_counts().reset_index()
        freq_dist_theme.columns = cols_cnt
        freq_dist_emotion = data[feature2].value_counts().reset_index()
        freq_dist_emotion.columns = cols_cnt
        freq_dist = pd.concat([freq_dist_theme, freq_dist_emotion])

        return coord_data.merge(freq_dist, on="groups", how="left")

    bubble_plot_data = bubblePlotData(ca_data)
    # NOTE(review): the output paths are assembled with literal backslashes,
    # which only form a directory path on Windows - confirm target platform.
    bubble_plot_data.to_csv(
        config['PATHS']['BASEDIR'] + "\\outputs\\" + source
        + "_bubble_plot_data_" + brand + ".csv",
        index=False)

    logger.info("Plotting Corresponding Chart...!!!")
    plt.title(feature1 + ' v/s ' + feature2)
    plt.savefig(config['PATHS']['BASEDIR'] + "\\outputs\\" + source + "_CA_"
                + brand + "_" + feature1 + "_" + feature2 + ".png")
def conductCA(y):
    """Fit a four-component correspondence analysis and return its scores.

    Args:
        y: Table passed to ``prince.CA.fit`` and to both coordinate methods.

    Returns:
        tuple: ``(ca, phi, rowcoord, theta)`` - the fitted estimator, the
        column coordinates, the row coordinates, and the column coordinates
        standardised per component (mean subtracted, divided by ``std()``).
    """
    model = prince.CA(
        n_components=4,
        n_iter=5,
        copy=True,
        check_input=True,
        engine='auto',
        random_state=42,
    ).fit(y)

    column_scores = model.column_coordinates(y)
    row_scores = model.row_coordinates(y)
    standardised = (column_scores - column_scores.mean()) / column_scores.std()
    return (model, column_scores, row_scores, standardised)
def correspondence_analysis(self, target=None):
    """Plot a correspondence analysis of every categorical column vs. target.

    One subplot is drawn per column of ``self.cat_cols`` other than
    ``target``; each shows a 2-component ``prince.CA`` fitted on the
    cross-tabulation of that column against ``target``. Problems with the
    input are reported with ``print`` and end the call without plotting.

    Args:
        target: Name of the categorical column used as the cross-tabulation
            columns. Defaults to ``self.target_variable``.

    Returns:
        None
    """
    if target is None:
        target = self.target_variable
    if target is None:
        print(
            "Please specify a categorical column as x-axis using 'target' argument"
        )
        return

    if self.df[target].dtype not in CATEGORICAL_TYPES:
        print("Target must be a categorical column.")
        return

    # Exclude the target variable. The number of plots is taken from this
    # list (rather than len(cat_cols) - 1) so the axes grid always matches
    # the columns actually plotted, even if target is not in cat_cols.
    non_target_list = [i for i in self.cat_cols if i != target]
    num_plots = len(non_target_list)
    if num_plots == 0:
        print(
            "Correspondence Analysis requires at least 2 categorical variables"
        )
        return

    total_cols = 2
    total_rows = int(np.ceil(num_plots / total_cols))
    fig, axs = plt.subplots(nrows=total_rows,
                            ncols=total_cols,
                            figsize=(7 * total_cols, 7 * total_rows),
                            constrained_layout=True,
                            squeeze=False)
    fig.suptitle('Correspondence Analysis for column: ' + target,
                 fontsize=20)

    for i, col in enumerate(non_target_list):
        row, pos = divmod(i, total_cols)
        # crosstab does not modify its inputs, so no copy of self.df is made.
        X = pd.crosstab(self.df[col], self.df[target])
        ca = prince.CA(n_components=2,
                       n_iter=3,
                       copy=True,
                       check_input=True,
                       engine='auto',
                       random_state=42)
        ca = ca.fit(X)
        ca.plot_coordinates(X=X,
                            ax=axs[row][pos],
                            x_component=0,
                            y_component=1,
                            show_row_labels=True,
                            show_col_labels=True)
def calculate_correspondence(data, sup_rows=None, sup_cols=None):
    """
    Calculate correspondence analysis

    The model is fitted on the table without the supplementary rows and
    columns; supplementary points are then projected from their profiles and
    reported with a quality of 0.

    Args:
        data (list[list[float]]): Input factor data
        sup_rows (list[int]): Supplementary rows indexes
        sup_cols (list[int]): Supplementary columns indexes

    Returns:
        dict: Calculation results with the keys ``rows`` and ``cols``
        (coordinates on the first two axes), ``rows_quality`` and
        ``cols_quality`` (share of each point's squared coordinates carried
        by the first two axes), ``explained`` (explained inertia of the first
        two axes, in percent) and ``eigenvalues`` (first two eigenvalues).
    """
    if sup_rows is None:
        sup_rows = []
    if sup_cols is None:
        sup_cols = []

    # Calculate correspondence on the active (non-supplementary) part.
    df = pd.DataFrame(data)
    is_sup_row = df.index.isin(sup_rows)
    is_sup_col = df.columns.isin(sup_cols)
    df2 = df.loc[~is_sup_row, ~is_sup_col]
    ca = prince.CA(n_components=min(df2.shape) - 1).fit(df2)

    # Calculate supplementary factors from the row / column profiles.
    sup_row_res, sup_col_res = None, None
    if sup_rows:
        sup_df = df.loc[is_sup_row, ~is_sup_col]
        sup_row_res = sup_df.divide(sup_df.sum(axis=1), axis=0) @ ca.V_.T
    if sup_cols:
        sup_df = df.loc[~is_sup_row, is_sup_col]
        sup_col_res = sup_df.divide(sup_df.sum(axis=0), axis=1).T @ ca.U_

    # Calculate quality. ``.loc[:, :1]`` is a label slice, i.e. components
    # 0 and 1. pd.concat skips a ``None`` entry when nothing is supplementary.
    rows = pd.concat([ca.row_coordinates(df2), sup_row_res]).sort_index()
    rows2 = rows**2
    rows_quality = rows2.divide(rows2.sum(axis=1),
                                axis=0).loc[:, :1].sum(axis=1)
    rows_quality.loc[rows_quality.index.isin(sup_rows)] = 0

    cols = pd.concat([ca.column_coordinates(df2), sup_col_res]).sort_index()
    cols2 = cols**2
    cols_quality = cols2.divide(cols2.sum(axis=1),
                                axis=0).loc[:, :1].sum(axis=1)
    # Supplementary *columns* are the ones to reset here (the row indexes
    # were used before, which zeroed the wrong entries).
    cols_quality.loc[cols_quality.index.isin(sup_cols)] = 0

    # Prepare result data
    return {
        'rows': rows.loc[:, :1].values.tolist(),
        'cols': cols.loc[:, :1].values.tolist(),
        'rows_quality': rows_quality.values.tolist(),
        'cols_quality': cols_quality.values.tolist(),
        'explained': [x * 100 for x in ca.explained_inertia_[:2]],
        'eigenvalues': ca.eigenvalues_[:2],
    }
def ca(self, df, col1, col2):
    """Run and plot a correspondence analysis of two categorical columns.

    The two columns are cross-tabulated, a 2-component ``prince.CA`` is
    fitted on the resulting table, the coordinates are drawn on a 10x10
    figure that is shown immediately, and the explained inertia is printed.

    Args:
        df: Frame holding both columns.
        col1: Column name used for the cross-tabulation rows.
        col2: Column name used for the cross-tabulation columns.

    Returns:
        The fitted estimator.
    """
    contingency = pd.crosstab(df[col1], df[col2])
    model = prince.CA(
        n_components=2,
        n_iter=3,
        copy=True,
        check_input=True,
        engine='auto',
    ).fit(contingency)

    _, axis = plt.subplots(figsize=(10, 10))
    model.plot_coordinates(
        X=contingency,
        ax=axis,
        x_component=0,
        y_component=1,
        show_row_labels=True,
        show_col_labels=True,
    )
    plt.show()

    print("Explained inertia:", model.explained_inertia_)
    return model
def display(self):
    """Fit a correspondence analysis on ``self.df`` and show the results.

    The data (and later the two label sources ``self.df1`` / ``self.df2``)
    are loaded on demand. Row and column coordinates are written to
    ``self.text``; the scatter plot is drawn only when both label sources
    are available after loading.
    """
    if self.df is None:
        self.load()
    if self.df is None:
        # Nothing could be loaded - nothing to display.
        return

    table = self.df
    model = prince.CA(n_components=2, n_iter=3, copy=True, check_input=True,
                      engine='auto', random_state=42).fit(table)

    col_coords = model.column_coordinates(table)
    row_coords = model.row_coordinates(table)
    row_x = row_coords.loc[:, 0]
    row_y = row_coords.loc[:, 1]
    col_x = col_coords.loc[:, 0]
    col_y = col_coords.loc[:, 1]

    # NOTE(review): the "VARIABEL" heading is followed by the row
    # coordinates and the "OBJEK" heading by the column coordinates, while
    # the plot below annotates rows with NamaObjek - confirm the headings.
    self.text.tag_configure('big', font=('Verdana', 10, 'bold'),
                            foreground='blue')
    self.text.insert('end', 'KOORDINAT VARIABEL\n', 'big', row_coords)
    self.text.insert('end', '\nKOORDINAT OBJEK\n', 'big', col_coords)

    if self.df1 is None:
        self.load1()
    if self.df2 is None:
        self.load2()
    if self.df1 is None or self.df2 is None:
        # Labels are missing, so the annotated plot is skipped.
        return

    object_names = self.df1
    variable_names = self.df2

    fig1, ax1 = plt.subplots()
    ax1.scatter(row_x, row_y)
    ax1.scatter(col_x, col_y)
    ax1.set_title('Plot Analisis Korespondensi')

    for label, px, py in zip(object_names, row_x, row_y):
        ax1.annotate(label, (px, py), color='green')
    for label, px, py in zip(variable_names, col_x, col_y):
        ax1.annotate(label, (px, py), color='blue')

    # Axes through the origin.
    plt.axhline(y=0, xmin=0, xmax=1, linewidth=2, color='k')
    plt.axvline(x=0, ymin=0, ymax=1, linewidth=2, color='k')
    plt.show()
# One bar chart per axis: the loadings of component i, coloured by sign.
for i, ax in enumerate(axes):
    pc_loadings = loadings.loc[i, :]
    colors = ['C0' if weight > 0 else 'C1' for weight in pc_loadings]
    ax.axhline(color='#888888')
    pc_loadings.plot.bar(ax=ax, color=colors)
    ax.set_ylabel(f'PC{i+1}')
    ax.set_ylim(-maxPC, maxPC)

plt.tight_layout()
plt.show()

### Correspondence analysis
housetasks = pd.read_csv(HOUSE_TASKS_CSV, index_col=0)

ca = prince.CA(n_components=2).fit(housetasks)

ca.plot_coordinates(housetasks, figsize=(6, 6))
plt.tight_layout()
plt.show()

## K-Means Clustering
### A Simple Example
# Two price series from 2011 onwards, grouped into four clusters.
df = sp500_px.loc[sp500_px.index >= '2011-01-01', ['XOM', 'CVX']]
kmeans = KMeans(n_clusters=4).fit(df)
df['cluster'] = kmeans.labels_
print(df.head())

centers = pd.DataFrame(kmeans.cluster_centers_, columns=['XOM', 'CVX'])
def test_fit_numpy_array(self):
    """``fit`` accepts a NumPy array and returns a ``prince.CA`` instance."""
    ca = prince.CA(n_components=2)
    # assertIsInstance reports the actual type on failure instead of the
    # uninformative "False is not true".
    self.assertIsInstance(ca.fit(self.X.values), prince.CA)
def test_fit_pandas_dataframe(self):
    """``fit`` accepts ``self.X`` as-is and returns a ``prince.CA`` instance."""
    ca = prince.CA(n_components=2)
    # assertIsInstance reports the actual type on failure instead of the
    # uninformative "False is not true".
    self.assertIsInstance(ca.fit(self.X), prince.CA)
import matplotlib.pyplot as plt
import pandas as pd
import prince

# Columns of the survey table that take part in the analysis.
WORK_COLUMNS = ['Stay at home', 'Part-time work', 'Full-time work']

# The first CSV column is used as the row index.
df = pd.read_csv('data/woman_work.csv', index_col=0)[WORK_COLUMNS]

# This constructor form receives the data directly.
# NOTE(review): n_components=-1 presumably asks for every component in the
# prince version this script targets - confirm.
ca = prince.CA(df, n_components=-1)

fig1, ax1 = ca.plot_cumulative_inertia()
fig2, ax2 = ca.plot_rows_columns(
    show_row_labels=True,
    show_column_labels=True,
)

plt.show()
# Second command-line argument; only the disabled variant of this script
# (which built the frame from keyList[indexMainKey:]) made use of it.
indexMainKey = int(sys.argv[2])

# Work on a copy: the first key column is removed from the table and its
# values are used to relabel the rows instead.
X = data.copy()
X.pop(keyList[0])
X = X.rename(index=pd.Series(data[keyList[0]]))
print(X)

ca = prince.CA(
    n_components=3,
    n_iter=25,
    copy=True,
    check_input=True,
    engine='auto',
    random_state=None,
)

# Axis names for the table.
X.columns.name = 'Caracteristicas'
X.index.name = 'Ciudades'

ca = ca.fit(X)

# Report coordinates first, then the inertia summary.
print(ca.row_coordinates(X))
print(ca.column_coordinates(X))
print(ca.eigenvalues_)
print(ca.total_inertia_)
print(ca.explained_inertia_)
def CA(file):
    """correspondence analysis.

    Reads the table, replaces exact zeros by 1e-7, fits a 4-component
    ``prince.CA`` and writes all results next to the input file.

    Args:
        file (directory): csv file contains genes' RSCU values; its first
            column is used as the row index.

    Returns:
        pandas.DataFrame: the column-coordinate ("genes") CSV read back from
        disk, with its first (identifier) column named ``'gene id'`` followed
        by ``'axis 1'`` ... ``'axis 4'``.

    Side effects:
        - csv file contains genes' values for the first 4 axes of the
          correspondence analysis result (``<name>genes.csv``)
        - csv file contains codons' values for the first 4 axes of the
          correspondence analysis result (``<name>codons.csv``)
        - text file ``<name>.txt`` with the explained and total inertia
          (opened in append mode, so repeated runs accumulate)
        - plot the genes first 2 axes values of the correspondence analysis
          result
        - plot the codons first 2 axes values of the correspondence analysis
          result (points are labelled when there are fewer than 200)
    """
    import pandas as pd
    import prince
    import matplotlib.pyplot as plt

    axis_names = ['axis 1', 'axis 2', 'axis 3', 'axis 4']

    def _name_axes(coords):
        """Rename the first four coordinate columns to 'axis 1'..'axis 4'."""
        coords.rename(columns=dict(zip(coords.columns[:4], axis_names)),
                      inplace=True)

    def _use_dark_palette_style():
        """Activate the seaborn dark-palette style under either of its names."""
        try:
            plt.style.use('seaborn-dark-palette')
        except OSError:
            # Matplotlib 3.6 renamed the bundled seaborn styles and later
            # releases dropped the old alias.
            plt.style.use('seaborn-v0_8-dark-palette')

    def _scatter_first_two_axes(coords):
        """Open a new figure with the axis 1 / axis 2 scatter of ``coords``."""
        _use_dark_palette_style()
        plt.figure()
        plt.xlabel("Axis 1")
        plt.ylabel("Axis 2")
        plt.title("CA-plot")
        plt.scatter(coords['axis 1'], coords['axis 2'], s=10, marker='o')
        plt.axhline(0, color='black', linestyle='-')
        plt.axvline(0, color='black', linestyle='-')

    file = str(file)
    df = pd.read_csv(file)
    df.set_index(df.iloc[:, 0], inplace=True)  # first column becomes the index
    df.drop(df.columns[0], axis=1, inplace=True)
    df.replace(0, 0.0000001, inplace=True)

    ca = prince.CA(
        n_components=4,
        n_iter=3,
        copy=True,
        check_input=True,
        engine='auto',
        random_state=42
    )
    df.columns.rename('Gene Name', inplace=True)
    df.index.rename('Codons', inplace=True)
    ca = ca.fit(df)

    codons = ca.row_coordinates(df)     # for Codons
    genes = ca.column_coordinates(df)   # for genes
    inertia = ca.explained_inertia_     # inertia for each axis

    # save information
    stem = file.replace(".csv", '')

    file_genes = stem + "genes" + ".csv"
    _name_axes(genes)
    genes.to_csv(file_genes, sep=',', index=True, header=True)

    file_codons = stem + "codons" + ".csv"
    _name_axes(codons)
    codons.to_csv(file_codons, sep=',', index=True, header=True)

    # One open() for the whole report instead of re-opening per line.
    file_inertia = file.replace('.csv', '.txt')
    with open(file_inertia, 'a') as f:
        f.write("explained inertia" + "\n")
        for i_count, value in enumerate(inertia, start=1):
            f.write("axis " + str(i_count) + " = " + str(value) + "\n")
        f.write("Total Inertia = " + str(ca.total_inertia_))

    # plot for genes
    # NOTE(review): this name keeps the ".csv" part of the input path while
    # the codons plot below strips it - left unchanged for compatibility.
    _scatter_first_two_axes(genes)
    save_file_name__ca_plot = file + "_CA_gens_plot.png"
    plt.savefig(save_file_name__ca_plot)

    # plot for codons
    _scatter_first_two_axes(codons)
    if len(codons) < 200:
        for x, y, t in zip(codons['axis 1'], codons['axis 2'],
                           codons.index.values):
            # Nudge the label slightly away from its point.
            plt.text(x * (1 + 0.01), y * (1 + 0.01), t)
    save_file_name__ca_codons_plot = stem + "_CA_codos_plot.png"
    plt.savefig(save_file_name__ca_codons_plot)

    # Only the identifier column of the re-read CSV needs a new name. The
    # previous mapping was keyed on the already renamed coordinate columns,
    # which labelled axis 1 as 'gene id' and shifted the other axes.
    read_genes_file = pd.read_csv(file_genes)
    read_genes_file.rename(
        columns={read_genes_file.columns[0]: 'gene id'}, inplace=True)
    return read_genes_file
def test_negative_input(self):
    """Fitting a table that contains a negative entry raises ``ValueError``."""
    # Corrupt a single cell of the fixture.
    self.X.iloc[0, 0] = -1
    estimator = prince.CA()
    self.assertRaises(ValueError, estimator.fit, self.X)
def test_transform_numpy_array(self):
    """``transform`` on a NumPy array returns a pandas ``DataFrame``."""
    ca = prince.CA(n_components=2)
    transformed = ca.fit(self.X.values).transform(self.X.values)
    # assertIsInstance reports the actual type on failure instead of the
    # uninformative "False is not true".
    self.assertIsInstance(transformed, pd.DataFrame)
import os
import sys

# Make the parent of the script directory importable before loading prince.
sys.path.insert(1, os.path.join(sys.path[0], '..'))

import pandas as pd
import prince
from sklearn import datasets


def show(title, *values):
    """Print a section title followed by each value on its own line."""
    print(title)
    for value in values:
        print(value)


X = pd.read_csv('children.csv', index_col=0)

ca = prince.CA().fit(X)

# Sections are separated by a '---' line; large results are cut to 5 rows.
show('Eigenvalues', ca.eigenvalues_, ca.explained_inertia_)
print('---')
show('U', ca.U_[:5])
print('---')
show('V', ca.V_)
print('---')
show('s', ca.s_)
print('---')
show('Row coords', ca.row_coordinates(X)[:5])
def test_transform_pandas_dataframe(self):
    """``transform`` on ``self.X`` returns a pandas ``DataFrame``."""
    ca = prince.CA(n_components=2)
    transformed = ca.fit(self.X).transform(self.X)
    # assertIsInstance reports the actual type on failure instead of the
    # uninformative "False is not true".
    self.assertIsInstance(transformed, pd.DataFrame)
'Q36', 'Q37', 'Q39c' ] qqlist1 = [] for q in qqlist: qqlist1 += code[q]['qlist'] qqlist1 = qqlist1[:-1] X = data[qqlist1] X = X[X.T.notnull().all()] #X_index=X.index X = pd.DataFrame(X.as_matrix(), columns=qqlist1) X = X.fillna(0) # 分类 est = KMeans(3) # 4 clusters est.fit(X) labels = pd.Series(est.labels_) # 样本可视化 X_PCA = PCA(2).fit_transform(X) kwargs = dict(cmap=plt.cm.get_cmap('rainbow', 10), edgecolor='none', alpha=0.6) plt.scatter(X_PCA[:, 0], X_PCA[:, 1], c=labels, **kwargs) print(labels.value_counts()) X1 = X.copy() X1['labels'] = labels t = X1.groupby(['labels']).mean() #t=t.rename(columns=code['Q11']['code']) ca = prince.CA(t) ca.plot_rows_columns(show_row_labels=True, show_column_labels=True)
import matplotlib.gridspec as gridspec
import numpy as np
import prince
from sklearn.cluster import KMeans
from adjustText import adjust_text

pic_type = 'png'

# The first CSV column is the row index; name both axes of the table.
data = pd.read_csv('./birdseed.csv', sep=',', header=0, index_col=0)
data.columns.rename('Bird Type', inplace=True)
data.index.rename('Seed Type', inplace=True)

#Correspondence Analysis using the prince library
ca = prince.CA(n_components=len(data), n_iter=3, copy=True, engine='auto')
ca = ca.fit(data)

#Plot the inertia to justify the use of this method
plt.figure()
# The legend labels now match the plotted series: the first curve is the
# running total, the second the per-component values (they were swapped).
plt.plot(np.cumsum(ca.explained_inertia_), label='Cum. Sum. of Inertia')
plt.plot(ca.explained_inertia_, label='Ind. Inertia')
plt.legend(loc=2, fancybox=True, framealpha=1)
plt.title('Correspondence Analysis Inertia')
plt.xlabel('Principal Components')
plt.ylabel('Inertia')
plt.savefig('CA Inertia.png', format='png', bbox_inches='tight')
# ``block`` is keyword-only in current Matplotlib; passing it positionally
# raises a TypeError there.
plt.show(block=False)

#Let's try to apply k-means algorithm to the CA data to auto-identify clusters
n_clusters = 3