예제 #1
0
def set_ca(X):
    """Fit a two-component correspondence analysis on ``X``.

    Args:
        X: Table handed unchanged to ``prince.CA.fit``.

    Returns:
        Whatever ``prince.CA.fit`` returns for ``X`` (the fitted estimator).
    """
    settings = dict(
        n_components=2,
        n_iter=3,
        copy=True,
        check_input=True,
        engine='auto',
        random_state=42,  # fixed seed so repeated runs use the same state
    )
    return prince.CA(**settings).fit(X)
예제 #2
0
def CorrespondanceAnalysis(df,
                           theta=1,
                           dimensions=3,
                           coordinates='top',
                           all_nodes=False):
    """Run a correspondence analysis on a bipartite edge list.

    Args:
        df: Edge list with a ``'top'`` and a ``'bottom'`` column.
        theta: Minimum degree an entity node needs to be kept.
        dimensions: Number of CA components to compute.
        coordinates: Which column (``'top'`` or ``'bottom'``) becomes the
            columns of the adjacency matrix; the other one holds the
            entity nodes and becomes its index.
        all_nodes: If true, row coordinates are computed for every entity
            row; otherwise only for the de-duplicated rows used for the fit.

    Returns:
        tuple: ``(output, info)`` where ``output`` stacks the row
        coordinates on top of the column coordinates and ``info`` is a dict
        with the key ``'explained_inertia'``.
    """
    # Distinguishing coordinate edges from entity nodes
    entities = 'bottom' if coordinates == 'top' else 'top'

    # Degree of entity nodes
    degrees = df.groupby(entities)[coordinates].count()

    # Keep only the edges whose entity node has degree >= theta.  The edge
    # weight is attached with ``assign`` so a new frame is built instead of
    # writing a column into a filtered slice of the caller's frame.
    kept_entities = degrees[degrees >= theta].index.values
    edges = df.loc[df[entities].isin(kept_entities),
                   [entities, coordinates]].assign(w=1)

    # Pivot the edge list into an adjacency matrix.
    # Will fail for large sizes (long tailed degree distributions, theta=1)
    M = edges.pivot(index=entities, columns=coordinates,
                    values='w').fillna(0)

    # Selecting the core sub-graph that will be used in the CA: entity rows
    # with an identical adjacency pattern are only fitted once.
    M_mask = ~M.duplicated()
    core = M[M_mask]

    # Correspondence analysis.  The seed is drawn from NumPy's global RNG,
    # so results only repeat if the caller seeds ``np.random`` beforehand.
    ca = prince.CA(n_components=dimensions,
                   n_iter=4,
                   copy=True,
                   check_input=True,
                   engine='auto',
                   random_state=np.random.randint(1, 100))
    ca = ca.fit(core)

    row_coords = ca.row_coordinates(M if all_nodes else core)
    col_coords = ca.column_coordinates(core)

    output = pd.concat([row_coords, col_coords], axis=0)

    info = {
        'explained_inertia': ca.explained_inertia_,
    }

    return output, info
예제 #3
0
def fitModelAndDraw(config, source, data, __title__, brand, feature1,
                    feature2):
    """Fit a 2-component CA, export bubble-plot data and save the CA chart.

    Args:
        config: Mapping providing ``config['PATHS']['BASEDIR']``.
        source: Label used as a prefix of both output file names.
        data: Frame holding the ``feature1`` and ``feature2`` columns.
        __title__: Unused; kept for interface compatibility.
        brand: Label embedded in both output file names.
        feature1: Column whose values are the CA rows.
        feature2: Column whose values are the CA columns.

    Side effects:
        Writes ``<source>_bubble_plot_data_<brand>.csv`` and
        ``<source>_CA_<brand>_<feature1>_<feature2>.png`` under
        ``BASEDIR\\outputs\\``.
    """
    # Local import: ``DataFrame.append`` was removed in pandas 2.0, so the
    # frames are stacked with ``pd.concat`` instead.
    import pandas as pd

    ca = pr.CA(n_components=2,
               n_iter=10,
               copy=True,
               check_input=True,
               engine='auto',
               random_state=42)
    ca_data = getStructureDF(data=data, feature1=feature1, feature2=feature2)

    caObj = ca.fit(ca_data)
    __ax__ = caObj.plot_coordinates(X=ca_data,
                                    ax=None,
                                    figsize=(20, 12),
                                    x_component=0,
                                    y_component=1,
                                    show_row_labels=True,
                                    show_col_labels=True)
    __ax__.axis("off")
    # Reference lines through the origin (the original passed ``False``,
    # which is numerically 0).
    __ax__.axhline(0)
    __ax__.axvline(0)

    def bubblePlotData(data_ca):
        """Combine CA coordinates with the value counts of both features."""
        cols_cnt = ["groups", "count"]
        cols = ["groups", "X", "Y", "group_flag"]
        cols_order = ["group_flag", "groups", "X", "Y"]

        theme_coord = caObj.row_coordinates(data_ca).reset_index()
        theme_coord["group_flag"] = "Themes"
        theme_coord.columns = cols
        emotion_coord = caObj.column_coordinates(data_ca).reset_index()
        emotion_coord["group_flag"] = "Emotions"
        emotion_coord.columns = cols

        coord_data = pd.concat([theme_coord, emotion_coord])[cols_order]

        freq_dist_theme = data[feature1].value_counts().reset_index()
        freq_dist_theme.columns = cols_cnt
        freq_dist_emotion = data[feature2].value_counts().reset_index()
        freq_dist_emotion.columns = cols_cnt
        freq_dist = pd.concat([freq_dist_theme, freq_dist_emotion])

        # Left join keeps every coordinate row even without a count.
        return coord_data.merge(freq_dist, on="groups", how="left")

    bubble_plot_data = bubblePlotData(ca_data)
    bubble_plot_data.to_csv(config['PATHS']['BASEDIR'] + "\\outputs\\" +
                            source + "_bubble_plot_data_" + brand + ".csv",
                            index=False)

    logger.info("Plotting Corresponding Chart...!!!")
    plt.title(feature1 + ' v/s ' + feature2)
    plt.savefig(config['PATHS']['BASEDIR'] + "\\outputs\\" + source + "_CA_" +
                brand + "_" + feature1 + "_" + feature2 + ".png")
예제 #4
0
def conductCA(y):
    """Fit a four-component CA on ``y`` and return its coordinates.

    Returns:
        tuple: ``(ca, phi, rowcoord, theta)`` where ``phi`` holds the column
        coordinates, ``rowcoord`` the row coordinates and ``theta`` is
        ``phi`` standardised per component (mean removed, divided by the
        standard deviation).
    """
    model = prince.CA(
        n_components=4,
        n_iter=5,
        copy=True,
        check_input=True,
        engine='auto',
        random_state=42,
    ).fit(y)

    column_coords = model.column_coordinates(y)
    centred = column_coords - column_coords.mean()
    standardised = centred / column_coords.std()
    row_coords = model.row_coordinates(y)
    return (model, column_coords, row_coords, standardised)
예제 #5
0
    def correspondence_analysis(self, target=None):
        """Plot one correspondence analysis per categorical column vs ``target``.

        Args:
            target: Name of the categorical column to cross-tabulate against.
                Falls back to ``self.target_variable`` when omitted.

        Returns:
            None. A message is printed and nothing is drawn when no target
            is available, when the target's dtype is not in
            ``CATEGORICAL_TYPES``, or when no other categorical column exists.
        """
        if target is None:
            target = self.target_variable
        if target is None:
            print(
                "Please specify a categorical column as x-axis using 'target' argument"
            )
            return
        if self.df[target].dtype not in CATEGORICAL_TYPES:
            print("Target must be a categorical column.")
            return

        # exclude target variable; the grid is sized from the columns that
        # are actually plotted so the axes array can never be overrun.
        non_target_list = [c for c in self.cat_cols if c != target]
        num_plots = len(non_target_list)
        if num_plots == 0:
            print(
                "Correspondence Analysis requires at least 2 categorical variables"
            )
            return

        total_cols = 2
        total_rows = int(np.ceil(num_plots / total_cols))
        fig, axs = plt.subplots(nrows=total_rows,
                                ncols=total_cols,
                                figsize=(7 * total_cols, 7 * total_rows),
                                constrained_layout=True,
                                squeeze=False)
        fig.suptitle('Correspondence Analysis for column: ' + target,
                     fontsize=20)

        for i, col in enumerate(non_target_list):
            row, pos = divmod(i, total_cols)
            # crosstab builds a new table, so the frame needs no copy.
            X = pd.crosstab(self.df[col], self.df[target])

            ca = prince.CA(n_components=2,
                           n_iter=3,
                           copy=True,
                           check_input=True,
                           engine='auto',
                           random_state=42)
            ca = ca.fit(X)

            ca.plot_coordinates(X=X,
                                ax=axs[row][pos],
                                x_component=0,
                                y_component=1,
                                show_row_labels=True,
                                show_col_labels=True)
예제 #6
0
def calculate_correspondence(data, sup_rows=None, sup_cols=None):
    """
    Calculate correspondence analysis

    The analysis is fitted on the table without the supplementary rows and
    columns; supplementary points are projected afterwards and always get a
    quality of 0.

    Args:
        data (list[list[float]]): Input factor data
        sup_rows (list[int]): Supplementary rows indexes
        sup_cols (list[int]): Supplementary columns indexes

    Returns:
        dict: Calculation results with the keys ``rows``, ``cols``,
        ``rows_quality``, ``cols_quality``, ``explained`` and
        ``eigenvalues``
    """
    if sup_rows is None:
        sup_rows = []
    if sup_cols is None:
        sup_cols = []
    # Calculate correspondence
    df = pd.DataFrame(data)
    row_is_sup = df.index.isin(sup_rows)
    col_is_sup = df.columns.isin(sup_cols)
    df2 = df.loc[~row_is_sup, ~col_is_sup]
    ca = prince.CA(n_components=min(df2.shape) - 1).fit(df2)
    # Calculate supplementary factors
    sup_row_res, sup_col_res = None, None
    if sup_rows:
        sup_df = df.loc[row_is_sup, ~col_is_sup]
        # Row profiles (each row divided by its total) times ``V_`` transposed
        sup_row_res = sup_df.divide(sup_df.sum(axis=1), axis=0) @ ca.V_.T
    if sup_cols:
        sup_df = df.loc[~row_is_sup, col_is_sup]
        # Column profiles (each column divided by its total) times ``U_``
        sup_col_res = sup_df.divide(sup_df.sum(axis=0), axis=1).T @ ca.U_
    # Calculate quality: share of the squared coordinates that falls on the
    # components labelled 0 and 1 (``.loc[:, :1]`` is a label slice).
    rows = pd.concat([ca.row_coordinates(df2), sup_row_res]).sort_index()
    rows2 = rows**2
    rows_quality = rows2.divide(rows2.sum(axis=1),
                                axis=0).loc[:, :1].sum(axis=1)
    rows_quality.loc[rows_quality.index.isin(sup_rows)] = 0
    cols = pd.concat([ca.column_coordinates(df2), sup_col_res]).sort_index()
    cols2 = cols**2
    cols_quality = cols2.divide(cols2.sum(axis=1),
                                axis=0).loc[:, :1].sum(axis=1)
    # Fixed: supplementary *columns* are identified by ``sup_cols``; the
    # previous code filtered on ``sup_rows`` here.
    cols_quality.loc[cols_quality.index.isin(sup_cols)] = 0
    # Prepare result data
    return {
        'rows': rows.loc[:, :1].values.tolist(),
        'cols': cols.loc[:, :1].values.tolist(),
        'rows_quality': rows_quality.values.tolist(),
        'cols_quality': cols_quality.values.tolist(),
        'explained': [x * 100 for x in ca.explained_inertia_[:2]],
        'eigenvalues': ca.eigenvalues_[:2],
    }
예제 #7
0
 def ca(self, df, col1, col2):
     """
     Run a correspondence analysis on the cross-tabulation of two columns.

     col1, col2: categorical column names of ``df``

     Shows the coordinate plot, prints the explained inertia and returns
     the fitted estimator.
     """
     contingency = pd.crosstab(df[col1], df[col2])
     model = prince.CA(
         n_components=2,
         n_iter=3,
         copy=True,
         check_input=True,
         engine='auto',
     ).fit(contingency)

     _, axis = plt.subplots(figsize=(10, 10))
     plot_options = dict(
         x_component=0,
         y_component=1,
         show_row_labels=True,
         show_col_labels=True,
     )
     model.plot_coordinates(X=contingency, ax=axis, **plot_options)
     plt.show()

     print("Explained inertia:", model.explained_inertia_)
     return model
예제 #8
0
 def display(self):
     """Fit a 2-component CA on ``self.df``, print and plot the coordinates.

     ``self.df``, ``self.df1`` and ``self.df2`` are loaded on demand through
     ``self.load()``, ``self.load1()`` and ``self.load2()``.  The coordinates
     are written to ``self.text``; the scatter plot is only drawn when all
     three data sets are available.
     """
     if self.df is None:
         self.load()
     # display if loaded
     have_coords = self.df is not None
     if have_coords:
         X = self.df
         ca = prince.CA(n_components=2, n_iter=3, copy=True,
                        check_input=True, engine='auto', random_state=42)
         ca = ca.fit(X)
         coordinat = ca.column_coordinates(X)
         koordinat = ca.row_coordinates(X)
         koordinatx = koordinat.loc[:, 0]
         koordinaty = koordinat.loc[:, 1]
         coordinatx = coordinat.loc[:, 0]
         coordinaty = coordinat.loc[:, 1]
         self.text.tag_configure('big', font=('Verdana', 10, 'bold'),
                                 foreground='blue')
         # NOTE(review): the row coordinates are printed under the
         # "VARIABEL" heading here but annotated with the object names in
         # the plot below - confirm which pairing is intended.
         self.text.insert('end', 'KOORDINAT VARIABEL\n', 'big', koordinat)
         self.text.insert('end', '\nKOORDINAT OBJEK\n', 'big', coordinat)
     if self.df1 is None:
         self.load1()
     if self.df2 is None:
         self.load2()
     # Plot only when the coordinates exist; previously a failed load of
     # ``self.df`` led to a NameError on the coordinate variables.
     if have_coords and self.df1 is not None and self.df2 is not None:
         NamaObjek = self.df1
         NamaVariabel = self.df2
         _, ax1 = plt.subplots()
         ax1.scatter(koordinatx, koordinaty)
         ax1.scatter(coordinatx, coordinaty)
         ax1.set_title('Plot Analisis Korespondensi')
         for (Objek1, _x1, _y1) in zip(NamaObjek, koordinatx, koordinaty):
             ax1.annotate(Objek1, (_x1, _y1), color='green')
         for (Variabel1, _x2, _y2) in zip(NamaVariabel, coordinatx, coordinaty):
             ax1.annotate(Variabel1, (_x2, _y2), color='blue')
         # Axes through the origin
         ax1.axhline(y=0, xmin=0, xmax=1, linewidth=2, color='k')
         ax1.axvline(x=0, ymin=0, ymax=1, linewidth=2, color='k')
     plt.show()
# One bar chart of loadings per axes object.
# NOTE(review): `axes`, `loadings` and `maxPC` are defined before this excerpt.
for i, ax in enumerate(axes):
    pc_loadings = loadings.loc[i, :]
    # Colour by sign: positive loadings use 'C0', all others 'C1'.
    colors = ['C0' if l > 0 else 'C1' for l in pc_loadings]
    ax.axhline(color='#888888')
    pc_loadings.plot.bar(ax=ax, color=colors)
    ax.set_ylabel(f'PC{i+1}')
    # Same symmetric y-range on every subplot.
    ax.set_ylim(-maxPC, maxPC)

plt.tight_layout()
plt.show()

### Correspondence analysis

# The first CSV column becomes the row index of the table.
housetasks = pd.read_csv(HOUSE_TASKS_CSV, index_col=0)

ca = prince.CA(n_components=2)
ca = ca.fit(housetasks)

ca.plot_coordinates(housetasks, figsize=(6, 6))
plt.tight_layout()
plt.show()

## K-Means Clustering
### A Simple Example

# Rows from 2011-01-01 onwards, restricted to the XOM and CVX columns.
df = sp500_px.loc[sp500_px.index >= '2011-01-01', ['XOM', 'CVX']]
kmeans = KMeans(n_clusters=4).fit(df)
# Attach the cluster label of every row as a new column.
df['cluster'] = kmeans.labels_
print(df.head())

centers = pd.DataFrame(kmeans.cluster_centers_, columns=['XOM', 'CVX'])
예제 #10
0
파일: test_ca.py 프로젝트: wangj347/prince
 def test_fit_numpy_array(self):
     """``fit`` on a raw NumPy array returns a ``prince.CA`` instance."""
     fitted = prince.CA(n_components=2).fit(self.X.values)
     self.assertIsInstance(fitted, prince.CA)
예제 #11
0
파일: test_ca.py 프로젝트: wangj347/prince
 def test_fit_pandas_dataframe(self):
     """``fit`` on ``self.X`` returns a ``prince.CA`` instance."""
     fitted = prince.CA(n_components=2).fit(self.X)
     self.assertIsInstance(fitted, prince.CA)
예제 #12
0
import matplotlib.pyplot as plt
import pandas as pd

import prince

# Load the table (first CSV column as row index) and keep three columns.
df = pd.read_csv('data/woman_work.csv', index_col=0)
df = df[['Stay at home', 'Part-time work', 'Full-time work']]

# NOTE(review): the data is passed to the constructor here, unlike the
# `CA(...).fit(X)` pattern; presumably an older prince API - confirm the
# installed version supports it and what `n_components=-1` selects.
ca = prince.CA(df, n_components=-1)

# Each plot helper is unpacked as a (figure, axes) pair.
fig1, ax1 = ca.plot_cumulative_inertia()
fig2, ax2 = ca.plot_rows_columns(show_row_labels=True, show_column_labels=True)

plt.show()
예제 #13
0
#print(keyList)
# Second command-line argument, parsed as an integer.
# NOTE(review): `data` and `keyList` are defined before this excerpt.
indexMainKey = int(sys.argv[2])

# Work on a copy so `data` keeps the column removed below.
X = data.copy()

del X[keyList[0]]

# Relabel the rows of X with the values of the column that was just removed.
X.rename(index=pd.Series(data[keyList[0]]), inplace=True)
#principalDf = pd.DataFrame(data = data[keyList[indexMainKey:]]
#             , columns = pd.Series(keyList[indexMainKey:]), index=pd.Series(data[keyList[0]]))

print(X)

# random_state=None: no fixed seed is supplied.
ca = prince.CA(n_components=3,
               n_iter=25,
               copy=True,
               check_input=True,
               engine='auto',
               random_state=None)

# Name the column and row axes of the table.
X.columns.rename('Caracteristicas', inplace=True)
X.index.rename('Ciudades', inplace=True)

ca = ca.fit(X)
print(ca.row_coordinates(X))
print(ca.column_coordinates(X))

print(ca.eigenvalues_)

print(ca.total_inertia_)

print(ca.explained_inertia_)
예제 #14
0
파일: CA.py 프로젝트: LuffyLuffy/BCAW-Tool
def CA(file):

    """correspondence analysis.

    Args:

        file (directory): csv file contains genes' RSCU values; its first
            column is used as the row index

    Returns:
        pandas.DataFrame: the saved column-coordinates file read back from
        disk, with its first column renamed to ``'gene id'``

    Side effects:
        - csv file (``<name>genes.csv``) with the first 4 axes of the column coordinates
        - csv file (``<name>codons.csv``) with the first 4 axes of the row coordinates
        - text file (``<name>.txt``, opened in append mode) with the explained and total inertia
        - png plot of the first 2 axes of the column coordinates
        - png plot of the first 2 axes of the row coordinates
    """
    import pandas as pd
    import prince
    import matplotlib.pyplot as plt

    def _apply_plot_style():
        # 'seaborn-dark-palette' was renamed to 'seaborn-v0_8-dark-palette'
        # in matplotlib 3.6 and the old name was removed later on; an
        # unknown style name raises OSError.
        try:
            plt.style.use('seaborn-dark-palette')
        except OSError:
            plt.style.use('seaborn-v0_8-dark-palette')

    axis_labels = ['axis 1', 'axis 2', 'axis 3', 'axis 4']

    file = str(file)
    df = pd.read_csv(file)
    df.set_index(df.iloc[:, 0], inplace=True)  # to make the first column is the index
    df.drop(df.columns[0], axis=1, inplace=True)
    # Exact zeros are replaced by a tiny positive value before fitting.
    df.replace(0, 0.0000001, inplace=True)

    ca = prince.CA(
        n_components=4,
        n_iter=3,
        copy=True,
        check_input=True,
        engine='auto',
        random_state=42
        )
    df.columns.rename('Gene Name', inplace=True)
    df.index.rename('Codons', inplace=True)
    ca = ca.fit(df)

    codons = ca.row_coordinates(df)  # row coordinates
    genes = ca.column_coordinates(df)  # column coordinates
    inertia = ca.explained_inertia_  # inertia for each axis

    # save information
    stem = file.replace(".csv", '')
    file_genes = stem + "genes" + ".csv"
    genes.rename(columns=dict(zip(genes.columns, axis_labels)), inplace=True)
    genes.to_csv(file_genes, sep=',', index=True, header=True)

    file_codons = stem + "codons" + ".csv"
    codons.rename(columns=dict(zip(codons.columns, axis_labels)), inplace=True)
    codons.to_csv(file_codons, sep=',', index=True, header=True)

    # One open/close instead of reopening the file for every line.
    file_inertia = file.replace('.csv', '.txt')
    with open(file_inertia, 'a') as f:
        f.write("explained inertia" + "\n")
        for axis_no, value in enumerate(inertia, start=1):
            f.write("axis " + str(axis_no) + " = " + str(value) + "\n")
        f.write("Total Inertia = " + str(ca.total_inertia_))

    # plot for the column coordinates
    _apply_plot_style()
    plt.figure()

    plt.xlabel("Axis 1")
    plt.ylabel("Axis 2")
    plt.title("CA-plot")
    plt.scatter(genes['axis 1'], genes['axis 2'], s=10, marker='o')

    plt.axhline(0, color='black', linestyle='-')
    plt.axvline(0, color='black', linestyle='-')

    # NOTE(review): ``file`` still ends in ".csv" here, so this plot is
    # saved as "<name>.csv_CA_gens_plot.png"; kept as-is so existing
    # consumers of that file name keep working.
    save_file_name__ca_plot = file + "_CA_gens_plot.png"
    plt.savefig(save_file_name__ca_plot)

    # plot for the row coordinates
    _apply_plot_style()
    plt.figure()

    plt.xlabel("Axis 1")
    plt.ylabel("Axis 2")
    plt.title("CA-plot")
    plt.scatter(codons['axis 1'], codons['axis 2'], s=10, marker='o')

    plt.axhline(0, color='black', linestyle='-')
    plt.axvline(0, color='black', linestyle='-')

    # Label the points only when there are few enough to stay readable;
    # the label is nudged 1% away from the origin.
    if len(codons) < 200:
        for x, y, t in zip(codons['axis 1'], codons['axis 2'],
                           codons.index.values):
            plt.text(x * (1 + 0.01), y * (1 + 0.01), t)

    save_file_name__ca_codons_plot = stem + "_CA_codos_plot.png"
    plt.savefig(save_file_name__ca_codons_plot)

    read_genes_file = pd.read_csv(file_genes)
    # Fixed: only the first column of the file read back (the saved index)
    # becomes 'gene id'.  The previous mapping was keyed on the already
    # renamed ``genes.columns`` and therefore shifted 'axis 1'..'axis 3'.
    read_genes_file.rename(columns={read_genes_file.columns[0]: 'gene id'},
                           inplace=True)

    return read_genes_file
예제 #15
0
 def test_negative_input(self):
     """Fitting a table that contains a negative cell raises ``ValueError``."""
     self.X.iloc[0, 0] = -1
     estimator = prince.CA()
     self.assertRaises(ValueError, estimator.fit, self.X)
예제 #16
0
 def test_transform_numpy_array(self):
     """``transform`` on a raw NumPy array returns a pandas ``DataFrame``."""
     fitted = prince.CA(n_components=2).fit(self.X.values)
     projected = fitted.transform(self.X.values)
     self.assertIsInstance(projected, pd.DataFrame)
예제 #17
0
파일: ca.py 프로젝트: vishalbelsare/prince
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))

import pandas as pd
import prince
from sklearn import datasets

# Load the table; the first CSV column becomes the row index.
X = pd.read_csv('children.csv', index_col=0)

# Fit a correspondence analysis with the default settings.
ca = prince.CA().fit(X)

# Dump the fitted attributes section by section, separated by '---'.
print('Eigenvalues')
print(ca.eigenvalues_)
print(ca.explained_inertia_)
print('---')

print('U')
print(ca.U_[:5])  # first five entries only
print('---')

print('V')
print(ca.V_)
print('---')

print('s')
print(ca.s_)
print('---')

print('Row coords')
print(ca.row_coordinates(X)[:5])  # first five rows only
예제 #18
0
 def test_transform_pandas_dataframe(self):
     """``transform`` on ``self.X`` returns a pandas ``DataFrame``."""
     fitted = prince.CA(n_components=2).fit(self.X)
     projected = fitted.transform(self.X)
     self.assertIsInstance(projected, pd.DataFrame)
예제 #19
0
    'Q36', 'Q37', 'Q39c'
]
# Flatten the per-question column lists into one list of column names and
# drop the last entry.
qqlist1 = []
for q in qqlist:
    qqlist1 += code[q]['qlist']
qqlist1 = qqlist1[:-1]

X = data[qqlist1]
# Keep only the rows without any missing value.
X = X[X.T.notnull().all()]
#X_index=X.index
# Rebuild the frame from the raw values so the index restarts at 0.
# ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values`` returns
# the same array on every pandas version.
X = pd.DataFrame(X.values, columns=qqlist1)
X = X.fillna(0)

# Clustering
est = KMeans(3)  # 3 clusters
est.fit(X)
labels = pd.Series(est.labels_)

# Visualise the samples on the first two principal components
X_PCA = PCA(2).fit_transform(X)
# NOTE(review): ``plt.cm.get_cmap`` is deprecated in recent matplotlib
# releases - confirm against the installed version.
kwargs = dict(cmap=plt.cm.get_cmap('rainbow', 10), edgecolor='none', alpha=0.6)
plt.scatter(X_PCA[:, 0], X_PCA[:, 1], c=labels, **kwargs)

print(labels.value_counts())
# Per-cluster mean of every column, used as the CA input table.
X1 = X.copy()
X1['labels'] = labels
t = X1.groupby(['labels']).mean()
#t=t.rename(columns=code['Q11']['code'])
# NOTE(review): the data is passed to the constructor here; presumably an
# older prince API - confirm the installed version supports it.
ca = prince.CA(t)
ca.plot_rows_columns(show_row_labels=True, show_column_labels=True)
import matplotlib.gridspec as gridspec
import numpy as np
import prince
from sklearn.cluster import KMeans
from adjustText import adjust_text
import matplotlib.gridspec as gridspec

pic_type = 'png'

# First CSV row is the header, first column the row index.
data = pd.read_csv('./birdseed.csv', sep=',', header=0, index_col=0)

data.columns.rename('Bird Type', inplace=True)
data.index.rename('Seed Type', inplace=True)

#Correspondence Analysis using the prince library
ca = prince.CA(n_components=len(data), n_iter=3, copy=True, engine='auto')
ca = ca.fit(data)

#Plot the inertia to justify the use of this method
plt.figure()
# Fixed: the two legend labels were swapped - the cumulative sum was
# labelled as the individual inertia and vice versa.
plt.plot(np.cumsum(ca.explained_inertia_), label='Cum. Sum. of Inertia')
plt.plot(ca.explained_inertia_, label='Ind. Inertia')
plt.legend(loc=2, fancybox=True, framealpha=1)
plt.title('Correspondence Analysis Inertia')
plt.xlabel('Principal Components')
plt.ylabel('Inertia')
plt.savefig('CA Inertia.png', format='png', bbox_inches='tight')
# ``block`` must be passed by keyword on current matplotlib versions.
plt.show(block=False)

#Let's try to apply k-means algorithm to the CA data to auto-identify clusters
n_clusters = 3