예제 #1
0
 def test_transform_numpy_array(self):
     """MFA fitted on a raw NumPy array should still transform to a DataFrame.

     A bare ndarray has no column labels, so the group definitions are
     first translated from column names to positional indices.
     """
     positional_groups = {
         name: [self.X.columns.get_loc(col) for col in cols]
         for name, cols in self.groups.items()
     }
     mfa = prince.MFA(groups=positional_groups)
     # assertIsInstance gives a readable failure message, unlike
     # assertTrue(isinstance(...)).
     self.assertIsInstance(mfa.fit(self.X.values).transform(self.X.values),
                           pd.DataFrame)
예제 #2
0
 def test_plot_partial_row_coordinates(self):
     """plot_partial_row_coordinates should return a Matplotlib Axes."""
     mfa = prince.MFA(groups=self.groups)
     # Cast a few columns to str so the fitted frame contains a
     # categorical group alongside the numerical ones.
     for col in ['E1 fruity', 'E1 woody', 'E1 coffee']:
         self.X[col] = self.X[col].astype(str)
     mfa.fit(self.X)
     ax = mfa.plot_partial_row_coordinates(self.X)
     # assertIsInstance gives a readable failure message, unlike
     # assertTrue(isinstance(...)).
     self.assertIsInstance(ax, mpl.axes.Axes)
예제 #3
0
 def test_fit_numpy_array(self):
     """Fitting on a raw NumPy array should return the MFA instance.

     Column-name groups are converted to positional indices because an
     ndarray carries no column labels.
     """
     positional_groups = {
         name: [self.X.columns.get_loc(col) for col in cols]
         for name, cols in self.groups.items()
     }
     mfa = prince.MFA(groups=positional_groups,
                      rescale_with_mean=False,
                      rescale_with_std=False)
     # assertIsInstance gives a readable failure message, unlike
     # assertTrue(isinstance(...)).
     self.assertIsInstance(mfa.fit(self.X.values), prince.MFA)
예제 #4
0
파일: mfa.py 프로젝트: vishalbelsare/prince
    'ens': 2
}

# Slice consecutive runs of column names out of `variables`, one run per
# entry of `group_sizes`, to build the {group name: [columns]} mapping
# that prince.MFA expects.
# NOTE(review): `group_sizes` (dict name -> count) and `variables`
# (ordered column list) are defined above this chunk -- confirm there.
i = 0
groups = {}

for name, n in group_sizes.items():
    groups[name] = variables[i:i + n]
    i += n


# Fit a reproducible 5-component MFA on the grouped columns of X.
mfa = prince.MFA(
    groups=groups,
    n_components=5,
    n_iter=3,
    copy=True,
    check_input=True,
    engine='auto',
    random_state=42
)
mfa = mfa.fit(X)

# print('ORIG')
# print(mfa.partial_factor_analysis_['orig'].eigenvalues_)
# print(mfa.partial_factor_analysis_['orig'].s_)
# print('---')

# print('VIS')
# print(mfa.partial_factor_analysis_['vis'].eigenvalues_)
# print(mfa.partial_factor_analysis_['vis'].s_)
# print('---')
예제 #5
0
def Preprocess(data_frame,
               target=None,
               method='FAMD',
               samples=None,
               mapper=None,
               num_components=3,
               scaler=None,
               encode_method='Binary',
               target_encoder=None,
               data_encoder=None,
               data_columns_dict=None,
               target_column_dict=None,
               groups=None,
               normalization='l2'):
    """Encode, project (MFA/PCA) and normalize a data set.

    Parameters
    ----------
    data_frame : pandas.DataFrame holding features plus a target column.
    target : name of the target column; defaults to the last column.
    method : 'MFA' or 'PCA'. 'PCA' is currently redirected to 'MFA'
        because the dummy encoding is broken (see the TODO below).
    samples : optional DataFrame of row labels -- all but its last column
        select training rows, the last column selects test rows.  When
        None the whole frame is treated as a test set and the fitted
        artefacts (``mapper``, ``scaler``, encoders, column dicts) are
        expected to come from a previous call.
    mapper, scaler, target_encoder, data_encoder, data_columns_dict,
    target_column_dict : fitted artefacts from a previous call, reused
        so new data is transformed consistently.
    num_components : number of components kept by the projection.
    encode_method : encoding scheme handed to ``Fit_Encode``.
    groups : MFA column groups; derived from ``data_columns_dict``
        when None.
    normalization : 'l1', 'l2', 'max', 'standard' or None.

    Returns
    -------
    tuple
        Shape depends on ``method`` and ``samples`` -- see the return
        statements below.
    """
    # If no target supplied, use the last column of the frame as target.
    if not target: target = data_frame.columns.values.tolist()[-1]

    # TODO: Fix PCA.
    if method == 'PCA':
        print('Dummy is not functioning properly.')
        method = 'MFA'

    # None is a legal value, so guard before calling .lower()
    # (the original crashed with AttributeError on None).
    if normalization is not None:
        normalization = normalization.lower()
    if normalization not in ['l1', 'l2', 'max', 'standard', None]:
        print('Not a valid normalization method change to None')
        normalization = None

    if samples is not None:

        # Sample the data set, split into training and testing sets.
        train_data = data_frame.loc[samples.iloc[:, :-1].values.flatten(), :]
        test_data = data_frame.loc[samples.iloc[:, -1].values.flatten(), :]
        train_target = train_data[target].copy()
        test_target = test_data[target].copy()
        train_data = train_data.drop(columns=[target])
        test_data = test_data.drop(columns=[target])

        # Encode the data sets: fit encoders on the training split and
        # reuse them (mappings/columns_dict) on the test split.
        train_data, data_encoder, data_columns_dict = Fit_Encode(
            train_data, method=encode_method)
        test_data, _, _ = Fit_Encode(test_data,
                                     mappings=data_encoder,
                                     columns_dict=data_columns_dict,
                                     method=encode_method)
        train_target, target_encoder, target_column_dict = Fit_Encode(
            train_target, method=encode_method)
        test_target, _, _ = Fit_Encode(test_target,
                                       mappings=target_encoder,
                                       columns_dict=target_column_dict,
                                       method=encode_method)

    else:  # No samples supplied: process the entire data set as a whole.
        test_data = data_frame.copy()
        test_target = test_data[target].copy()
        test_data = test_data.drop(columns=[target])
        test_data, test_data_encoder, test_columns_dict = Fit_Encode(
            test_data,
            mappings=data_encoder,
            columns_dict=data_columns_dict,
            method=encode_method)
        print('Test Data Encoded')
        test_target, test_target_encoder, _ = Fit_Encode(
            test_target,
            mappings=target_encoder,
            columns_dict=target_column_dict,
            method=encode_method)
        print('Test targets encoded')

    if method == 'MFA':

        # Derive the MFA column groups from the encoded column headers
        # ("<key>_<subname>") when the caller did not provide them.
        if not groups:
            groups = {}
            for key in data_columns_dict.keys():
                names = ['_' + s for s in data_columns_dict[key]]
                column_headers = [x + y for x, y in it.product([key], names)]
                groups[key] = column_headers

        if not mapper:  # Create the MFA mapper.
            print('No mapper found')
            # Consider passing **kwargs in Preprocess func. to pass in mappers.
            mapper = pr.MFA(
                groups=groups,
                n_components=num_components,
                n_iter=100,
                #rescale_with_mean = True, # Does not work. Can use sklearn Standard scaller.
                #rescale_with_std = True,
                copy=True,
                check_input=True,
                engine='auto',
                random_state=None)

        print('Fitting MFA')
        if samples is not None:

            # Fit the mapper on the training split.  (The original
            # called `mfa.fit(...)` here, a NameError whenever a mapper
            # was passed in; fitting `mapper` itself fixes that.)
            mapper = mapper.fit(train_data)
            vecs_train = pd.DataFrame(mapper.row_coordinates(train_data))
            vecs_test = pd.DataFrame(mapper.transform(test_data))

            vecs_train, scaler = Normalization(vecs_train, normalization,
                                               scaler)
            vecs_test, scaler = Normalization(vecs_test, normalization, scaler)

            return vecs_train, train_target, vecs_test, test_target, data_columns_dict, target_column_dict, data_encoder, target_encoder, groups, target, mapper, scaler

        else:
            # Transform the whole set with the supplied mapper.
            # NOTE(review): this branch has no training data, so the
            # caller must supply an already-fitted mapper.
            vecs_test = pd.DataFrame(mapper.transform(test_data))

            vecs_test, scaler = Normalization(vecs_test, normalization, scaler)
            # Consider returning a single dictionary with all parameters;
            # each case has a different number of returned variables.

            return vecs_test, test_target, test_data_encoder, test_target_encoder, mapper, target, scaler

    elif method == 'PCA':

        if not mapper:

            mapper = pr.PCA(n_components=num_components,
                            n_iter=100,
                            rescale_with_mean=True,
                            rescale_with_std=True,
                            copy=True,
                            check_input=True,
                            engine='auto',
                            random_state=None)

        if samples is not None:

            pca_train = mapper.fit(train_data)
            vecs_train = pd.DataFrame(pca_train.row_coordinates(train_data))
            pca_test = mapper.transform(test_data)
            vecs_test = pd.DataFrame(pca_test.row_coordinates(test_data))

            if normalization in ['l1', 'l2', 'max']:
                scaler = None
                vecs_train = pd.DataFrame(preprocessing.normalize(
                    vecs_train, norm=normalization, axis=1),
                                          columns=vecs_train.columns)

                vecs_test = pd.DataFrame(preprocessing.normalize(
                    vecs_test, norm=normalization, axis=1),
                                         columns=vecs_test.columns)

            elif normalization == 'standard':
                scaler = preprocessing.StandardScaler()
                vecs_train = pd.DataFrame(scaler.fit_transform(vecs_train),
                                          columns=vecs_train.columns)
                # Reuse the scaler fitted on the training vectors.  The
                # original refit it on the test set, leaking test-set
                # statistics into the scaling.
                vecs_test = pd.DataFrame(scaler.transform(vecs_test),
                                         columns=vecs_test.columns)

            # Fixed NameError: the original returned the undefined names
            # `target_encoders` and `data_endoder`.
            return vecs_train, train_target, vecs_test, test_target, target_encoder, data_encoder, mapper, target, scaler

        else:

            test_data, data_encoder = encode_categorical(
                test_data[target].copy(),
                encode_method=encode_method,
                encoder=data_encoder)

            pca_test = mapper.fit(test_data)
            vecs_test = pd.DataFrame(pca_test.row_coordinates(test_data))

            if normalization in ['l1', 'l2', 'max']:
                scaler = None
                vecs_test = pd.DataFrame(preprocessing.normalize(
                    vecs_test, norm=normalization, axis=1),
                                         columns=vecs_test.columns)

            elif normalization == 'standard':
                scaler = preprocessing.StandardScaler()
                vecs_test = pd.DataFrame(scaler.fit_transform(vecs_test),
                                         columns=vecs_test.columns)

            return vecs_test, test_target, mapper, target
예제 #6
0
 def test_transform_pandas_dataframe(self):
     """Transforming the fitted DataFrame should yield a DataFrame."""
     mfa = prince.MFA(groups=self.groups,
                      rescale_with_mean=False,
                      rescale_with_std=False)
     # assertIsInstance gives a readable failure message, unlike
     # assertTrue(isinstance(...)).
     self.assertIsInstance(mfa.fit(self.X).transform(self.X), pd.DataFrame)
예제 #7
0
 def test_fit_pandas_dataframe(self):
     """fit on a DataFrame should return the MFA instance itself."""
     mfa = prince.MFA(groups=self.groups,
                      rescale_with_mean=False,
                      rescale_with_std=False)
     # assertIsInstance gives a readable failure message, unlike
     # assertTrue(isinstance(...)).
     self.assertIsInstance(mfa.fit(self.X), prince.MFA)
예제 #8
0
 def test_transform_pandas_dataframe(self):
     """Transforming the fitted DataFrame should yield a DataFrame."""
     mfa = prince.MFA(groups=self.groups)
     # assertIsInstance gives a readable failure message, unlike
     # assertTrue(isinstance(...)).
     self.assertIsInstance(mfa.fit(self.X).transform(self.X), pd.DataFrame)
예제 #9
0
 def test_fit_pandas_dataframe(self):
     """fit on a DataFrame should return the MFA instance itself."""
     mfa = prince.MFA(groups=self.groups)
     # assertIsInstance gives a readable failure message, unlike
     # assertTrue(isinstance(...)).
     self.assertIsInstance(mfa.fit(self.X), prince.MFA)
예제 #10
0
 def test_mixed_groups(self):
     """A group mixing categorical and numerical columns must raise."""
     # Make one column categorical so its group becomes mixed-type.
     self.X['E1 fruity'] = self.X['E1 fruity'].astype('category')
     model = prince.MFA(groups=self.groups)
     with self.assertRaises(ValueError):
         model.fit(self.X)
예제 #11
0
 def test_no_groups(self):
     """Fitting an MFA constructed without groups must raise ValueError."""
     with self.assertRaises(ValueError):
         prince.MFA().fit(self.X)
예제 #12
0
# Printing basic info of dataset
print ('Number of records:',data.shape[0])
print ('Number of attributes:',data.shape[1])


# Printing Column names
print([a for a in data.columns])

# Drop non-feature columns before the factor analysis.
raw_data = data.drop(['timestamp','group'],axis=1)

# Stadardizing the dataset
#std_rawdata = preprocessing.StandardScaler().fit_transform(raw_data)
"""
mca = prince.MCA(n_components =2, n_iter=3,copy=True,engine='auto')
mca = mca.fit(raw_data)
"""

# Two variable groups for the MFA, keyed by group name.
groups ={'physical':['disengaged','looking','talking','intTech','intRes','intExt'],'logs':['Accessed','Create','Open','Update']}



# Fit a 2-component MFA on the grouped columns.
mfa = prince.MFA(groups=groups,n_components = 2)
mfa = mfa.fit(raw_data)


#mcadf = mca.row_coordinates(raw_data)
#mcadf.to_csv('mcaresult.csv')

# Export the MFA row coordinates to CSV.
mfadf = mfa.row_coordinates(raw_data)
mfadf.to_csv('mfaresult2.csv')
# Drop the leading index/metadata columns from each session's frame.
# NOTE(review): data_8nov / data_22nov / data_6dec and the scaler1..4,
# group_all / group_partial / groups / groups1 objects are defined
# earlier in the file -- confirm there.
data_ready_8nov = data_8nov.drop(data_8nov.columns[[0,1,2]],axis=1)
data_ready_22nov = data_22nov.drop(data_22nov.columns[[0,1,2]],axis=1)
data_ready_6dec = data_6dec.drop(data_6dec.columns[[0,1]],axis=1)

# Drop rows with any missing value from the 22 Nov session.
data_ready_22nov.dropna(axis=0,how="any",inplace=True)

# Scale each session with its own scaler and restore the column labels
# (6 Dec has fewer columns, hence group_partial).
data_ready_18oct = pd.DataFrame(scaler1.fit_transform(data_ready_18oct),columns=group_all)
data_ready_8nov = pd.DataFrame(scaler2.fit_transform(data_ready_8nov),columns=group_all)
data_ready_22nov = pd.DataFrame(scaler3.fit_transform(data_ready_22nov),columns=group_all)
data_ready_6dec = pd.DataFrame(scaler4.fit_transform(data_ready_6dec),columns=group_partial)




#famd = prince.FAMD(n_components = 2)
# One 2-component MFA per session; the 6 Dec session uses its own
# group definition (groups1) to match its reduced column set.
mfa1 = prince.MFA(groups=groups,n_components=2)
mfa2 = prince.MFA(groups=groups,n_components=2)
mfa3 = prince.MFA(groups=groups,n_components=2)
mfa4 = prince.MFA(groups=groups1,n_components=2)

#famd_result = famd.fit_transform(std_data_8nov)
mfa_result_18oct = mfa1.fit_transform(data_ready_18oct)
mfa_result_8nov = mfa2.fit_transform(data_ready_8nov)
mfa_result_22nov = mfa3.fit_transform(data_ready_22nov)
mfa_result_6dec = mfa4.fit_transform(data_ready_6dec)

#famd.to_csv('famd_result.csv')
# Export each session's row coordinates to its own CSV.
mfa_result_18oct.to_csv('mfa_result_18oct.csv')
mfa_result_8nov.to_csv('mfa_result_8nov.csv')
mfa_result_22nov.to_csv('mfa_result_22nov.csv')
mfa_result_6dec.to_csv('mfa_result_6dec.csv')