Exemplo n.º 1
0
class ImputationImplementation(DataOperationImplementation):
    """ Tabular-data operation that fills missing values with SimpleImputer

    :param params: optional keyword arguments passed on to SimpleImputer
    """

    def __init__(self, **params: Optional[dict]):
        super().__init__()
        # Unpacking an empty mapping equals calling SimpleImputer() with its
        # defaults, so no separate "no parameters" branch is required
        self.imputer = SimpleImputer(**params)
        self.params = params

    def fit(self, input_data):
        """
        Fit the wrapped SimpleImputer on the features of the given data

        :param input_data: data with features
        :return imputer: the fitted SimpleImputer
        """
        features = input_data.features
        self.imputer.fit(features)
        return self.imputer

    def transform(self, input_data, is_fit_chain_stage: Optional[bool] = None):
        """
        Apply the wrapped SimpleImputer to the features of the given data

        :param input_data: data with features
        :param is_fit_chain_stage: is this fit or predict stage for chain
            (accepted but not consulted by this operation)
        :return output_data: result of _convert_to_output for the
            transformed features
        """
        imputed = self.imputer.transform(input_data.features)
        return self._convert_to_output(input_data, imputed)

    def get_params(self):
        """
        :return: parameters reported by the wrapped SimpleImputer
        """
        imputer_params = self.imputer.get_params()
        return imputer_params
# Inspect a vectorizer that is created outside this excerpt
# (presumably a fitted scikit-learn text vectorizer — TODO confirm).
vectorizer.get_stop_words()
# NOTE(review): scikit-learn deprecated `get_feature_names` in 1.0 and removed
# it in 1.2 in favour of `get_feature_names_out` — confirm the pinned version.
vectorizer.get_feature_names()

# Vectorize `corpus` (also defined outside this excerpt) and look at the result.
X = vectorizer.transform(corpus)
type(X)
X.shape
# The `toarray()` call suggests X is a sparse matrix being densified — TODO confirm.
X.toarray()

import numpy as np
from sklearn.impute import SimpleImputer
# Demo: replace NaN entries with the per-column mean learned during fit.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_data = [[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]
imp_mean.fit(imp_data)
# Per-column fill values learned from imp_data; NaN is skipped when averaging,
# so the expected result is [7.0, 3.5, 6.0].
imp_mean.statistics_
# New data whose NaNs are filled with the statistics learned above.
X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
# Constructor parameters of the imputer (not the fitted statistics).
imp_mean.get_params()
# Expected: [[7, 2, 3], [4, 3.5, 6], [10, 3.5, 9]] (as floats).
imp_mean.transform(X)

from sklearn.impute import MissingIndicator
# Demo: build boolean masks that mark where values are missing.
X1 = np.array([[np.nan, 1, 3], [4, 0, np.nan], [8, 1, 0]])
X2 = np.array([[5, 1, np.nan], [np.nan, 2, 3], [2, 4, 0]])
# Default settings: only columns that contained NaN during fit are reported.
indicator = MissingIndicator()
indicator.fit(X1)
# Indices of the reported columns; X1 has NaN in columns 0 and 2.
indicator.features_
X1
# Mask for X1 restricted to columns 0 and 2:
# [[True, False], [False, True], [False, False]]
indicator.transform(X1)
X2
# Same two columns, applied to X2:
# [[False, True], [True, False], [False, False]]
indicator.transform(X2)

# features='all' reports every column, whether or not it had NaN during fit.
indicator_all = MissingIndicator(features='all')
# Full 3x3 mask for X1:
# [[True, False, False], [False, False, True], [False, False, False]]
indicator_all.fit_transform(X1)
Exemplo n.º 3
0
# Vectorize the features and the labels. `vec`, `features_array` and `lables`
# are defined outside this excerpt (presumably `vec` is a scikit-learn
# vectorizer — TODO confirm).
vec_data = vec.fit_transform(features_array).toarray()
# Feature names are captured before `vec` is refit on the labels below.
# NOTE(review): scikit-learn removed `get_feature_names` in 1.2 in favour of
# `get_feature_names_out` — confirm the pinned version.
feature_names = vec.get_feature_names()
# NOTE(review): this refits the same `vec` object on the labels, so afterwards
# it is no longer fitted to `features_array` — confirm nothing reuses it later.
vec_lables = vec.fit_transform(lables).toarray().ravel()
print('data vectorized')

# -------------- impute missing data
# Every NaN is replaced by the constant 0; copy=False asks the imputer to work
# in place where it can.
# NOTE(review): the `verbose` argument was deprecated in scikit-learn 1.1 and
# removed in 1.3 — confirm the pinned version accepts it.
imp = SimpleImputer(missing_values=np.nan,
                    strategy='constant',
                    fill_value=0,
                    copy=False,
                    verbose=True)
# Alternative configuration (mean imputation), kept for reference:
#imp = SimpleImputer(missing_values=np.nan, strategy = 'mean', copy=False, verbose = 1)
imp.fit(vec_data)
vec_data = imp.transform(vec_data)
# Persist the fitted imputer; the relative path resolves against the current
# working directory.
joblib.dump(imp, 'imputer.joblib')
print('imputation completed: ', imp.get_params())
print('splitting...')
# `test_size` is defined outside this excerpt; the fixed random_state makes the
# split reproducible between runs.
train_data, test_data, train_lables, test_lables = train_test_split(
    vec_data, vec_lables, test_size=test_size, random_state=0)
print('data splitted')

# Collect everything in `labled_data` (a mapping created outside this excerpt).
labled_data['feature_names'] = feature_names
labled_data['train_data'] = train_data
labled_data['test_data'] = test_data
labled_data['train_lables'] = train_lables
labled_data['test_lables'] = test_lables

print('saving...')
# Workspace root under the current user's home directory.
ws_path = os.path.expanduser('~/robot_host_ws/')
data_path = os.path.join(
    ws_path, 'Classifiers/' + str(sys.argv[1]).split('/')[-1].split('.')[0] +