class ImputationImplementation(DataOperationImplementation):
    """Data operation that imputes missing values in tabular data.

    Thin wrapper around sklearn's ``SimpleImputer`` conforming to the
    project's ``DataOperationImplementation`` interface.

    :param params: optional keyword arguments forwarded verbatim to
        ``SimpleImputer`` (e.g. ``strategy``, ``missing_values``)
    """

    def __init__(self, **params):
        super().__init__()
        # SimpleImputer(**{}) is identical to SimpleImputer(), so no
        # separate "default parameters" branch is needed.
        self.imputer = SimpleImputer(**params)
        self.params = params

    def fit(self, input_data):
        """Fit the underlying SimpleImputer on the input features.

        :param input_data: data object with a ``features`` attribute
        :return: the fitted ``SimpleImputer`` instance
        """
        self.imputer.fit(input_data.features)
        return self.imputer

    def transform(self, input_data, is_fit_chain_stage: Optional[bool] = None):
        """Impute missing values in ``input_data.features``.

        :param input_data: data object with a ``features`` attribute
        :param is_fit_chain_stage: flag distinguishing the fit stage from the
            predict stage of a chain; unused here, kept for interface parity
        :return: output data with the imputed features attribute
        """
        transformed_features = self.imputer.transform(input_data.features)
        # Wrap the transformed array back into the project's output type
        output_data = self._convert_to_output(input_data, transformed_features)
        return output_data

    def get_params(self):
        """Return the parameters of the underlying SimpleImputer."""
        return self.imputer.get_params()
# --- Demo / scratch script ---
# NOTE(review): `vectorizer` and `corpus` are defined elsewhere — presumably a
# fitted sklearn text vectorizer and its input documents; confirm against the
# surrounding file. Many statements below are bare expressions (REPL-style
# inspection) whose values are discarded when run as a script.
vectorizer.get_stop_words()
vectorizer.get_feature_names()
X = vectorizer.transform(corpus)
type(X)
X.shape
X.toarray()

import numpy as np
from sklearn.impute import SimpleImputer

# Mean imputation: learn per-column means from imp_data, then fill NaNs in X.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_data = [[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]
imp_mean.fit(imp_data)
imp_mean.statistics_  # learned column means used as fill values
X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
imp_mean.get_params()
imp_mean.transform(X)

from sklearn.impute import MissingIndicator

# MissingIndicator: boolean mask of missing entries. By default only features
# that had missing values during fit are reported (see `features_` below).
X1 = np.array([[np.nan, 1, 3], [4, 0, np.nan], [8, 1, 0]])
X2 = np.array([[5, 1, np.nan], [np.nan, 2, 3], [2, 4, 0]])
indicator = MissingIndicator()
indicator.fit(X1)
indicator.features_  # indices of features with missing values in X1
X1
indicator.transform(X1)
X2
indicator.transform(X2)
# features='all' reports an indicator column for every feature, not just
# those that contained missing values at fit time.
indicator_all = MissingIndicator(features='all')
indicator_all.fit_transform(X1)
# Vectorize features and labels.
# NOTE(review): `vec`, `features_array`, `lables`, `labled_data`, and
# `test_size` are defined outside this view — presumably a sklearn
# vectorizer and the raw dataset; confirm against the surrounding file.
vec_data = vec.fit_transform(features_array).toarray()
feature_names = vec.get_feature_names()
vec_lables = vec.fit_transform(lables).toarray().ravel()
print('data vectorized')

# --------------impute missing data
# copy=False imputes in place on vec_data to avoid an extra array copy.
imp = SimpleImputer(missing_values=np.nan, strategy='constant',
                    fill_value=0, copy=False, verbose=True)
#imp = SimpleImputer(missing_values=np.nan, strategy = 'mean', copy=False, verbose = 1)
imp.fit(vec_data)
vec_data = imp.transform(vec_data)
# Persist the fitted imputer so the same fill policy can be reapplied later.
joblib.dump(imp, 'imputer.joblib')
print('imputation completed: ', imp.get_params())

# Train/test split with a fixed seed for reproducibility.
print('splitting...')
train_data, test_data, train_lables, test_lables = train_test_split(
    vec_data, vec_lables, test_size=test_size, random_state=0)
print('data splitted')

# Collect everything into the shared dict for saving.
labled_data['feature_names'] = feature_names
labled_data['train_data'] = train_data
labled_data['test_data'] = test_data
labled_data['train_lables'] = train_lables
labled_data['test_lables'] = test_lables

print('saving...')
# Output path is derived from the basename (without extension) of argv[1].
ws_path = os.path.expanduser('~/robot_host_ws/')
data_path = os.path.join(
    ws_path, 'Classifiers/' + str(sys.argv[1]).split('/')[-1].split('.')[0] +
    # NOTE(review): statement truncated in this view — the remainder of the
    # os.path.join(...) expression continues beyond the visible source.