def VarianceThreshold_selector(data, th):
    """Return ``data`` restricted to the columns kept by ``VarianceThreshold``.

    Args:
        data: pandas DataFrame of features to filter.
        th: Variance threshold handed to ``VarianceThreshold``; with 0.0 only
            features that have the same value in all samples are removed.

    Returns:
        A new DataFrame built from the transformed array, whose columns carry
        the original names of the retained features. The frame is rebuilt
        from a plain array, so it has a default row index.
    """
    selector = VarianceThreshold(threshold=th)
    selector.fit(data)

    # get_support(indices=True) yields integer *positions* of the retained
    # features. They must be translated through data.columns: indexing the
    # frame with them (data[positions]) is a label lookup and fails whenever
    # the column labels are not those same integers.
    kept_positions = selector.get_support(indices=True)
    kept_names = list(data.columns[kept_positions])

    reduced = pd.DataFrame(selector.transform(data))
    reduced.columns = kept_names
    return reduced
def VarianceThreshold_selector(data, th=.8 * (1 - .8)):
    """Return ``data`` restricted to the columns kept by ``VarianceThreshold``.

    Args:
        data: pandas DataFrame of features to filter.
        th: Variance threshold handed to ``VarianceThreshold``. Defaults to
            ``.8 * (1 - .8)``, i.e. the variance ``p * (1 - p)`` of a boolean
            feature with ``p = 0.8``, which was previously hard-coded here.

    Returns:
        A new DataFrame built from the transformed array, whose columns carry
        the original names of the retained features. The frame is rebuilt
        from a plain array, so it has a default row index.
    """
    selector = VarianceThreshold(threshold=th)
    selector.fit(data)

    # get_support(indices=True) gives integer positions of the retained
    # features; look the names up positionally in the frame's column list.
    kept_positions = selector.get_support(indices=True)
    all_names = list(data)
    kept_names = [all_names[position] for position in kept_positions]

    reduced = pd.DataFrame(selector.transform(data))
    reduced.columns = kept_names
    return reduced
def variance_threshold(self: pd.DataFrame, cp, fecha, threshold=0.0):
    """Keep the key columns plus every column that passes a variance filter.

    VarianceThreshold is a simple baseline approach to feature selection. It
    removes all features whose variance doesn't meet some threshold. By
    default, it removes all zero-variance features, i.e. features that have
    the same value in all samples. As an example, suppose that we have a
    dataset with boolean features, and we want to remove all features that
    are either one or zero (on or off) in more than 80% of the samples.

    Each non-key column is evaluated on its own: values are coerced to
    numeric, rows that fail the coercion (NaN) are dropped, and the selector
    is fitted on what remains.

    Args:
        self: DataFrame to filter. It is not modified.
        cp: List of column names that are always kept, in addition to
            'id_siniestro', 'id_poliza' and 'cod_filiacion'.
        fecha: List of further column names that are always kept.
        threshold: Variance threshold handed to ``VarianceThreshold``.

    Returns:
        A DataFrame with the key columns that exist in ``self`` (in the order
        they are listed in the key set) followed by the columns that passed
        the filter, in their original order.
    """
    column_names = self.columns.values.tolist()
    key_variables = ['id_siniestro', 'id_poliza', 'cod_filiacion'] + cp + fecha

    # Key columns bypass the filter; names absent from the frame are ignored.
    removed_var = []
    for key in key_variables:
        if key in column_names:
            column_names.remove(key)
            removed_var.append(key)

    kept_names = []
    for name in column_names:
        candidate = self[[name]].apply(pd.to_numeric, errors='coerce')
        candidate = candidate.dropna(how='any', axis=0)
        selection = VarianceThreshold(threshold=threshold)
        try:
            selection.fit(candidate)
        except (ValueError, TypeError):
            # A failed fit is how a column gets rejected: scikit-learn raises
            # ValueError when no feature meets the threshold or when no
            # usable rows are left after the numeric coercion.
            continue
        support = selection.get_support(indices=True)
        kept_names.extend(candidate.columns[support].tolist())

    # De-duplicate while preserving column order so the output layout is
    # deterministic (a set would make it depend on hash ordering).
    kept_names = list(dict.fromkeys(kept_names))
    return self[removed_var + kept_names]