def task3(file_path):
    binning3 = preprocessing.KBinsDiscretizer(n_bins=3)
    binning6 = preprocessing.KBinsDiscretizer(n_bins=6)
    binning9 = preprocessing.KBinsDiscretizer(n_bins=9)
    data = pd.read_csv(
        file_path,
        usecols=(
            7,   # loan - classifier
            16,  # bank_arg1
            18,  # y - classifier
        ))
    labels = data.columns.values
    timer = TicToc()

    # Data clean-up
    data = data_transform(data)
    print_table(data.head())

    X_data = data.drop("bank_arg1", axis=1)
    Y_data = data["bank_arg1"]

    # Training/testing sets
    X_train = X_data[:-2500]
    X_test = X_data[-2500:]
    Y_train = Y_data[:-2500]
    Y_test = Y_data[-2500:]

    timer.tic()
    binning3.fit(X_data)
    prediction3 = binning3.transform(X_data)
    timer.toc()
    time3 = timer.elapsed  # capture before the next tic/toc overwrites it

    timer.tic()
    binning6.fit(X_data)
    prediction6 = binning6.transform(X_data)
    timer.toc()
    time6 = timer.elapsed

    timer.tic()
    binning9.fit(X_data)
    prediction9 = binning9.transform(X_data)
    timer.toc()
    time9 = timer.elapsed

    # TODO: Fix evaluation for matrix
    acc3 = evaluate_result(prediction3, Y_data.values)
    acc6 = evaluate_result(prediction6, Y_data.values)
    acc9 = evaluate_result(prediction9, Y_data.values)

    print(f"Binning with 3 units: {acc3}% matched in {time3}s")
    print(f"Binning with 6 units: {acc6}% matched in {time6}s")
    print(f"Binning with 9 units: {acc9}% matched in {time9}s")
def preprocess(self, file_name):
    """
    Preprocesses the data for the Random Forest so that it is properly
    formatted for our training model.
    """
    names = [
        "age", "wrk_cls", "edu", "edu_num", "marital_sts", "occu_code",
        "relation", "race", "sex", "gain", "loss", "hours", "country",
        "income"
    ]
    original_data = pd.read_csv(file_name,
                                dtype=object,
                                names=names,
                                skipinitialspace=True)
    original_data = pd.DataFrame(original_data)
    imp = SimpleImputer(missing_values='?', strategy='most_frequent', verbose=2)
    data = imp.fit_transform(original_data)
    enc = preprocessing.OrdinalEncoder()
    data = enc.fit_transform(data)
    est = preprocessing.KBinsDiscretizer(n_bins=8, encode="ordinal")
    data = est.fit_transform(data)
    return data
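# --- Illustrative sketch (not from the original project) ---
# A minimal, self-contained example of the impute -> ordinal-encode -> discretize
# pipeline the docstring above describes; the column names and toy values are
# invented for illustration and do not come from the census data set.
import pandas as pd
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

toy = pd.DataFrame({
    "age": ["39", "50", "38", "?"],
    "wrk_cls": ["State-gov", "?", "Private", "Self-emp"],
})

# '?' marks missing values, so impute each column with its most frequent value.
imputed = SimpleImputer(missing_values="?", strategy="most_frequent").fit_transform(toy)

# Map each string category to an integer code.
encoded = preprocessing.OrdinalEncoder().fit_transform(imputed)

# Bucket the now-numeric columns into ordinal bins.
binned = preprocessing.KBinsDiscretizer(n_bins=2, encode="ordinal").fit_transform(encoded)
print(binned)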
def kbinData(self):
    print("kbinData")
    training_data = []
    with open('training_data_set.txt', 'rb') as fp:
        training_set = pickle.load(fp)
    npa = np.empty([0, 70], dtype=int)
    ts_del_list = []
    for ts in training_set:
        try:
            npa = np.append(npa, np.array(training_set[ts]["d"]), axis=0)
        except Exception:
            # Samples whose data cannot be stacked are dropped below.
            ts_del_list.append(ts)
            continue
    for tdl in ts_del_list:
        del training_set[tdl]
    est = []
    for n in npa.transpose():
        est.append(
            preprocessing.KBinsDiscretizer(n_bins=10, encode='ordinal').fit_transform(
                n.reshape(-1, 1)))
    np_est = np.array(est).transpose()[0]
    self.kbinChart(np_est)
    i = 0
    for ts in training_set:
        training_set[ts]["kbinD"] = np_est[i]
        i += 1
    with open('training_data_set_kbin', 'wb') as fp:
        pickle.dump(training_set, fp, protocol=pickle.HIGHEST_PROTOCOL)
    print("done")
def __init__(self, column, dataframe, settings):
    AEncoder.__init__(self, column, dataframe, settings)
    self.encoder = preprocessing.KBinsDiscretizer(
        n_bins=self.settings.get('n_bins', 5),
        encode=self.settings.get('encode', 'onehot'),
        strategy=self.settings.get('strategy', 'quantile'))
    self.pickle_process(dataframe)
def execute(self, df: data.Frame) -> data.Frame:
    frame = copy.deepcopy(df)
    f = frame.getRawFrame()
    # Operation ignores nan values
    nanRows = f.iloc[:, list(self.__attributes.keys())].isnull()
    # For every column, transform every non-nan row
    columns = f.columns
    edges: Dict[int, List[float]] = dict()
    for col, k in self.__attributes.items():
        colName = columns[col]
        notNa = (~nanRows.loc[:, colName]).to_list()
        discretizer = skp.KBinsDiscretizer(n_bins=k, encode='ordinal',
                                           strategy=self.__strategy.value)
        # Discretize and convert to string (since categories are strings)
        result = discretizer.fit_transform(
            f.loc[notNa, colName].values.reshape(-1, 1)).astype(str)
        name: str = colName
        if self.__attributeSuffix:
            # Make a new column with all nans
            name = colName + self.__attributeSuffix
            f.loc[:, name] = np.nan
        # Assign column
        f.loc[notNa, [name]] = result
        f.loc[:, name] = f[name].astype(
            pd.CategoricalDtype(categories=[str(float(i)) for i in range(k)],
                                ordered=True))
        edges[col] = discretizer.bin_edges_[0].tolist()
    # Log what has been done
    self.__logExecution(columns, edges)
    return data.Frame(f)
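# --- Illustrative sketch (not from the original project) ---
# The NaN-skipping pattern used above can be reproduced with plain pandas and
# scikit-learn; the column name and values below are invented for illustration.
import numpy as np
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer

df = pd.DataFrame({"x": [1.0, 2.5, np.nan, 4.0, 10.0]})

not_na = df["x"].notna()
disc = KBinsDiscretizer(n_bins=2, encode="ordinal", strategy="uniform")

# Fit and transform only the non-NaN rows, then write the bin labels back so
# the NaN rows stay NaN in the new column.
df.loc[not_na, "x_binned"] = disc.fit_transform(df.loc[not_na, ["x"]]).ravel()
print(df)
print(disc.bin_edges_[0])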
def __init__(
        self,
        *,
        hyperparams: Hyperparams,
        random_seed: int = 0,
        docker_containers: typing.Union[typing.Dict[str, base.DockerContainer]] = None
) -> None:
    super().__init__(hyperparams=hyperparams,
                     random_seed=random_seed,
                     docker_containers=docker_containers)
    self._index = None
    self._training_inputs = None
    self._training_outputs = None
    self._origin_inputs = None  # for label encoder
    self._fitted = False
    self._cate_flag = None
    self._clf = Model('nb', bayesInf=1, PointInf=0)  # classifier
    self._LEoutput = preprocessing.LabelEncoder()  # label encoder
    self._Imputer = SimpleImputer(missing_values=np.nan,
                                  strategy='most_frequent')  # imputer
    self._nbins = 10
    self._Kbins = preprocessing.KBinsDiscretizer(
        n_bins=self._nbins,
        encode='ordinal',
        strategy='uniform')  # KBinsDiscretizer
    self._discTrainset = None
def __init__(
        self,
        *,
        hyperparams: Hyperparams,
        random_seed: int = 0,
        docker_containers: typing.Union[typing.Dict[str, base.DockerContainer]] = None
) -> None:
    super().__init__(hyperparams=hyperparams,
                     random_seed=random_seed,
                     docker_containers=docker_containers)
    self._index = None
    self._problem_type = 'classification'
    self._training_inputs = None
    self._training_outputs = None
    self._fitted = False
    self._cate_flag = None
    self._LEoutput = preprocessing.LabelEncoder()  # label encoder
    self._Imputer = SimpleImputer(
        missing_values=np.nan,
        strategy='mean')  # self.hyperparams['Imputer_Strategy']  # imputer
    self._nbins = self.hyperparams['nbins']
    self._Kbins = preprocessing.KBinsDiscretizer(
        n_bins=self._nbins,
        encode='ordinal',
        strategy='uniform')  # self.hyperparams['Discretizer_Strategy']  # KBinsDiscretizer
def __init__(self, data, k=4, leaf_size=40, lambdav=.5, lap_vec=None, bins=None):
    self.tree = KDTree(data, leaf_size=leaf_size)
    self.data = data
    if bins is not None:
        self.is_binned = True
        self.discretizer = preprocessing.KBinsDiscretizer(
            n_bins=bins, encode='ordinal').fit(data)
        self.data_binned = self.discretizer.inverse_transform(
            self.discretizer.transform(data))
        self.bin_tree = KDTree(self.data_binned, leaf_size=leaf_size)
    self.k = k
    self.lambdav = lambdav
    self.knn_dist_arr, self.knn_idx_arr = self.knn(data)
    self._pdist = self.pdist(data, self.knn_dist_arr, lap_vec)
    self._plof, self.nplof = self.plof(data, self.knn_idx_arr, self._pdist)
    self.loop_values = self.loop(self._plof)
def get_outside_feature(data):
    est = preprocessing.KBinsDiscretizer(n_bins=5, encode='onehot-dense')
    new = est.fit_transform(sourse[['收率']])
    new = pd.DataFrame(data=new)
    new_feature = [i for i in new.columns]
    sourse[new_feature] = new
    data = data.copy()
    len_first = len(data.columns)
    for i in data.columns:
        order_mean = sourse.groupby(i)['收率'].mean()
        order_max = sourse.groupby(i)['收率'].max()
        order_min = sourse.groupby(i)['收率'].min()
        order_std = sourse.groupby(i)['收率'].std().fillna(0)
        order_sum = sourse.groupby(i)['收率'].sum()
        order_median = sourse.groupby(i)['收率'].median()
        # Map the per-group statistics back onto each row as new features.
        data['new_' + i + '_mean'] = data[i].map(order_mean)
        data['new_' + i + '_max'] = data[i].map(order_max)
        data['new_' + i + '_min'] = data[i].map(order_min)
        data['new_' + i + '_std'] = data[i].map(order_std)
        data['new_' + i + '_sum'] = data[i].map(order_sum)
        data['new_' + i + '_median'] = data[i].map(order_median)
        for j in new_feature:
            order_label = sourse.groupby(i)[j].count()
            data['new_' + i + '_' + str(j)] = data[i].map(order_label)
    # Keep only the newly created feature columns.
    data = data.iloc[:, len_first:].copy()
    return data
def preprocess(self, file_name):
    """
    Method to preprocess data to be in a format that is conducive to training.
    """
    names = ("age", "wrk_cls", "edu", "edu_num", "marital_sts", "occu_code",
             "relation", "race", "sex", "gain", "loss", "hours", "country",
             "income")
    original_data = pd.read_csv(file_name,
                                dtype=object,
                                names=names,
                                skipinitialspace=True)
    original_data = pd.DataFrame(original_data)
    imp = SimpleImputer(missing_values='?', strategy='most_frequent', verbose=2)
    data = imp.fit_transform(original_data)
    enc = preprocessing.OrdinalEncoder()
    data = enc.fit_transform(data)
    est = preprocessing.KBinsDiscretizer(encode="ordinal")
    data = est.fit_transform(data)
    return data
def __init__(
        self,
        *,
        hyperparams: Hyperparams,
        random_seed: int = 0,
        docker_containers: typing.Union[typing.Dict[str, base.DockerContainer]] = None
) -> None:
    super().__init__(hyperparams=hyperparams,
                     random_seed=random_seed,
                     docker_containers=docker_containers)
    # parameters
    self._index = None
    self._fitted = False
    # hyperparameters
    self._nbins = self.hyperparams['nbins']
    self._strategy = self.hyperparams['strategy']
    self._method = self.hyperparams['method']
    self._thres_search_method = self.hyperparams['thres_search_method']
    self._threshold = self.hyperparams['threshold']
    self._bayes_factors = self.hyperparams['bayesfactors']
    self._problem_type = self.hyperparams['problem_type']
    # other parameters
    self._training_inputs = None
    self._training_outputs = None
    self._cate_flag = None
    self._LEoutput = preprocessing.LabelEncoder()  # label encoder
    self._Imputer = SimpleImputer(missing_values=np.nan,
                                  strategy='most_frequent')  # imputer
    self._Kbins = preprocessing.KBinsDiscretizer(
        n_bins=self._nbins,
        encode='ordinal',
        strategy=self._strategy)  # self.hyperparams['Discretizer_Strategy']  # KBinsDiscretizer
def fit(self, x_train, y_train):
    self.class_ = unique_labels(y_train)
    self.x_train = x_train
    self.y_train = y_train
    self.candidates = list()
    self.predict_table = dict()
    self.predict_table_index = 0
    bins = [len(self.class_)] * len(self.x_train[0])
    bins = preprocessing.KBinsDiscretizer(
        n_bins=bins, encode='ordinal',
        strategy='uniform').fit(self.x_train).transform(self.x_train)
    iterations = len(self.x_train[0])
    for i in range(iterations):
        cross = pd.crosstab(bins[:, i], self.y_train)
        rules = self.gen_table_rules(cross)
        self.candidates.append(rules)
    self.predict_table_index = self.best_predict_table_index(
        bins, self.y_train, self.candidates)
    self.predict_table = self.candidates[self.predict_table_index]
    selected_column = x_train[:, self.predict_table_index]
    collection_of_classes = dict()
    self.rule_set = dict()
    for k in self.class_:
        collection_of_classes[k] = list()
    for index, element in enumerate(selected_column):
        collection_of_classes[y_train[index]].append(element)
    for key, value in collection_of_classes.items():
        self.rule_set[key] = sum(value) / len(value)
def fit(self, x_train, y_train):
    self.class_ = unique_labels(y_train)
    opt_acc = 0
    # Discretize the features
    bins = preprocessing.KBinsDiscretizer(
        n_bins=([len(self.class_)] * len(x_train[0])),
        encode='ordinal',
        strategy='uniform').fit(x_train).transform(x_train)
    # Build the tables
    for i in range(0, len(x_train[0])):
        table = pd.crosstab(bins[:, i], y_train)  # contingency table per feature
        rule = self.gen_rule(table)  # derive the rule from the table
        acc = self.acc_train(rule, bins[:, i], y_train)
        if acc >= opt_acc:
            opt_acc = acc  # keep the best-scoring feature seen so far
            self.index = i
    # Find the centroid
    selected_column = x_train[:, self.index]
    class_lists = dict()
    self.rule_set = dict()
    for val in self.class_:
        class_lists[val] = []
    for i in range(0, len(selected_column)):
        class_lists[y_train[i]].append(selected_column[i])
    for key, value in class_lists.items():
        self.rule_set[key] = sum(value) / len(value)
def q2():
    # Create a variable representing intervals, similar to pd.cut
    discretizer = preprocessing.KBinsDiscretizer(n_bins=10,
                                                 encode='ordinal',
                                                 strategy='quantile')
    discretizer.fit(countries[['Pop_density']])
    score_bins = discretizer.transform(countries[['Pop_density']])
    # Count the countries that fall in the top decile (bin index 9).
    return int(sum(score_bins[:, 0] >= 9))
def to_categorial(data_pd_series, n_bins=10):
    bins_transform = data_pd_series.to_numpy().reshape(-1, 1)
    discretizer = preprocessing.KBinsDiscretizer(
        n_bins=n_bins).fit(bins_transform)
    tmp1 = pd.DataFrame(
        discretizer.transform(bins_transform).todense()).stack()
    return pd.Series(pd.Categorical(
        tmp1[tmp1 != 0].index.get_level_values(1))), discretizer.bin_edges_
def produce(self, *, inputs: Inputs, timeout: float = None,
            iterations: int = None) -> base.CallResult[Outputs]:
    # inputs: m x n numpy array
    if self._fitted:
        # get discrete bins from training data
        discTrainset = RelationSet(self._training_inputs,
                                   self._training_outputs.reshape(-1, 1))
        discTrainset.impute()
        discTrainset.discretize()
        discTrainset.remove()
        bins = discTrainset.NUM_STATES

        # convert categorical values to numerical values in testing data
        metadata = inputs.metadata
        [m, n] = inputs.shape
        X_test = np.zeros((m, n))
        for column_index in metadata.get_elements((metadata_base.ALL_ELEMENTS,)):
            if column_index is metadata_base.ALL_ELEMENTS:
                continue
            column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
            semantic_types = column_metadata.get('semantic_types', [])
            if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_types:
                LE = preprocessing.LabelEncoder()
                LE = LE.fit(inputs.iloc[:, column_index])
                X_test[:, column_index] = LE.transform(inputs.iloc[:, column_index])
            elif 'http://schema.org/Text' in semantic_types:
                pass
            else:
                temp = list(inputs.iloc[:, column_index].values)
                for i in np.arange(len(temp)):
                    if bool(temp[i]):
                        X_test[i, column_index] = float(temp[i])
                    else:
                        X_test[i, column_index] = np.nan

        discTestset = RelationSet(X_test, [])
        discTestset.impute()
        X_test = discTestset.data
        index_list = np.setdiff1d(np.arange(discTrainset.num_features),
                                  np.array(discTrainset.removeIdx))
        X_test = X_test[:, index_list]

        est = preprocessing.KBinsDiscretizer(n_bins=bins, encode='ordinal',
                                             strategy='uniform')
        est.fit(X_test)
        X_test = est.transform(X_test)

        t = time.time()
        output = self._clf.predict(X_test)
        self._time = time.time() - t + self._time
        if min(self._training_outputs) == 1:
            output = output + 1

        # label decode
        output = self._LEoutput.inverse_transform(output)

        # update metadata
        output = container.DataFrame(output, generate_metadata=False, source=self)
        output.metadata = inputs.metadata.clear(source=self, for_value=output,
                                                generate_metadata=True)
        for column_index, column_metadata in enumerate(self._target_columns_metadata):
            output.metadata = output.metadata.update_column(column_index,
                                                            column_metadata,
                                                            source=self)
        return CallResult(output)
    else:
        raise ValueError('Model should be fitted first.')
def q2():
    est = preprocessing.KBinsDiscretizer(n_bins=10, encode='ordinal',
                                         strategy='quantile')
    pop_density_10bins = est.fit_transform(countries[['Pop_density']])
    plt.hist(pop_density_10bins)
    above_90 = np.sum(pop_density_10bins >= 9)
    # Equivalent: (pop_density_10bins > np.percentile(pop_density_10bins, 90)).sum()
    return above_90
def __init__(self, X_train, y_train):
    """
    Creates a new SalaryPredictor trained on the given features from the
    preprocessed census data to predict salary labels. Performs and fits any
    preprocessing methods (e.g., imputing of missing features, discretization
    of continuous variables, etc.) on the inputs, and saves these as attributes
    to later transform test inputs.

    :param DataFrame X_train: Pandas DataFrame consisting of the sample rows of
        attributes pertaining to each individual
    :param DataFrame y_train: Pandas DataFrame consisting of the sample rows of
        labels pertaining to each person's salary
    """
    # [!] TODO
    # Defined columns
    columns_to_encode = ["work_class", "education", "marital", "occupation_code",
                         "relationship", "race", "sex", "country"]
    # columns_to_scale = ["age", "education_years", "capital_gain",
    #                     "capital_loss", "hours_per_week"]
    columns_to_scale = ["age", "education_years", "hours_per_week"]
    columns_to_leave = ["capital_gain", "capital_loss"]
    self.columns_with_missing = ["work_class", "occupation_code", "country"]

    # Get the categories for each column
    cat = []
    for col in X_train.columns:
        if pd.api.types.is_string_dtype(X_train[col]):
            cat.append(X_train[col].unique())

    self.imp = SimpleImputer(missing_values="?", strategy="most_frequent")
    imp_columns = self.imp.fit_transform(X_train[columns_to_encode])
    self.ohe = preprocessing.OneHotEncoder(handle_unknown="ignore",
                                           categories=cat, sparse=False)
    self.le2 = preprocessing.LabelEncoder()
    self.Kbin = preprocessing.KBinsDiscretizer(n_bins=[2, 2, 2], encode="ordinal")
    self.scale = preprocessing.StandardScaler()

    # Alternative: scale first, then bin the scaled columns.
    # scaled_columns = self.scale.fit_transform(X_train[columns_to_scale])
    # ds_columns = self.Kbin.fit_transform(scaled_columns)
    scaled_columns = self.scale.fit_transform(X_train[columns_to_leave])
    ds_columns = self.Kbin.fit_transform(X_train[columns_to_scale])
    encoded_columns = self.ohe.fit_transform(imp_columns)

    processed_data = np.concatenate([ds_columns, encoded_columns, scaled_columns],
                                    axis=1)
    self.le = preprocessing.LabelEncoder()
    self.clf = LogisticRegression(max_iter=1000).fit(processed_data, y_train)
def predict(self, x_test):
    predict = list()
    bins = [len(self.class_)] * len(x_test[0])
    bins = preprocessing.KBinsDiscretizer(
        n_bins=bins, encode='ordinal',
        strategy='uniform').fit(x_test).transform(x_test)
    for element in bins[:, self.predict_table_index]:
        predict.append(self.predict_table[element])
    return predict
def fit(self, data):
    # row = data[self.k].values
    # X = np.array([x for x in X]).reshape(-1, 1)
    # bins = np.repeat(n_bins, X.shape[1])  # e.g. [5, 3] for 2 features
    # encode to integers
    # quantile: each bin contains approx. the same number of samples
    strategy = 'uniform' if util.data.is_int(data[self.k]) else 'quantile'
    self.est = preprocessing.KBinsDiscretizer(
        n_bins=self.n_bins, encode='onehot', strategy=strategy)
    self.est.fit(data[self.k].values.reshape(-1, 1))
    self.n_bins = self.est.bin_edges_[0].size
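# --- Illustrative sketch (not from the original project) ---
# The dtype-based strategy choice above can be mimicked with a plain pandas check;
# util.data.is_int is project-specific, so pandas.api.types.is_integer_dtype is
# substituted here as an assumption, not the original helper.
import pandas as pd
from pandas.api.types import is_integer_dtype
from sklearn.preprocessing import KBinsDiscretizer

def fit_bins(series: pd.Series, n_bins: int = 5) -> KBinsDiscretizer:
    # Integer-valued columns get equal-width bins; continuous ones get quantile
    # bins so each bin holds roughly the same number of samples.
    strategy = "uniform" if is_integer_dtype(series) else "quantile"
    est = KBinsDiscretizer(n_bins=n_bins, encode="onehot", strategy=strategy)
    est.fit(series.values.reshape(-1, 1))
    return est

est = fit_bins(pd.Series([0.3, 1.2, 2.7, 3.1, 4.8, 5.5]), n_bins=3)
print(est.strategy, est.bin_edges_[0])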
def test_Transformer(nasdaq):
    nasdaq = apply_robust(nasdaq)
    new_nasdaq = []
    for timeseries in nasdaq:
        if not (timeseries[-1] > 0.9999 or timeseries[-1] < 0.0001):
            new_nasdaq.append(timeseries)
    nasdaq = np.array(new_nasdaq)

    X, Y = nasdaq[:, :-1], nasdaq[:, -2:]
    discretizer = preprocessing.KBinsDiscretizer(n_bins=63, encode='ordinal')
    discretizer.fit(X)
    X = discretizer.transform(X)

    Y = np.diff(Y, axis=1)
    Y = np.sign(Y)
    Y = np.where(Y <= 0, Y * 0, Y)
    print(Y.shape)
    base_winrate = np.sum(Y) / len(Y)
    print(base_winrate)

    train_size = int(0.8 * len(X))
    trainX, trainY = X[:train_size], Y[:train_size]
    validX, validY = X[train_size:], Y[train_size:]
    print(trainX.shape, trainY.shape)

    learning_rate = CustomSchedule(d_model)
    optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                         epsilon=1e-9)
    loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    model = Transformer_Encoder(num_layers=6, d_model=64, num_heads=8, dff=128,
                                input_vocab_size=64, target_vocab_size=1,
                                rate=dropout_rate)

    checkpoint_path = "./SavedModel/Transformer/Transformer_FXTM.h5"
    checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                    save_weights_only=True,
                                                    save_best_only=True)
    model.compile(optimizer=optimizer, loss=loss_object, metrics=['accuracy'])
    model.build(input_shape=(None, 64))
    history = model.fit(trainX, trainY, validation_data=(validX, validY),
                        batch_size=256, epochs=EPOCHS, callbacks=[checkpoint])
def discretize(self):
    disc = preprocessing.KBinsDiscretizer(
        n_bins=10,          # Watch here: you might have to tweak this parameter a bit.
        encode="ordinal",   # I'm currently looking for a way to make this process automatic.
        strategy="uniform"
    )  # If the data is standardized and rescaled though, 10 might be a good first attempt.
    continous_df = self.df[self.continous_vars]
    not_continous_df = self.df.drop(columns=self.continous_vars)
    continous_df = pd.DataFrame(disc.fit_transform(continous_df),
                                columns=continous_df.columns)
    self.df = continous_df.join(not_continous_df)
    return self
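# --- Illustrative sketch (not from the original project) ---
# A stand-alone version of the split / discretize / rejoin pattern above, using
# made-up column names and values.
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer

df = pd.DataFrame({"height": [1.52, 1.75, 1.80, 1.93, 1.61],
                   "city": ["A", "B", "A", "C", "B"]})
continuous_vars = ["height"]

disc = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy="uniform")
cont = pd.DataFrame(disc.fit_transform(df[continuous_vars]),
                    columns=continuous_vars, index=df.index)

# Rejoin the untouched non-continuous columns.
result = cont.join(df.drop(columns=continuous_vars))
print(result)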
def discritizeData(self, features):
    """
    Discretize the data.

    Args:
        features: array of data
    Return:
        xKBinsDiscretizer: array of discretized data
    """
    est = preprocessing.KBinsDiscretizer(n_bins=3, encode='ordinal',
                                         strategy='uniform')
    est.fit(features)
    xKBinsDiscretizer = est.transform(features)
    return xKBinsDiscretizer
def predict(self, x_test):
    predict = list()
    bins = [len(self.class_)] * len(x_test[0])
    bins = preprocessing.KBinsDiscretizer(
        n_bins=bins, encode='ordinal',
        strategy='uniform').fit(x_test).transform(x_test)
    for element in bins[:, self.predict_table_index]:
        if element in self.predict_table:
            predict.append(self.predict_table[element])
        else:
            # Fall back to a random rule when the bin was never seen in training.
            predict.append(choice(list(self.predict_table.items()))[1])
    self.predict_table = self.gen_table_rules(self.original_bin)
    return predict
def discretiza_con_kmeans(dataframe, n):
    '''
    Takes a dataframe and the number of intervals into which we want to split it.
    Returns the dataframe discretized into n intervals using the k-means strategy.
    '''
    discretizado = preprocessing.KBinsDiscretizer(
        n_bins=n, encode='ordinal',
        strategy="kmeans").fit_transform(dataframe)
    datas = pd.DataFrame(discretizado)
    datas.index = dataframe.index
    datas.columns = dataframe.columns
    return datas
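# --- Illustrative usage sketch (assumes the function above is in scope) ---
# The DataFrame contents are invented for illustration only.
import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0, 2.1, 8.0, 9.5],
                   "b": [0.1, 0.2, 0.3, 0.9, 1.0]},
                  index=["r1", "r2", "r3", "r4", "r5"])

binned = discretiza_con_kmeans(df, 2)
print(binned)  # same index and columns as df, values are bin labels 0 and 1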
def predict(self, x_test):
    predict = []
    # Discretize the features
    bins = preprocessing.KBinsDiscretizer(
        n_bins=([len(self.class_)] * len(x_test[0])),
        encode='ordinal',
        strategy='uniform').fit(x_test).transform(x_test)
    x_test = bins[:, self.index]
    for i in range(0, len(x_test)):
        predict.append(self.rule_set[x_test[i]])
    return predict
def kbins_discretizer(one_feature, n_bins=3, strategy='uniform'):
    '''
    strategy:
        uniform  - all bins in each feature have identical widths.
        quantile - all bins in each feature have the same number of points.
        kmeans   - values in each bin have the same nearest center of a 1D
                   k-means cluster.
    The transformed result is encoded as ordinal integer bin labels.
    '''
    kbd = preprocessing.KBinsDiscretizer(n_bins=[n_bins], encode='ordinal',
                                         strategy=strategy)
    feature_transformed = kbd.fit_transform(one_feature).reshape(-1,)
    print(kbd.n_bins_)
    print(kbd.bin_edges_)
    return feature_transformed
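# --- Illustrative sketch (not from the original project) ---
# A small self-contained comparison that makes the three strategies in the
# docstring above concrete; the skewed data is synthetic.
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

rng = np.random.RandomState(0)
x = rng.exponential(scale=2.0, size=(200, 1))  # right-skewed feature

for strategy in ("uniform", "quantile", "kmeans"):
    kbd = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy)
    binned = kbd.fit_transform(x).ravel()
    counts = np.bincount(binned.astype(int), minlength=3)
    print(strategy, "edges:", np.round(kbd.bin_edges_[0], 2), "counts:", counts)
# "uniform" gives equal-width bins, "quantile" gives roughly equal counts, and
# "kmeans" places edges between 1D k-means cluster centers.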
def k_bins(data, n_bins, encoder="ordinal", strategy="quantile"):
    """
    Bin-based discretization.
    * encoder:
        - "ordinal"
        - "onehot"
        - "onehot-dense"
    * strategy:
        - "uniform"
        - "quantile"
        - "kmeans"
    """
    est = preprocessing.KBinsDiscretizer(n_bins=n_bins, encode=encoder,
                                         strategy=strategy)
    transformed_data = est.fit_transform(data)
    return transformed_data
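# --- Illustrative usage sketch (assumes the k_bins function above is in scope) ---
# Shows how the encoder option changes the output shape; the input array is made up.
import numpy as np

x = np.array([[1.0], [2.0], [3.0], [10.0], [20.0]])

ordinal = k_bins(x, n_bins=3, encoder="ordinal", strategy="quantile")
onehot = k_bins(x, n_bins=3, encoder="onehot-dense", strategy="quantile")
print(ordinal.shape)  # (5, 1): one integer bin label per row
print(onehot.shape)   # (5, 3): one indicator column per bin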
def descretization(self):
    try:
        for i in range(len(self.x)):
            temp = self.dataset[self.x[i]].values.reshape(-1, 1)
            est = preprocessing.KBinsDiscretizer(n_bins=5, encode='ordinal',
                                                 strategy='uniform')
            est.fit(temp)
            x_scaled = est.transform(temp)
            print(x_scaled)
            self.dataset[self.x[i]] = x_scaled
        heads = self.dataset.head(10).to_numpy()
        for i in range(10):
            for j in range(len(self.dataset.columns)):
                self.table.setItem(i, j, QTableWidgetItem(str(heads[i][j])))
    except Exception as e:
        print(repr(e))
def fit(self, x_train, y_train):
    discretizer = preprocessing.KBinsDiscretizer(
        n_bins=2 * len(unique_labels(y_train)),
        encode='ordinal',
        strategy='kmeans')
    discretizer.fit(x_train)
    self.discretizer = discretizer

    df_data = pd.DataFrame(data=discretizer.transform(x_train))
    df_data['classe'] = y_train

    contingency_df = {}
    best_score = float('-inf')
    best_feature = None
    for col in list(df_data.iloc[:, :-1]):
        contingency_df[col] = pd.crosstab(df_data['classe'], df_data[col])
        score_feature = contingency_df[col].agg('max').sum()
        if score_feature > best_score:
            best_score = score_feature  # keep the best-scoring feature
            best_feature = col
    self.best_feature = best_feature
    self.prob_table = contingency_df[best_feature].apply(lambda x: x / sum(x)).T