Example #1
def task3(file_path):
    binning3 = preprocessing.KBinsDiscretizer(n_bins=3)
    binning6 = preprocessing.KBinsDiscretizer(n_bins=6)
    binning9 = preprocessing.KBinsDiscretizer(n_bins=9)

    data = pd.read_csv(
        file_path,
        usecols=(
            7,  # loan - classifier
            16,  # bank_arg1
            18,  # y - classifier
        ))
    labels = data.columns.values
    timer = TicToc()

    # Data clean-up
    data = data_transform(data)

    print_table(data.head())

    X_data = data.drop("bank_arg1", axis=1)
    Y_data = data["bank_arg1"]

    # Training/testing sets
    X_train = X_data[:-2500]
    X_test = X_data[-2500:]
    Y_train = Y_data[:-2500]
    Y_test = Y_data[-2500:]

    timer.tic()
    binning3.fit(X_data)
    prediction3 = binning3.transform(X_data)
    timer.toc()
    time3 = timer.elapsed  # capture before the next tic() overwrites it

    timer.tic()
    binning6.fit(X_data)
    prediction6 = binning6.transform(X_data)
    timer.toc()
    time6 = timer.elapsed

    timer.tic()
    binning9.fit(X_data)
    prediction9 = binning9.transform(X_data)
    timer.toc()
    time9 = timer.elapsed

    # TODO: Fix evaluation for matrix
    acc3 = evaluate_result(prediction3, Y_data.values)
    acc6 = evaluate_result(prediction6, Y_data.values)
    acc9 = evaluate_result(prediction9, Y_data.values)

    print(f"Binning with 3 units: {acc3}% matched in {timer.elapsed}s")
    print(f"Binning with 6 units: {acc6}% matched in {timer.elapsed}s")
    print(f"Binning with 9 units: {acc9}% matched in {timer.elapsed}s")
Example #2
    def preprocess(self, file_name):
        """
        Preprocesses the data for the Random Forest so that it is properly formatted for
        our training model.
        """

        names = [
            "age", "wrk_cls", "edu", "edu_num", "marital_sts", "occu_code",
            "relation", "race", "sex", "gain", "loss", "hours", "country",
            "income"
        ]

        original_data = pd.read_csv(file_name,
                                    dtype=object,
                                    names=names,
                                    skipinitialspace=True)

        imp = SimpleImputer(missing_values='?',
                            strategy='most_frequent',
                            verbose=2)
        data = imp.fit_transform(original_data)

        enc = preprocessing.OrdinalEncoder()
        data = enc.fit_transform(data)

        est = preprocessing.KBinsDiscretizer(n_bins=8, encode="ordinal")
        data = est.fit_transform(data)
        return data
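A minimal standalone sketch of the same impute -> encode -> discretize chain on a made-up two-column frame (pandas and scikit-learn assumed; the real method applies it to the census columns listed above):

import pandas as pd
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

df = pd.DataFrame({
    "age": ["25", "38", "?", "52", "61", "44"],
    "edu": ["HS", "BS", "MS", "?", "BS", "PhD"],
})
data = SimpleImputer(missing_values="?",
                     strategy="most_frequent").fit_transform(df)
data = preprocessing.OrdinalEncoder().fit_transform(data)
data = preprocessing.KBinsDiscretizer(n_bins=2,
                                      encode="ordinal").fit_transform(data)
print(data)  # every column reduced to ordinal bin codes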
Example #3
    def kbinData(self):
        print("kbinData")
        with open('training_data_set.txt', 'rb') as fp:
            training_set = pickle.load(fp)
        npa = np.empty([0, 70], dtype=int)
        ts_del_list = []
        for ts in training_set:
            try:
                npa = np.append(npa, np.array(training_set[ts]["d"]), axis=0)
            except Exception:  # entries that fail to append are removed below
                ts_del_list.append(ts)
                continue
        for tdl in ts_del_list:
            del training_set[tdl]
        est = []

        for n in npa.transpose():
            est.append(
                preprocessing.KBinsDiscretizer(n_bins=10,
                                               encode='ordinal').fit_transform(
                                                   n.reshape(-1, 1)))

        np_est = np.array(est).transpose()[0]
        self.kbinChart(np_est)
        i = 0
        for ts in training_set:
            training_set[ts]["kbinD"] = np_est[i]
            i += 1
        with open('training_data_set_kbin', 'wb') as fp:
            pickle.dump(training_set, fp, protocol=pickle.HIGHEST_PROTOCOL)
        print("done")
Example #4
 def __init__(self, column, dataframe, settings):
     AEncoder.__init__(self, column, dataframe, settings)
     self.encoder = preprocessing.KBinsDiscretizer(
         n_bins=self.settings.get('n_bins', 5),
         encode=self.settings.get('encode', 'onehot'),
         strategy=self.settings.get('strategy', 'quantile'))
     self.pickle_process(dataframe)
Example #5
 def execute(self, df: data.Frame) -> data.Frame:
     frame = copy.deepcopy(df)
     f = frame.getRawFrame()
     # Operation ignores nan values
     nanRows = f.iloc[:, list(self.__attributes.keys())].isnull()
     # For every column, transform every non-nan row
     columns = f.columns
     edges: Dict[int, List[float]] = dict()
     for col, k in self.__attributes.items():
         colName = columns[col]
         notNa = (~nanRows.loc[:, colName]).to_list()
         discretizer = skp.KBinsDiscretizer(n_bins=k, encode='ordinal',
                                            strategy=self.__strategy.value)
         # Discretize and convert to string (since categories are strings)
         result = discretizer.fit_transform(f.loc[notNa, colName].values.reshape(-1, 1)).astype(str)
         name: str = colName
         if self.__attributeSuffix:
             # Make a new column with all nans
             name = colName + self.__attributeSuffix
             f.loc[:, name] = np.nan
         # Assign column
         f.loc[notNa, [name]] = result
         f.loc[:, name] = f[name].astype(
             pd.CategoricalDtype(categories=[str(float(i)) for i in range(k)], ordered=True))
         edges[col] = discretizer.bin_edges_[0].tolist()
     # Log what has been done
     self.__logExecution(columns, edges)
     return data.Frame(f)
Example #6
 def __init__(
     self,
     *,
     hyperparams: Hyperparams,
     random_seed: int = 0,
     docker_containers: typing.Optional[typing.Dict[
         str, base.DockerContainer]] = None
 ) -> None:
     super().__init__(hyperparams=hyperparams,
                      random_seed=random_seed,
                      docker_containers=docker_containers)
     self._index = None
     self._training_inputs = None
     self._training_outputs = None
     self._origin_inputs = None  # for label encoder
     self._fitted = False
     self._cate_flag = None
     self._clf = Model('nb', bayesInf=1, PointInf=0)  # classifier
     self._LEoutput = preprocessing.LabelEncoder()  # label encoder
     self._Imputer = SimpleImputer(missing_values=np.nan,
                                   strategy='most_frequent')  # imputer
     self._nbins = 10
     self._Kbins = preprocessing.KBinsDiscretizer(
         n_bins=self._nbins, encode='ordinal',
         strategy='uniform')  # KBinsDiscretizer
     self._discTrainset = None
Example #7
 def __init__(
     self,
     *,
     hyperparams: Hyperparams,
     random_seed: int = 0,
     docker_containers: typing.Optional[typing.Dict[
         str, base.DockerContainer]] = None
 ) -> None:
     super().__init__(hyperparams=hyperparams,
                      random_seed=random_seed,
                      docker_containers=docker_containers)
     self._index = None
     self._problem_type = 'classification'
     self._training_inputs = None
     self._training_outputs = None
     self._fitted = False
     self._cate_flag = None
     self._LEoutput = preprocessing.LabelEncoder()  # label encoder
     self._Imputer = SimpleImputer(
         missing_values=np.nan,
         strategy='mean')  # imputer; strategy could come from self.hyperparams['Imputer_Strategy']
     self._nbins = self.hyperparams['nbins']
     self._Kbins = preprocessing.KBinsDiscretizer(
         n_bins=self._nbins, encode='ordinal', strategy='uniform'
     )  # KBinsDiscretizer; strategy could come from self.hyperparams['Discretizer_Strategy']
Example #8
    def __init__(self,
                 data,
                 k=4,
                 leaf_size=40,
                 lambdav=.5,
                 lap_vec=None,
                 bins=None):

        self.tree = KDTree(data, leaf_size=leaf_size)
        self.data = data

        if bins is not None:
            self.is_binned = True
            self.discretizer = preprocessing.KBinsDiscretizer(
                n_bins=bins, encode='ordinal').fit(data)
            self.data_binned = self.discretizer.inverse_transform(
                self.discretizer.transform(data))
            self.bin_tree = KDTree(self.data_binned, leaf_size=leaf_size)

        self.k = k
        self.lambdav = lambdav

        self.knn_dist_arr, self.knn_idx_arr = self.knn(data)
        self._pdist = self.pdist(data, self.knn_dist_arr, lap_vec)

        self._plof, self.nplof = self.plof(data, self.knn_idx_arr, self._pdist)
        self.loop_values = self.loop(self._plof)
Example #9
def get_outside_feature(data):
    # `sourse` is a module-level DataFrame defined elsewhere; '收率' (yield)
    # is its target column
    est = preprocessing.KBinsDiscretizer(n_bins=5, encode='onehot-dense')
    new = est.fit_transform(sourse[['收率']])
    new = pd.DataFrame(data=new)
    new_feature = list(new.columns)
    sourse[new_feature] = new

    data = data.copy()
    len_first = len(data.columns)
    for i in data.columns:
        order_mean = sourse.groupby(i)['收率'].mean()
        order_max = sourse.groupby(i)['收率'].max()
        order_min = sourse.groupby(i)['收率'].min()
        order_std = sourse.groupby(i)['收率'].std().fillna(0)
        order_sum = sourse.groupby(i)['收率'].sum()
        order_median = sourse.groupby(i)['收率'].median()

        # Without these assignments data.iloc[:, len_first:] below would
        # return an empty frame, so the mapped aggregates are kept live
        data['new_' + i + '_mean'] = data[i].map(order_mean)
        data['new_' + i + '_max'] = data[i].map(order_max)
        data['new_' + i + '_min'] = data[i].map(order_min)
        data['new_' + i + '_std'] = data[i].map(order_std)
        data['new_' + i + '_sum'] = data[i].map(order_sum)
        data['new_' + i + '_median'] = data[i].map(order_median)

        # Optional: per-bin counts of the one-hot '收率' features
        # for j in new_feature:
        #     order_label = sourse.groupby(i)[j].count()
        #     data['new_' + i + '_' + str(j)] = data[i].map(order_label)
    data = data.iloc[:, len_first:].copy()

    return data
Example #10
    def preprocess(self, file_name):
        """
        Method to preprocess data to be in a format that is conducive
        to training.
        """
        names = ("age", "wrk_cls", "edu", "edu_num", "marital_sts",
                 "occu_code", "relation", "race", "sex", "gain", "loss",
                 "hours", "country", "income")

        original_data = pd.read_csv(file_name,
                                    dtype=object,
                                    names=names,
                                    skipinitialspace=True)

        imp = SimpleImputer(missing_values='?',
                            strategy='most_frequent',
                            verbose=2)
        data = imp.fit_transform(original_data)

        enc = preprocessing.OrdinalEncoder()
        data = enc.fit_transform(data)

        est = preprocessing.KBinsDiscretizer(encode="ordinal")
        data = est.fit_transform(data)

        return data
Example #11
 def __init__(
     self,
     *,
     hyperparams: Hyperparams,
     random_seed: int = 0,
     docker_containers: typing.Optional[typing.Dict[
         str, base.DockerContainer]] = None
 ) -> None:
 ) -> None:
     super().__init__(hyperparams=hyperparams,
                      random_seed=random_seed,
                      docker_containers=docker_containers)
     # parameters
     self._index = None
     self._fitted = False
     # hyperparameters
     self._nbins = self.hyperparams['nbins']
     self._strategy = self.hyperparams['strategy']
     self._method = self.hyperparams['method']
     self._thres_search_method = self.hyperparams['thres_search_method']
     self._threshold = self.hyperparams['threshold']
     self._bayes_factors = self.hyperparams['bayesfactors']
     self._problem_type = self.hyperparams['problem_type']
     # other parameters
     self._training_inputs = None
     self._training_outputs = None
     self._cate_flag = None
     self._LEoutput = preprocessing.LabelEncoder()  # label encoder
     self._Imputer = SimpleImputer(missing_values=np.nan,
                                   strategy='most_frequent')  # imputer
     self._Kbins = preprocessing.KBinsDiscretizer(
         n_bins=self._nbins, encode='ordinal', strategy=self._strategy
     )  # KBinsDiscretizer
Example #12
    def fit(self, x_train, y_train):
        self.class_ = unique_labels(y_train)
        self.x_train = x_train
        self.y_train = y_train

        self.candidates = list()
        self.predict_table = dict()
        self.predict_table_index = 0

        bins = [len(self.class_)] * len(self.x_train[0])
        bins = preprocessing.KBinsDiscretizer(
            n_bins=bins, encode='ordinal',
            strategy='uniform').fit(self.x_train).transform(self.x_train)

        iterations = len(self.x_train[0])

        for i in range(iterations):
            cross = pd.crosstab(bins[:, i], self.y_train)
            rules = self.gen_table_rules(cross)
            self.candidates.append(rules)

        self.predict_table_index = self.best_predict_table_index(
            bins, self.y_train, self.candidates)
        self.predict_table = self.candidates[self.predict_table_index]

        selected_column = x_train[:, self.predict_table_index]
        collection_of_classes = dict()
        self.rule_set = dict()

        for k in self.class_:
            collection_of_classes[k] = list()

        for index, element in enumerate(selected_column):
            collection_of_classes[y_train[index]].append(element)

        for key, value in collection_of_classes.items():
            self.rule_set[key] = sum(value) / len(value)
Example #13
    def fit(self, x_train, y_train):
        self.class_ = unique_labels(y_train)
        opt_acc = 0

        # Discretize
        bins = preprocessing.KBinsDiscretizer(
            n_bins=([len(self.class_)] * len(x_train[0])),
            encode='ordinal',
            strategy='uniform').fit(x_train).transform(x_train)

        # Build the tables
        for i in range(0, len(x_train[0])):
            table = pd.crosstab(bins[:, i],
                                y_train)  # Build the contingency table
            rule = self.gen_rule(table)  # Build the rule from the table

            # Track the best accuracy so far; without updating opt_acc the
            # comparison always succeeds and the last column always wins
            acc = self.acc_train(rule, bins[:, i], y_train)
            if acc >= opt_acc:
                opt_acc = acc
                self.index = i

        # Find the centroid
        selected_column = x_train[:, self.index]
        class_lists = dict()
        self.rule_set = dict()

        for val in self.class_:
            class_lists[val] = []

        for i in range(0, len(selected_column)):
            class_lists[y_train[i]].append(selected_column[i])

        for key, value in class_lists.items():
            self.rule_set[key] = sum(value) / len(value)
Example #14
def q2():
    # Create a variable representing intervals, similar to pd.cut
    discretizer = preprocessing.KBinsDiscretizer(n_bins=10,
                                                 encode='ordinal',
                                                 strategy='quantile')
    discretizer.fit(countries[['Pop_density']])
    # score_bins is the discretized column (it was missing in the original)
    score_bins = discretizer.transform(countries[['Pop_density']])
    return int(sum(score_bins[:, 0] >= 9))
Example #15
def to_categorial(data_pd_series, n_bins=10):
    bins_transform = data_pd_series.to_numpy().reshape(-1, 1)
    discretizer = preprocessing.KBinsDiscretizer(
        n_bins=n_bins).fit(bins_transform)
    tmp1 = pd.DataFrame(
        discretizer.transform(bins_transform).todense()).stack()
    return pd.Series(pd.Categorical(
        tmp1[tmp1 != 0].index.get_level_values(1))), discretizer.bin_edges_
Example #16
    def produce(self, *, inputs: Inputs, timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        # inputs: m x n numpy array
        if self._fitted:
            # get discrete bins from training data
            discTrainset = RelationSet(self._training_inputs, self._training_outputs.reshape(-1, 1))
            discTrainset.impute()
            discTrainset.discretize()
            discTrainset.remove()
            bins = discTrainset.NUM_STATES
            
            # convert categorical values to numerical values in testing data
            metadata = inputs.metadata
            [m, n] = inputs.shape
            X_test = np.zeros((m, n))
            for column_index in metadata.get_elements((metadata_base.ALL_ELEMENTS,)):
                if column_index is metadata_base.ALL_ELEMENTS:
                    continue
                column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
                semantic_types = column_metadata.get('semantic_types', [])
                if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_types:
                    LE = preprocessing.LabelEncoder()
                    LE = LE.fit(inputs.iloc[:, column_index])
                    X_test[:, column_index] = LE.transform(inputs.iloc[:, column_index])
                elif 'http://schema.org/Text' in semantic_types:
                    pass
                else:
                    temp = list(inputs.iloc[:, column_index].values)
                    for i in np.arange(len(temp)):
                        if bool(temp[i]):
                            X_test[i, column_index] = float(temp[i])
                        else:
                            X_test[i, column_index] = np.nan
            discTestset = RelationSet(X_test, [])
            discTestset.impute()
            X_test = discTestset.data
            index_list = np.setdiff1d(np.arange(discTrainset.num_features),
                                      np.array(discTrainset.removeIdx))
            X_test = X_test[:, index_list]
            est = preprocessing.KBinsDiscretizer(n_bins=bins, encode='ordinal',
                                                 strategy='uniform')
            est.fit(X_test)
            X_test = est.transform(X_test)
            t = time.time()
            output = self._clf.predict(X_test)
            self._time = time.time() - t + self._time
            if min(self._training_outputs) == 1:
                output = output + 1
            # label decode
            output = self._LEoutput.inverse_transform(output)
            
            # update metadata
            output = container.DataFrame(output, generate_metadata=False, source=self)
            output.metadata = inputs.metadata.clear(source=self, for_value=output, generate_metadata=True)
            
            for column_index, column_metadata in enumerate(self._target_columns_metadata):
                output.metadata = output.metadata.update_column(
                    column_index, column_metadata, source=self)

            return CallResult(output)
        else:
            raise ValueError('Model should be fitted first.')
Example #17
def q2():
    est = preprocessing.KBinsDiscretizer(n_bins=10,
                                         encode='ordinal',
                                         strategy='quantile')
    pop_density_10bins = est.fit_transform(countries[['Pop_density']])
    plt.hist(pop_density_10bins)
    above_90 = np.sum(pop_density_10bins >= 9)
    # (pop_density_10bins > np.percentile(pop_density_10bins, 90)).sum()
    return above_90
Example #18
    def __init__(self, X_train, y_train):
        """
        Creates a new SalaryPredictor trained on the given features from the
        preprocessed census data to predicted salary labels. Performs and fits
        any preprocessing methods (e.g., imputing of missing features,
        discretization of continuous variables, etc.) on the inputs, and saves
        these as attributes to later transform test inputs.
        
        :param DataFrame X_train: Pandas DataFrame consisting of the
        sample rows of attributes pertaining to each individual
        :param DataFrame y_train: Pandas DataFrame consisting of the
        sample rows of labels pertaining to each person's salary
        """
        # Columns grouped by the preprocessing each will receive
        columns_to_encode = ["work_class", "education", "marital", "occupation_code",
                             "relationship", "race", "sex", "country"]
        columns_to_scale = ["age", "education_years", "hours_per_week"]
        columns_to_leave = ["capital_gain", "capital_loss"]
        self.columns_with_missing = ["work_class", "occupation_code", "country"]

        # Collect the categories of every string column for the one-hot encoder
        cat = []
        for col in X_train.columns:
            if pd.api.types.is_string_dtype(X_train[col]):
                cat.append(X_train[col].unique())

        self.imp = SimpleImputer(missing_values="?", strategy="most_frequent")
        imp_columns = self.imp.fit_transform(X_train[columns_to_encode])

        self.ohe = preprocessing.OneHotEncoder(handle_unknown="ignore",
                                               categories=cat, sparse=False)
        self.Kbin = preprocessing.KBinsDiscretizer(n_bins=[2, 2, 2],
                                                   encode="ordinal")
        self.scale = preprocessing.StandardScaler()

        scaled_columns = self.scale.fit_transform(X_train[columns_to_leave])
        ds_columns = self.Kbin.fit_transform(X_train[columns_to_scale])
        encoded_columns = self.ohe.fit_transform(imp_columns)

        processed_data = np.concatenate(
            [ds_columns, encoded_columns, scaled_columns], axis=1)
        self.le = preprocessing.LabelEncoder()
        self.clf = LogisticRegression(max_iter=1000).fit(processed_data, y_train)
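Because every fitted preprocessor above is kept as an attribute, a companion prediction method only needs transform() calls, in the same column order used for training. A hedged sketch of what that might look like (the classify name and the repeated column lists are assumptions, not part of the original class):

    def classify(self, X_test):
        """Hypothetical companion method reusing the transforms fitted above."""
        columns_to_encode = ["work_class", "education", "marital", "occupation_code",
                             "relationship", "race", "sex", "country"]
        columns_to_scale = ["age", "education_years", "hours_per_week"]
        columns_to_leave = ["capital_gain", "capital_loss"]
        imp_columns = self.imp.transform(X_test[columns_to_encode])
        encoded_columns = self.ohe.transform(imp_columns)
        ds_columns = self.Kbin.transform(X_test[columns_to_scale])
        scaled_columns = self.scale.transform(X_test[columns_to_leave])
        # same concatenation order as in __init__, so the classifier sees
        # the feature layout it was trained on
        processed = np.concatenate([ds_columns, encoded_columns, scaled_columns],
                                   axis=1)
        return self.clf.predict(processed)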
Example #19
    def predict(self, x_test):
        predict = list()

        bins = [len(self.class_)] * len(x_test[0])
        bins = preprocessing.KBinsDiscretizer(
            n_bins=bins, encode='ordinal',
            strategy='uniform').fit(x_test).transform(x_test)

        for element in bins[:, self.predict_table_index]:
            predict.append(self.predict_table[element])

        return predict
Example #20
 def fit(self, data):
     # n_bins may be given per feature, e.g. [5, 3] for 2 features;
     # 'quantile' puts approx. the same number of samples in each bin
     strategy = 'uniform' if util.data.is_int(
         data[self.k]) else 'quantile'
     self.est = preprocessing.KBinsDiscretizer(
         n_bins=self.n_bins, encode='onehot', strategy=strategy)
     self.est.fit(data[self.k].values.reshape(-1, 1))
     # bin_edges_ holds n_bins + 1 edges per feature, so subtract one
     self.n_bins = self.est.bin_edges_[0].size - 1
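A quick check of the edge/bin relationship the last line relies on (NumPy and scikit-learn assumed): KBinsDiscretizer stores n_bins + 1 edges per feature, so the bin count is the edge count minus one.

import numpy as np
from sklearn import preprocessing

x = np.arange(20, dtype=float).reshape(-1, 1)
est = preprocessing.KBinsDiscretizer(n_bins=4, encode='onehot',
                                     strategy='uniform').fit(x)
print(est.bin_edges_[0])           # [ 0.    4.75  9.5  14.25 19.  ]: 5 edges
print(est.bin_edges_[0].size - 1)  # 4 bins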
Example #21
def test_Transformer(nasdaq):
    nasdaq = apply_robust(nasdaq)
    new_nasdaq = []
    for timeseries in nasdaq:
        if not (timeseries[-1] > 0.9999 or timeseries[-1] < 0.0001):
            new_nasdaq.append(timeseries)
    nasdaq = np.array(new_nasdaq)

    X, Y = nasdaq[:, :-1], nasdaq[:, -2:]

    discretizer = preprocessing.KBinsDiscretizer(n_bins=63, encode='ordinal')
    discretizer.fit(X)
    X = discretizer.transform(X)

    Y = np.diff(Y, axis=1)
    Y = np.sign(Y)
    Y = np.where(Y <= 0, Y * 0, Y)

    print(Y.shape)
    base_winrate = np.sum(Y) / len(Y)
    print(base_winrate)

    train_size = int(0.8 * len(X))
    trainX, trainY = X[:train_size], Y[:train_size]
    validX, validY = X[train_size:], Y[train_size:]
    print(trainX.shape, trainY.shape)

    learning_rate = CustomSchedule(d_model)
    optimizer = tf.keras.optimizers.Adam(learning_rate,
                                         beta_1=0.9,
                                         beta_2=0.98,
                                         epsilon=1e-9)
    loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)

    model = Transformer_Encoder(num_layers=6,
                                d_model=64,
                                num_heads=8,
                                dff=128,
                                input_vocab_size=64,
                                target_vocab_size=1,
                                rate=dropout_rate)
    checkpoint_path = "./SavedModel/Transformer/Transformer_FXTM.h5"
    checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                    save_weights_only=True,
                                                    save_best_only=True)
    model.compile(optimizer=optimizer, loss=loss_object, metrics=['accuracy'])
    model.build(input_shape=(None, 64))
    history = model.fit(trainX,
                        trainY,
                        validation_data=(validX, validY),
                        batch_size=256,
                        epochs=EPOCHS,
                        callbacks=[checkpoint])
Example #22
 def discretize(self):
     # You might have to tweak n_bins a bit; if the data is standardized
     # and rescaled, 10 might be a good first attempt. I'm currently
     # looking for a way to make this choice automatic.
     disc = preprocessing.KBinsDiscretizer(n_bins=10,
                                           encode="ordinal",
                                           strategy="uniform")
     continous_df = self.df[self.continous_vars]
     not_continous_df = self.df.drop(columns=self.continous_vars)
     continous_df = pd.DataFrame(disc.fit_transform(continous_df),
                                 columns=continous_df.columns,
                                 index=continous_df.index)  # keep the index so join() aligns
     self.df = continous_df.join(not_continous_df)
     return self
Example #23
 def discritizeData(self, features):
     """
     Discretize the data.
     Args:
         features:           array of data
     Return:
         xKBinsDiscretizer:  array of discretized data
     """
     est = preprocessing.KBinsDiscretizer(n_bins=3,
                                          encode='ordinal',
                                          strategy='uniform')
     est.fit(features)
     xKBinsDiscretizer = est.transform(features)
     return xKBinsDiscretizer
Example #24
    def predict(self, x_test):
        predict = list()

        bins = [len(self.class_)] * len(x_test[0])
        bins = preprocessing.KBinsDiscretizer(
            n_bins=bins, encode='ordinal',
            strategy='uniform').fit(x_test).transform(x_test)

        for element in bins[:, self.predict_table_index]:
            if element in self.predict_table:
                predict.append(self.predict_table[element])
            else:
                predict.append(choice(list(self.predict_table.items()))[1])

        # Rebuild the table once, after the loop, rather than on every element
        self.predict_table = self.gen_table_rules(self.original_bin)

        return predict
Example #25
def discretiza_con_kmeans(dataframe, n):
    '''
    Takes a dataframe and a number of intervals into which we want to
    split it.

    Returns the dataframe split into n intervals using the k-means
    technique.
    '''

    # strategy="kmeans", matching the function name and docstring
    discretizado = preprocessing.KBinsDiscretizer(
        n_bins=n, encode='ordinal',
        strategy="kmeans").fit_transform(dataframe)
    datas = pd.DataFrame(discretizado)
    datas.index = dataframe.index
    datas.columns = dataframe.columns
    return datas
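A brief usage sketch on a made-up frame (pandas assumed):

import pandas as pd

df = pd.DataFrame({'a': [1.0, 1.2, 5.1, 5.3, 9.8],
                   'b': [0.1, 0.2, 0.3, 7.0, 7.2]})
print(discretiza_con_kmeans(df, 2))
# Each cell becomes the index of its 1D k-means bin, e.g. column 'b'
# splits into {0.1, 0.2, 0.3} -> bin 0 and {7.0, 7.2} -> bin 1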
Example #26
    def predict(self, x_test):
        predict = []

        # Discretize
        bins = preprocessing.KBinsDiscretizer(
            n_bins=([len(self.class_)] * len(x_test[0])),
            encode='ordinal',
            strategy='uniform').fit(x_test).transform(x_test)

        x_test = bins[:, self.index]

        for i in range(0, len(x_test)):
            predict.append(self.rule_set[x_test[i]])

        return predict
Example #27
 def kbins_discretizer(one_feature, n_bins=3, strategy='uniform'):
     '''
     strategy:
         uniform: all bins in each feature have identical widths.
         quantile: all bins in each feature have the same number of points.
         kmeans: values in each bin share the same nearest center of a
             1D k-means cluster.
     encode ('ordinal' here): method used to encode the transformed result.
     '''
     kbd = preprocessing.KBinsDiscretizer(n_bins=[n_bins], encode='ordinal',
                                          strategy=strategy)
     feature_transformed = kbd.fit_transform(one_feature).reshape(-1)
     print(kbd.n_bins_)
     print(kbd.bin_edges_)
     return feature_transformed
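A minimal sketch of how the three strategies place bin edges differently on the same skewed feature (NumPy and scikit-learn assumed):

import numpy as np
from sklearn import preprocessing

rng = np.random.default_rng(0)
skewed = rng.exponential(scale=2.0, size=(1000, 1))

for strategy in ('uniform', 'quantile', 'kmeans'):
    kbd = preprocessing.KBinsDiscretizer(n_bins=[3], encode='ordinal',
                                         strategy=strategy)
    kbd.fit(skewed)
    # uniform: equal widths; quantile: equal counts; kmeans: 1D cluster edges
    print(strategy, kbd.bin_edges_[0])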
Example #28
def k_bins(data, n_bins, encode="ordinal", strategy="quantile"):
    """
    Bin discretization
    * encode:
        - "ordinal"
        - "onehot"
        - "onehot-dense"
    * strategy:
        - "uniform"
        - "quantile"
        - "kmeans"
    """
    # KBinsDiscretizer's keyword is `encode`; passing `encoder` raises a TypeError
    est = preprocessing.KBinsDiscretizer(n_bins=n_bins, encode=encode,
                                         strategy=strategy)
    transformed_data = est.fit_transform(data)

    return transformed_data
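A usage sketch on a toy single-feature array (NumPy assumed); with the default quantile strategy each bin receives roughly the same number of rows:

import numpy as np

X = np.array([[-2.0], [-1.0], [0.0], [1.0], [2.0]])
print(k_bins(X, n_bins=3))  # ordinal codes, e.g. [[0.], [0.], [1.], [2.], [2.]]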
Example #29
    def descretization(self):
        try:
            for i in range(len(self.x)):
                temp = self.dataset[self.x[i]].values.reshape(-1, 1)
                est = preprocessing.KBinsDiscretizer(n_bins=5, encode='ordinal',
                                                     strategy='uniform')
                est.fit(temp)
                x_scaled = est.transform(temp)
                print(x_scaled)
                self.dataset[self.x[i]] = x_scaled

            heads = self.dataset.head(10).to_numpy()

            for i in range(10):
                for j in range(len(self.dataset.columns)):
                    self.table.setItem(i, j, QTableWidgetItem(str(heads[i][j])))

        except Exception as e:
            print(repr(e))
Example #30
 def fit(self, x_train, y_train):
     discretizer = preprocessing.KBinsDiscretizer(
         n_bins=2 * len(unique_labels(y_train)),
         encode='ordinal',
         strategy='kmeans')
     discretizer.fit(x_train)
     self.discretizer = discretizer
     df_data = pd.DataFrame(data=discretizer.transform(x_train))
     df_data['classe'] = y_train
     contingency_df = {}
     best_score = float('-inf')
     best_feature = None
     for col in list(df_data.iloc[:, :-1]):
         contingency_df[col] = pd.crosstab(df_data['classe'], df_data[col])
         score_feature = contingency_df[col].agg('max').sum()
         # keep best_score up to date, otherwise every feature passes the
         # comparison and the last column always wins
         if score_feature > best_score:
             best_score = score_feature
             best_feature = col
     self.best_feature = best_feature
     # build the probability table from the winning feature, not from
     # whichever column the loop happened to end on
     self.prob_table = contingency_df[best_feature].apply(lambda x: x / sum(x)).T