Example #1
def task3(file_path):
    binning3 = preprocessing.KBinsDiscretizer(n_bins=3)
    binning6 = preprocessing.KBinsDiscretizer(n_bins=6)
    binning9 = preprocessing.KBinsDiscretizer(n_bins=9)

    data = pd.read_csv(
        file_path,
        usecols=(
            7,  # loan - classifier
            16,  # bank_arg1
            18,  # y - classifier
        ))
    labels = data.columns.values
    timer = TicToc()

    # Data clean-up
    data = data_transform(data)

    print_table(data.head())

    X_data = data.drop(columns="bank_arg1")
    Y_data = data["bank_arg1"]

    # Training/testing sets
    X_train = X_data[:-2500]
    X_test = X_data[-2500:]
    Y_train = Y_data[:-2500]
    Y_test = Y_data[-2500:]

    timer.tic()
    binning3.fit(X_data)
    prediction3 = binning3.transform(X_data)
    timer.toc()
    elapsed3 = timer.elapsed

    timer.tic()
    binning6.fit(X_data)
    prediction6 = binning6.transform(X_data)
    timer.toc()
    elapsed6 = timer.elapsed

    timer.tic()
    binning9.fit(X_data)
    prediction9 = binning9.transform(X_data)
    timer.toc()
    elapsed9 = timer.elapsed

    # TODO: Fix evaluation for matrix
    acc3 = evaluate_result(prediction3, Y_data.values)
    acc6 = evaluate_result(prediction6, Y_data.values)
    acc9 = evaluate_result(prediction9, Y_data.values)

    print(f"Binning with 3 units: {acc3}% matched in {elapsed3}s")
    print(f"Binning with 6 units: {acc6}% matched in {elapsed6}s")
    print(f"Binning with 9 units: {acc9}% matched in {elapsed9}s")
Example #2
    def preprocess(self, file_name):
        """
        Preprocesses the data for the Random Forest so that it is properly formatted for
        our training model.
        """

        names = [
            "age", "wrk_cls", "edu", "edu_num", "marital_sts", "occu_code",
            "relation", "race", "sex", "gain", "loss", "hours", "country",
            "income"
        ]

        original_data = pd.read_csv(file_name,
                                    dtype=object,
                                    names=names,
                                    skipinitialspace=True)
        original_data = pd.DataFrame(original_data)

        imp = SimpleImputer(missing_values='?',
                            strategy='most_frequent')
        data = imp.fit_transform(original_data)

        enc = preprocessing.OrdinalEncoder()
        data = enc.fit_transform(data)

        est = preprocessing.KBinsDiscretizer(n_bins=8, encode="ordinal")
        data = est.fit_transform(data)
        return data
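As a rough illustration of what this impute-encode-discretize chain does, here is a self-contained sketch on a toy frame (the values and column names are invented for the demo):

import pandas as pd
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

toy = pd.DataFrame({"age": ["39", "50", "?"],
                    "wrk_cls": ["Private", "?", "State-gov"]})

step1 = SimpleImputer(missing_values="?", strategy="most_frequent").fit_transform(toy)
step2 = preprocessing.OrdinalEncoder().fit_transform(step1)  # strings -> integer codes
step3 = preprocessing.KBinsDiscretizer(n_bins=2, encode="ordinal",
                                       strategy="uniform").fit_transform(step2)
print(step3)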
Example #3
    def kbinData(self):
        print("kbinData")
        with open('training_data_set.txt', 'rb') as fp:
            training_set = pickle.load(fp)
        npa = np.empty([0, 70], dtype=int)
        ts_del_list = []
        for ts in training_set:
            try:
                npa = np.append(npa, np.array(training_set[ts]["d"]), axis=0)
            except (KeyError, ValueError):  # skip entries that are missing or misshaped
                ts_del_list.append(ts)
                continue
        for tdl in ts_del_list:
            del training_set[tdl]
        est = []

        for n in npa.transpose():
            est.append(
                preprocessing.KBinsDiscretizer(n_bins=10,
                                               encode='ordinal').fit_transform(
                                                   n.reshape(-1, 1)))

        np_est = np.array(est).transpose()[0]
        self.kbinChart(np_est)
        i = 0
        for ts in training_set:
            training_set[ts]["kbinD"] = np_est[i]
            i += 1
        with open('training_data_set_kbin', 'wb') as fp:
            pickle.dump(training_set, fp, protocol=pickle.HIGHEST_PROTOCOL)
        print("done")
 def __init__(self, column, dataframe, settings):
     AEncoder.__init__(self, column, dataframe, settings)
     self.encoder = preprocessing.KBinsDiscretizer(
         n_bins=self.settings.get('n_bins', 5),
         encode=self.settings.get('encode', 'onehot'),
         strategy=self.settings.get('strategy', 'quantile'))
     self.pickle_process(dataframe)
Example #5
 def execute(self, df: data.Frame) -> data.Frame:
     frame = copy.deepcopy(df)
     f = frame.getRawFrame()
     # Operation ignores nan values
     nanRows = f.iloc[:, list(self.__attributes.keys())].isnull()
     # For every column, transform every non-nan row
     columns = f.columns
     edges: Dict[int, List[float]] = dict()
     for col, k in self.__attributes.items():
         colName = columns[col]
         notNa = (~nanRows.loc[:, colName]).to_list()
         discretizer = skp.KBinsDiscretizer(n_bins=k, encode='ordinal',
                                            strategy=self.__strategy.value)
         # Discretize and convert to string (since categories are strings)
         result = discretizer.fit_transform(f.loc[notNa, colName].values.reshape(-1, 1)).astype(str)
         name: str = colName
         if self.__attributeSuffix:
             # Make a new column with all nans
             name = colName + self.__attributeSuffix
             f.loc[:, name] = np.nan
         # Assign column
         f.loc[notNa, [name]] = result
         f.loc[:, name] = f[name].astype(
             pd.CategoricalDtype(categories=[str(float(i)) for i in range(k)], ordered=True))
         edges[col] = discretizer.bin_edges_[0].tolist()
     # Log what has been done
     self.__logExecution(columns, edges)
     return data.Frame(f)
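The categorical conversion above works because, with encode='ordinal', the bin codes are exactly 0.0 … k-1 for k bins. A stripped-down sketch of the same NaN-aware trick on a single pandas Series (toy values, assuming only pandas and scikit-learn):

import numpy as np
import pandas as pd
from sklearn import preprocessing as skp

s = pd.Series([1.0, 2.5, np.nan, 7.0, 9.0])
k = 3
disc = skp.KBinsDiscretizer(n_bins=k, encode='ordinal', strategy='uniform')

notNa = s.notna()
codes = disc.fit_transform(s[notNa].values.reshape(-1, 1)).astype(str).ravel()

out = pd.Series(np.nan, index=s.index, dtype=object)
out[notNa] = codes
out = out.astype(pd.CategoricalDtype([str(float(i)) for i in range(k)], ordered=True))
print(out)               # '0.0', '0.0', NaN, '2.0', '2.0' as an ordered categorical
print(disc.bin_edges_[0])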
Example #6
 def __init__(
     self,
     *,
     hyperparams: Hyperparams,
     random_seed: int = 0,
     docker_containers: typing.Optional[typing.Dict[
         str, base.DockerContainer]] = None
 ) -> None:
     super().__init__(hyperparams=hyperparams,
                      random_seed=random_seed,
                      docker_containers=docker_containers)
     self._index = None
     self._training_inputs = None
     self._training_outputs = None
     self._origin_inputs = None  #for label encoder
     self._fitted = False
     self._cate_flag = None
     self._clf = Model('nb', bayesInf=1, PointInf=0)  #classifier
     self._LEoutput = preprocessing.LabelEncoder()  #label encoder
     self._Imputer = SimpleImputer(missing_values=np.nan,
                                   strategy='most_frequent')  #imputer
     self._nbins = 10
     self._Kbins = preprocessing.KBinsDiscretizer(
         n_bins=self._nbins, encode='ordinal',
         strategy='uniform')  #KbinsDiscretizer
     self._discTrainset = None
Example #7
 def __init__(
     self,
     *,
     hyperparams: Hyperparams,
     random_seed: int = 0,
     docker_containers: typing.Optional[typing.Dict[
         str, base.DockerContainer]] = None
 ) -> None:
     super().__init__(hyperparams=hyperparams,
                      random_seed=random_seed,
                      docker_containers=docker_containers)
     self._index = None
     self._problem_type = 'classification'
     self._training_inputs = None
     self._training_outputs = None
     self._fitted = False
     self._cate_flag = None
     self._LEoutput = preprocessing.LabelEncoder()  # label encoder
      self._Imputer = SimpleImputer(
          missing_values=np.nan,
          strategy='mean')  # imputer; strategy could come from self.hyperparams['Imputer_Strategy']
     self._nbins = self.hyperparams['nbins']
      self._Kbins = preprocessing.KBinsDiscretizer(
          n_bins=self._nbins, encode='ordinal', strategy='uniform'
      )  # KBinsDiscretizer; strategy could come from self.hyperparams['Discretizer_Strategy']
Example #8
    def __init__(self,
                 data,
                 k=4,
                 leaf_size=40,
                 lambdav=.5,
                 lap_vec=None,
                 bins=None):

        self.tree = KDTree(data, leaf_size=leaf_size)
        self.data = data

        if bins is not None:
            self.is_binned = True
            self.discretizer = preprocessing.KBinsDiscretizer(
                n_bins=bins, encode='ordinal').fit(data)
            # Snap every value to the center of its bin before building the tree
            self.data_binned = self.discretizer.inverse_transform(
                self.discretizer.transform(data))
            self.bin_tree = KDTree(self.data_binned, leaf_size=leaf_size)

        self.k = k
        self.lambdav = lambdav

        self.knn_dist_arr, self.knn_idx_arr = self.knn(data)
        self._pdist = self.pdist(data, self.knn_dist_arr, lap_vec)

        self._plof, self.nplof = self.plof(data, self.knn_idx_arr, self._pdist)
        self.loop_values = self.loop(self._plof)
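The transform/inverse_transform pair used above is a compact way to snap every sample to the center of its bin before building the second KDTree. A minimal sketch of that snapping, under no assumptions beyond scikit-learn:

import numpy as np
from sklearn import preprocessing

data = np.array([[0.1], [0.35], [0.62], [0.99]])
disc = preprocessing.KBinsDiscretizer(n_bins=2, encode='ordinal').fit(data)
snapped = disc.inverse_transform(disc.transform(data))
print(snapped.ravel())  # every value replaced by its bin center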
Example #9
def get_outside_feature(data):
    # `sourse` is assumed to be a module-level DataFrame; '收率' means "yield"
    est = preprocessing.KBinsDiscretizer(n_bins=5, encode='onehot-dense')
    new = est.fit_transform(sourse[['收率']])
    new = pd.DataFrame(data=new)
    new_feature = list(new.columns)
    sourse[new_feature] = new

    data = data.copy()
    len_first = len(data.columns)
    for i in data.columns:
        order_mean = sourse.groupby(i)['收率'].mean()
        order_max = sourse.groupby(i)['收率'].max()
        order_min = sourse.groupby(i)['收率'].min()
        order_std = sourse.groupby(i)['收率'].std().fillna(0)
        order_sum = sourse.groupby(i)['收率'].sum()
        order_median = sourse.groupby(i)['收率'].median()

        # data['new_'+i+'_mean'] = data[i].map(order_mean)
        # data['new_'+i+'_max'] = data[i].map(order_max)
        # data['new_'+i+'_min'] = data[i].map(order_min)
        # data['new_'+i+'_std'] = data[i].map(order_std)
        # data['new_'+i+'_sum'] = data[i].map(order_sum)
        # data['new_'+i+'_median'] = data[i].map(order_median)

        # for j in new_feature:
        # 	order_label = sourse.groupby(i)[j].count()
        # 	data['new_'+i+'_'+str(j)] = data[i].map(order_label)
    # With all of the mapping lines above commented out, no new columns are
    # added, so the slice below returns an empty frame
    data = data.iloc[:, len_first:].copy()

    return data
Example #10
    def preprocess(self, file_name):
        """
        Method to preprocess data to be in a format that is conducive
        to training.
        """
        names = ("age", "wrk_cls", "edu", "edu_num", "marital_sts",
                 "occu_code", "relation", "race", "sex", "gain", "loss",
                 "hours", "country", "income")

        original_data = pd.read_csv(file_name,
                                    dtype=object,
                                    names=names,
                                    skipinitialspace=True)
        original_data = pd.DataFrame(original_data)

        imp = SimpleImputer(missing_values='?',
                            strategy='most_frequent')
        data = imp.fit_transform(original_data)

        enc = preprocessing.OrdinalEncoder()
        data = enc.fit_transform(data)

        est = preprocessing.KBinsDiscretizer(encode="ordinal")
        data = est.fit_transform(data)

        return data
Example #11
 def __init__(
     self,
     *,
     hyperparams: Hyperparams,
     random_seed: int = 0,
     docker_containers: typing.Optional[typing.Dict[
         str, base.DockerContainer]] = None
 ) -> None:
     super().__init__(hyperparams=hyperparams,
                      random_seed=random_seed,
                      docker_containers=docker_containers)
     #parameters
     self._index = None
     self._fitted = False
     #hyperparameters
     self._nbins = self.hyperparams['nbins']
     self._strategy = self.hyperparams['strategy']
     self._method = self.hyperparams['method']
     self._thres_search_method = self.hyperparams['thres_search_method']
     self._threshold = self.hyperparams['threshold']
     self._bayes_factors = self.hyperparams['bayesfactors']
     self._problem_type = self.hyperparams['problem_type']
     #other parameters
     self._training_inputs = None
     self._training_outputs = None
     self._cate_flag = None
     self._LEoutput = preprocessing.LabelEncoder()  # label encoder
     self._Imputer = SimpleImputer(missing_values=np.nan,
                                   strategy='most_frequent')  # imputer
      self._Kbins = preprocessing.KBinsDiscretizer(
          n_bins=self._nbins, encode='ordinal', strategy=self._strategy
      )  # KBinsDiscretizer
Example #12
    def fit(self, x_train, y_train):
        self.class_  = unique_labels(y_train)
        self.x_train = x_train
        self.y_train = y_train

        self.candidates = list()
        self.predict_table = dict()
        self.predict_table_index = 0

        bins = [len(self.class_)] * len(self.x_train[0])
        bins = preprocessing.KBinsDiscretizer(
            n_bins=bins, encode='ordinal',
            strategy='uniform').fit(self.x_train).transform(self.x_train)

        iterations = len(self.x_train[0])

        for i in range(iterations):
            cross = pd.crosstab(bins[:,i], self.y_train)
            rules = self.gen_table_rules(cross)
            self.candidates.append(rules)

        self.predict_table_index = self.best_predict_table_index(bins, self.y_train, self.candidates)
        self.predict_table = self.candidates[self.predict_table_index]

        selected_column = x_train[:,self.predict_table_index]
        collection_of_classes = dict()
        self.rule_set = dict()

        for k in self.class_:
            collection_of_classes[k] = list()

        for index, element in enumerate(selected_column):
            collection_of_classes[y_train[index]].append(element)

        for key, value in collection_of_classes.items():
            self.rule_set[key] = sum(value) / len(value)
Example #13
    def fit(self, x_train, y_train):
        self.class_ = unique_labels(y_train)
        opt_acc = 0

        # Discretize
        bins = preprocessing.KBinsDiscretizer(
            n_bins=([len(self.class_)] * len(x_train[0])),
            encode='ordinal',
            strategy='uniform').fit(x_train).transform(x_train)

        # Build the tables
        for i in range(0, len(x_train[0])):
            table = pd.crosstab(bins[:, i], y_train)  # contingency table
            rule = self.gen_rule(table)  # build the rule from the table

            acc = self.acc_train(rule, bins[:, i], y_train)
            if acc >= opt_acc:
                opt_acc = acc
                self.index = i

        # Find the centroid
        selected_column = x_train[:, self.index]
        class_lists = dict()
        self.rule_set = dict()

        for val in self.class_:
            class_lists[val] = []

        for i in range(0, len(selected_column)):
            class_lists[y_train[i]].append(selected_column[i])

        for key, value in class_lists.items():
            self.rule_set[key] = sum(value) / len(value)
Example #14
def q2():
    # Create a variable representing intervals, similar to pd.cut
    discretizer = preprocessing.KBinsDiscretizer(n_bins=10,
                                                 encode='ordinal',
                                                 strategy='quantile')
    score_bins = discretizer.fit_transform(countries[['Pop_density']])
    return int(sum(score_bins[:, 0] >= 9))
Example #15
def to_categorial(data_pd_series, n_bins=10):
    bins_transform = data_pd_series.to_numpy().reshape(-1, 1)
    discretizer = preprocessing.KBinsDiscretizer(
        n_bins=n_bins).fit(bins_transform)
    tmp1 = pd.DataFrame(
        discretizer.transform(bins_transform).todense()).stack()
    return pd.Series(pd.Categorical(
        tmp1[tmp1 != 0].index.get_level_values(1))), discretizer.bin_edges_
Example #16
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:  # inputs: m x n numpy array
        if self._fitted:
            # get discrete bins from training data
            discTrainset = RelationSet(self._training_inputs, self._training_outputs.reshape(-1, 1))
            discTrainset.impute()
            discTrainset.discretize()
            discTrainset.remove()
            bins = discTrainset.NUM_STATES
            
            # convert categorical values to numerical values in testing data
            metadata = inputs.metadata
            [m, n] = inputs.shape
            X_test = np.zeros((m, n))
            for column_index in metadata.get_elements((metadata_base.ALL_ELEMENTS,)):
                if column_index is metadata_base.ALL_ELEMENTS:
                    continue
                column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
                semantic_types = column_metadata.get('semantic_types', [])
                if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_types:
                    LE = preprocessing.LabelEncoder()
                    LE = LE.fit(inputs.iloc[:, column_index])
                    X_test[:, column_index] = LE.transform(inputs.iloc[:, column_index])
                elif 'http://schema.org/Text' in semantic_types:
                    pass
                else:
                    temp = list(inputs.iloc[:, column_index].values)
                    for i in np.arange(len(temp)):
                        if bool(temp[i]):
                            X_test[i, column_index] = float(temp[i])
                        else:
                            X_test[i, column_index] = np.nan
            discTestset = RelationSet(X_test, [])
            discTestset.impute()
            X_test = discTestset.data
            index_list = np.setdiff1d(np.arange(discTrainset.num_features),np.array(discTrainset.removeIdx))
            X_test = X_test[:, index_list]
            # NOTE: the discretizer is re-fit on the test data here, reusing only
            # the bin counts learned from the training set
            est = preprocessing.KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy='uniform')
            est.fit(X_test)
            X_test = est.transform(X_test)
            t = time.time()
            output = self._clf.predict(X_test)
            self._time = time.time() - t + self._time
            if min(self._training_outputs) == 1:
                output = output + 1
            # label decode
            output = self._LEoutput.inverse_transform(output)
            
            # update metadata
            output = container.DataFrame(output, generate_metadata=False, source=self)
            output.metadata = inputs.metadata.clear(source=self, for_value=output, generate_metadata=True)
            
            for column_index, column_metadata in enumerate(self._target_columns_metadata):
                output.metadata = output.metadata.update_column(column_index, column_metadata, source=self)


            return CallResult(output)
        else:
            raise ValueError('Model should be fitted first.')
Example #17
def q2():
    est = preprocessing.KBinsDiscretizer(n_bins=10,
                                         encode='ordinal',
                                         strategy='quantile')
    pop_density_10bins = est.fit_transform(countries[['Pop_density']])
    plt.hist(pop_density_10bins)
    above_90 = np.sum(pop_density_10bins >= 9)
    # (pop_density_10bins > np.percentile(pop_density_10bins, 90)).sum()
    return above_90
Example #18
    def __init__(self, X_train, y_train):
        """
        Creates a new SalaryPredictor trained on the given features from the
        preprocessed census data to predicted salary labels. Performs and fits
        any preprocessing methods (e.g., imputing of missing features,
        discretization of continuous variables, etc.) on the inputs, and saves
        these as attributes to later transform test inputs.
        
        :param DataFrame X_train: Pandas DataFrame consisting of the
        sample rows of attributes pertaining to each individual
        :param DataFrame y_train: Pandas DataFrame consisting of the
        sample rows of labels pertaining to each person's salary
        """
        # [!] TODO
        # Columns grouped by the preprocessing they receive
        columns_to_encode = ["work_class", "education", "marital", "occupation_code",
                             "relationship", "race", "sex", "country"]
        columns_to_scale = ["age", "education_years", "hours_per_week"]
        columns_to_leave = ["capital_gain", "capital_loss"]
        self.columns_with_missing = ["work_class", "occupation_code", "country"]

        # Collect the category values of every string column for the one-hot encoder
        cat = [X_train[col].unique() for col in X_train.columns
               if pd.api.types.is_string_dtype(X_train[col])]

        # Impute missing categorical values before encoding them
        self.imp = SimpleImputer(missing_values="?", strategy="most_frequent")
        imp_columns = self.imp.fit_transform(X_train[columns_to_encode])

        self.ohe = preprocessing.OneHotEncoder(handle_unknown="ignore",
                                               categories=cat,
                                               sparse=False)  # sparse_output=False on scikit-learn >= 1.2
        self.le2 = preprocessing.LabelEncoder()
        self.Kbin = preprocessing.KBinsDiscretizer(n_bins=[2, 2, 2], encode="ordinal")
        self.scale = preprocessing.StandardScaler()

        # Scale the capital columns, discretize the remaining numeric columns,
        # and one-hot encode the imputed categorical columns
        scaled_columns = self.scale.fit_transform(X_train[columns_to_leave])
        ds_columns = self.Kbin.fit_transform(X_train[columns_to_scale])
        encoded_columns = self.ohe.fit_transform(imp_columns)

        processed_data = np.concatenate([ds_columns, encoded_columns, scaled_columns],
                                        axis=1)
        self.le = preprocessing.LabelEncoder()
        self.clf = LogisticRegression(max_iter=1000).fit(processed_data, y_train)
Example #19
    def predict(self, x_test):
        predict = list()

        # NOTE: the discretizer is re-fit on the test data; strictly speaking,
        # the binning fitted on the training data should be reused
        bins = [len(self.class_)] * len(x_test[0])
        bins = preprocessing.KBinsDiscretizer(
            n_bins=bins, encode='ordinal',
            strategy='uniform').fit(x_test).transform(x_test)

        for element in bins[:, self.predict_table_index]:
            predict.append(self.predict_table[element])

        return predict
Example #20
 def fit(self, data):
     # uniform: equal-width bins (used for integer features)
     # quantile: each bin contains approx. the same number of samples
     strategy = 'uniform' if util.data.is_int(
         data[self.k]) else 'quantile'
     self.est = preprocessing.KBinsDiscretizer(
         n_bins=self.n_bins, encode='onehot', strategy=strategy)
     self.est.fit(data[self.k].values.reshape(-1, 1))
     # bin_edges_[0] holds n_bins_ + 1 entries; store the fitted bin count
     self.n_bins = self.est.n_bins_[0]
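For a skewed feature the two strategies give very different edges, which is why the code above switches on the column type. A small comparison sketch (NumPy and scikit-learn only; the data is synthetic):

import numpy as np
from sklearn import preprocessing

x = np.random.exponential(size=(1000, 1))  # right-skewed feature
for strategy in ('uniform', 'quantile'):
    est = preprocessing.KBinsDiscretizer(n_bins=5, encode='ordinal',
                                         strategy=strategy).fit(x)
    print(strategy, est.n_bins_[0], np.round(est.bin_edges_[0], 2))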
Example #21
def test_Transformer(nasdaq):
    nasdaq = apply_robust(nasdaq)
    new_nasdaq = []
    for timeseries in nasdaq:
        if not (timeseries[-1] > 0.9999 or timeseries[-1] < 0.0001):
            new_nasdaq.append(timeseries)
    nasdaq = np.array(new_nasdaq)

    X, Y = nasdaq[:, :-1], nasdaq[:, -2:]

    discretizer = preprocessing.KBinsDiscretizer(n_bins=63, encode='ordinal')
    discretizer.fit(X)
    X = discretizer.transform(X)

    Y = np.diff(Y, axis=1)
    Y = np.sign(Y)
    Y = np.where(Y <= 0, Y * 0, Y)

    print(Y.shape)
    base_winrate = np.sum(Y) / len(Y)
    print(base_winrate)

    train_size = int(0.8 * len(X))
    trainX, trainY = X[:train_size], Y[:train_size]
    validX, validY = X[train_size:], Y[train_size:]
    print(trainX.shape, trainY.shape)

    learning_rate = CustomSchedule(d_model)
    optimizer = tf.keras.optimizers.Adam(learning_rate,
                                         beta_1=0.9,
                                         beta_2=0.98,
                                         epsilon=1e-9)
    loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)

    model = Transformer_Encoder(num_layers=6,
                                d_model=64,
                                num_heads=8,
                                dff=128,
                                input_vocab_size=64,
                                target_vocab_size=1,
                                rate=dropout_rate)
    checkpoint_path = "./SavedModel/Transformer/Transformer_FXTM.h5"
    checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                    save_weights_only=True,
                                                    save_best_only=True)
    model.compile(optimizer=optimizer, loss=loss_object, metrics=['accuracy'])
    model.build(input_shape=(None, 64))
    history = model.fit(trainX,
                        trainY,
                        validation_data=(validX, validY),
                        batch_size=256,
                        epochs=EPOCHS,
                        callbacks=[checkpoint])
Example #22
 def discretize(self):
     # Watch here: you might have to tweak n_bins a bit. If the data is
     # standardized and rescaled, though, 10 might be a good first attempt.
     # I'm currently looking for a way to make this process automatic.
     disc = preprocessing.KBinsDiscretizer(n_bins=10,
                                           encode="ordinal",
                                           strategy="uniform")
     continous_df = self.df[self.continous_vars]
     not_continous_df = self.df.drop(columns=self.continous_vars)
     continous_df = pd.DataFrame(disc.fit_transform(continous_df),
                                 columns=continous_df.columns)
     self.df = continous_df.join(not_continous_df)
     return self
Example #23
 def discritizeData(self, features):
     """
     Discritization des données
     Args:
         features:           Array de données
     Return:
         xKBinsDiscretizer:  Array de données discritizé
     """
     est = preprocessing.KBinsDiscretizer(n_bins=3,
                                          encode='ordinal',
                                          strategy='uniform')
     est.fit(features)
     xKBinsDiscretizer = est.transform(features)
     return xKBinsDiscretizer
Example #24
    def predict(self, x_test):
        predict = list()

        # NOTE: the discretizer is re-fit on the test data; strictly speaking,
        # the binning fitted on the training data should be reused
        bins = [len(self.class_)] * len(x_test[0])
        bins = preprocessing.KBinsDiscretizer(
            n_bins=bins, encode='ordinal',
            strategy='uniform').fit(x_test).transform(x_test)

        for element in bins[:, self.predict_table_index]:
            if element in self.predict_table:
                predict.append(self.predict_table[element])
            else:
                # Unseen bin: fall back to a random rule, then rebuild the table
                predict.append(choice(list(self.predict_table.items()))[1])
                self.predict_table = self.gen_table_rules(self.original_bin)

        return predict
Example #25
def discretiza_con_kmeans(dataframe, n):
    '''
    Takes a dataframe and a number of intervals into which we want to
    split it.

    Returns the dataframe discretized into n intervals using the k-means
    technique.
    '''

    # k-means binning, as the function name and docstring promise
    discretizado = preprocessing.KBinsDiscretizer(
        n_bins=n, encode='ordinal',
        strategy="kmeans").fit_transform(dataframe)
    datas = pd.DataFrame(discretizado)
    datas.index = dataframe.index
    datas.columns = dataframe.columns
    return datas
Example #26
    def predict(self, x_test):
        predict = []

        # Discretize
        bins = preprocessing.KBinsDiscretizer(
            n_bins=([len(self.class_)] * len(x_test[0])),
            encode='ordinal',
            strategy='uniform').fit(x_test).transform(x_test)

        x_test = bins[:, self.index]

        for i in range(0, len(x_test)):
            predict.append(self.rule_set[x_test[i]])

        return predict
Example #27
 def kbins_discretizer(one_feature, n_bins=3, strategy='uniform'):
     '''
     strategy:
         uniform  - all bins in each feature have identical widths.
         quantile - all bins in each feature have the same number of points.
         kmeans   - values in each bin have the same nearest center of a
                    1D k-means cluster.
     '''
     kbd = preprocessing.KBinsDiscretizer(n_bins=[n_bins], encode='ordinal', strategy=strategy)
     feature_transformed = kbd.fit_transform(one_feature).reshape(-1,)
     print(kbd.n_bins_)
     print(kbd.bin_edges_)
     return feature_transformed
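A hedged usage sketch of the helper above, assuming it is available as a module-level function (the feature values are invented):

import numpy as np

one_feature = np.array([1.0, 1.2, 5.1, 5.3, 9.8, 10.0]).reshape(-1, 1)
binned = kbins_discretizer(one_feature, n_bins=3, strategy='kmeans')
print(binned)  # e.g. [0. 0. 1. 1. 2. 2.], one k-means bin per cluster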
Example #28
def k_bins(data, n_bins, encoder = "ordinal", strategy = "quantile"):
    """
    Bin discretization
    * encode:
        - "ordinal"
        - "onehot"
        - "onehot-dense"
    * strategy:
        - "uniform"
        - "quantile"
        - "kmeans"
    """
    est = preprocessing.KBinsDiscretizer(n_bins=n_bins, encode=encoder, strategy=strategy)
    transformed_data = est.fit_transform(data)

    return transformed_data
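A short usage sketch of k_bins on toy 2-D input, showing how the strategy changes the split:

import numpy as np

data = np.array([[1.0], [2.0], [3.0], [10.0]])
print(k_bins(data, n_bins=2))                      # quantile: [[0.] [0.] [1.] [1.]]
print(k_bins(data, n_bins=2, strategy="uniform"))  # uniform:  [[0.] [0.] [0.] [1.]]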
Example #29
    def descretization(self):
        try:
            # Discretize each selected column into 5 uniform bins
            for i in range(len(self.x)):
                temp = self.dataset[self.x[i]].values.reshape(-1, 1)
                est = preprocessing.KBinsDiscretizer(n_bins=5, encode='ordinal',
                                                     strategy='uniform')
                est.fit(temp)
                x_scaled = est.transform(temp)
                self.dataset[self.x[i]] = x_scaled

            # Show the first 10 rows in the table widget
            heads = self.dataset.head(10).to_numpy()
            for i in range(10):
                for j in range(len(self.dataset.columns)):
                    self.table.setItem(i, j, QTableWidgetItem(str(heads[i][j])))

        except Exception as e:
            print(repr(e))  
Example #30
 def fit(self, x_train, y_train):
     discretizer = preprocessing.KBinsDiscretizer(
         n_bins=2 * len(unique_labels(y_train)),
         encode='ordinal',
         strategy='kmeans')
     discretizer.fit(x_train)
     self.discretizer = discretizer
     df_data = (pd.DataFrame(data=discretizer.transform(x_train)))
     df_data['classe'] = y_train
     contingency_df = {}
     best_score = float('-inf')
     best_feature = None
     for col in list(df_data.iloc[:, :-1]):
         contingency_df[col] = pd.crosstab(df_data['classe'], df_data[col])
         score_feature = contingency_df[col].agg('max').sum()
         if score_feature > best_score:
             best_score = score_feature
             best_feature = col
     self.best_feature = best_feature
     self.prob_table = contingency_df[best_feature].apply(lambda x: x / sum(x)).T