示例#1
0
def load_data(train_pkl,
              test_pkl,
              label='is_y2',
              method=None,
              target_pn_ratio=None,
              seed=None):
    """Load pickled train/test frames and split them into features/labels.

    parameter
    ----------
    train_pkl: path of the pickled training DataFrame
    test_pkl: path of the pickled test DataFrame
    label: name of the label column present in both frames
    method: sampling method forwarded to ``train_sampling``; defaults to
        'up' when ``target_pn_ratio`` is given and ``method`` is falsy
    target_pn_ratio: when truthy, the training frame is resampled through
        ``train_sampling`` with this value as ``pn_ratio``
    seed: seed forwarded to ``train_sampling`` only; the final shuffle
        always uses ``random_state=42``

    return
    ----------
    (train_x, train_y, test_x, test_y), where the ``*_x`` frames are the
    inputs without the label column and the ``*_y`` series are the label
    column.

    raise
    ----------
    AssertionError if the training features contain any missing value.
    """
    # NOTE(review): read_pickle executes arbitrary code embedded in the file;
    # only load pickles that come from a trusted source.
    train_data = pd.read_pickle(train_pkl)
    test_data = pd.read_pickle(test_pkl)
    shapes = 'train-shape={}\t test_shape={}'.format(train_data.shape,
                                                     test_data.shape)
    print(colorize(shapes, 'blue', True))
    print(train_data.head())
    print(test_data.head())

    # Use the requested label column (not a hard-coded 'is_y2') and count
    # with vectorised sums; report inf instead of dividing by zero.
    n_pos = int((train_data[label] == 1).sum())
    n_neg = int((train_data[label] == 0).sum())
    pn_ratio = n_pos / n_neg if n_neg else float('inf')
    print(colorize('naive-pn-ratio={:.4f}'.format(pn_ratio), 'blue', True))

    if target_pn_ratio:
        method = method or 'up'
        train_data = train_sampling(train_data,
                                    col=label,
                                    method=method,
                                    pn_ratio=target_pn_ratio,
                                    seed=seed)
    train_data = shuffle(train_data, random_state=42)

    print('shuffle:\n', train_data.head(20))
    print(colorize('train-shape={}'.format(train_data.shape), 'blue', True))

    train_y = train_data[label]
    train_x = train_data.drop(columns=[label])
    test_y = test_data[label]
    test_x = test_data.drop(columns=[label])
    assert train_x.isna().sum().sum() == 0, \
        'training features contain missing values'
    return train_x, train_y, test_x, test_y
    def _encoder_column(self, data, prefix, prefix_sep, dtype):
        if dtype is None:
            dtype = np.uint8

        maps = self.mapping.get(prefix, {}) or self.tail_mapping.get(
            prefix, {})
        dummy_strs = cycle([u'{prefix}{sep}{val}'])
        dummy_cols = [
            dummy_str.format(prefix=prefix, sep=prefix_sep, val=str(v))
            for dummy_str, v in zip(dummy_strs, maps.keys())
        ]

        out_shape = (len(data), len(dummy_cols))

        if isinstance(data, Series):
            index = data.index
        else:
            index = None
        data.reset_index(drop=True, inplace=True)  # data :Series

        data2 = data.map(maps)
        null = data2[data2.isna()].index
        data2 = data2[data2.notna()]
        if not null.empty:
            print(
                colorize(
                    "{} only exist in test data column '{}'".format(
                        set(data[null].values), prefix), 'cyan', True))
        row_idxs = data2.index.tolist()
        col_idxs = data2.values.tolist()
        sarr = csr_matrix((np.ones(len(row_idxs)), (row_idxs, col_idxs)),
                          shape=out_shape,
                          dtype=dtype)
        if pd.__version__ >= '0.25':
            out = pd.DataFrame.sparse.from_spmatrix(
                sarr, index=index,
                columns=dummy_cols)  #sparse accessor, out.sparse.to_dense()
            # dense.astype('Sparse[int]'), dense.astype(pd.SparseDtype(int,fill_value=0))
            # out.astype(pd.SparseDtype(int, fill_value=0))
        else:
            out = pd.SparseDataFrame(sarr,
                                     index=index,
                                     columns=dummy_cols,
                                     default_fill_value=0,
                                     dtype=dtype)
        return out.astype(dtype)  # care of row and columns not covered by sarr
示例#3
0
    train_data = train_sampling(train_data,
                                col='is_y2',
                                method='down',
                                pn_ratio=0.2,
                                seed=2019)
    # train_data = train_sampling(train_data, col='is_y2', method='up', pn_ratio=0.5,seed=2019)
    pn_ratio = sum(train_data.is_y2 == 1) / sum(train_data.is_y2 == 0)

    print(train_data.head())
    print(val_data.head())
    train_data = train_data.values
    np.random.shuffle(train_data)
    # np.random.shuffle(train_data)
    n_state = train_data.shape[1] - 1
    n_action = 1
    print(colorize('pn-ratio={}'.format(pn_ratio), 'cyan', True))
    print(
        colorize('action_dim=%d, state_dim=%d' % (n_action, n_state), 'cyan',
                 True))
    print(
        colorize(
            'train_shape={}, val_shape={}'.format(train_data.shape,
                                                  val_data.shape), 'cyan',
            True))

    checkpoint_queen = MinHeap(max_size=5, compare_key=operator.itemgetter(0))
    logger = Logger(output_dir='../assets', output_fname='ddpg_epoch_log')

    config = get_session_config(frac=0.4, allow_growth=True, gpu="0")
    ddpg = DDPG(n_state=n_state,
                n_action=n_action,
示例#4
0
    net.optimizer = Adam(
        lr=0.001)  # Adam(lr=0.001)  SGD(lr=0.001,nesterov=False)
    history = net.fit(train_x.values,
                      train_y.values,
                      batch_size=64,
                      epochs=200,
                      verbose=1,
                      shuffle=True,
                      validation_data=(val_x.values, val_y.values),
                      class_weight=None)

    df = pd.DataFrame(history.history)
    df.to_csv('../assets/misc_' + datetime.now().strftime('%m%d_%H%M') +
              '.csv',
              index=False)
    print(colorize('done'.center(50, '-'), 'green', True))

# model = Sequential()
# l1 =0
# l2 =0
# model.add(Dense(units=64, activation='relu',input_dim=train_x.shape[1],
#                 kernel_regularizer=None,            #regularizers.l1_l2(l1=l1, l2=l2)
#                 kernel_initializer = 'he_normal',name='fc1'))
#
# model.add(Dense(units=64, activation=None, kernel_regularizer=None,
#                 kernel_initializer='he_normal',name='fc2'))
# model.add(BatchNormalization())
# model.add(Activation(activation='relu'))
#
# model.add(Dense(units=64, activation=None, kernel_regularizer=None,
#                 kernel_initializer='he_normal'))
    def fit(self, X, y=None, cols_to_encode=None, extra_numeric_cols=None):
        """
        Learn the encoding / scaling rules from a training DataFrame.

        parameter
        ----------
        X: DataFrame  to generate one-hot-encoder rule
        y: label column in DataFrame X if provided

        cols_to_encode: specify the columns  to be  encoded
        extra_numeric_cols: if cols_to_encode is provided this param will
           not be used, otherwise all object columns and extra_numeric_cols
           will be encoded.

        return
        ----------
        self, with ``_dim``, ``drop_cols``, ``encode_cols``, ``double_cols``,
        ``mapping``, ``scaler`` (and the ``tail_*`` / ``long_tail_cols``
        attributes when long-tail discretization is enabled) updated.
        ``drop_cols``, ``mapping`` and ``scaler`` are extended in place and
        are expected to exist before this call.

        raise
        ----------
        ValueError if ``y`` is not a column of ``X``, if
        ``long_tail_preproc`` is set to anything but 'discretize', or if
        ``double_preproc`` contains neither 'minmax' nor 'normal'.
        """
        print('fitting....')
        assert isinstance(X, DataFrame), 'X should be DataFrame object'
        columns = X.columns.tolist()
        if y is not None:
            if y not in columns:
                raise ValueError('y is not in X.columns during {}.fit'.format(
                    self.__class__.__name__))
            else:
                columns.remove(y)

        # number of input columns (label excluded); checked in transform
        self._dim = len(columns)

        # drop null cols
        nulls = X.isnull().sum(axis=0) / len(X)
        drop_null_cols = nulls[nulls >= self.drop_na_ratio].index.tolist()
        X = X.drop(columns=drop_null_cols)
        self.drop_cols.extend(drop_null_cols)
        print(
            colorize(
                'drop_null_cols({})={}'.format(len(drop_null_cols),
                                               drop_null_cols), 'blue', True))

        # get encoder columns; always work on a private list so that neither
        # the caller's ``cols_to_encode`` nor the list returned by
        # ``get_encode_cols`` is mutated below
        if cols_to_encode is None:
            cols = list(self.get_encode_cols(X))
            cols += list(extra_numeric_cols) if extra_numeric_cols else []
        else:
            cols = list(cols_to_encode)
        if y in cols:
            cols.remove(y)
        # de-duplicate while keeping the column order of X
        cols = sorted(list(set(cols)), key=columns.index)

        # convert na to sentinel value
        df = X[cols].fillna(self.na_sentinel, downcast='infer').astype(str)

        # generate rules
        colvals = {}
        drop_cat_cols = []
        for col in cols:
            values = df[col].unique().tolist()
            if str(self.na_sentinel) in values and not self.dummy_na:
                values.remove(str(self.na_sentinel))
            # keep columns with 2..category_threshold distinct values only
            if 1 < len(values) <= self.category_threshold:
                colvals[col] = values
            else:
                drop_cat_cols.append(col)
        print(
            colorize(
                'drop_cat_cols({})={}'.format(len(drop_cat_cols),
                                              drop_cat_cols), 'blue', True))
        self.drop_cols.extend(drop_cat_cols)
        self.encode_cols = list(sorted(colvals.keys(), key=df.columns.get_loc))
        # every remaining column is treated as a numeric ("double") column
        self.double_cols = [
            col for col in columns
            if col not in self.encode_cols and col not in self.drop_cols
        ]

        # value -> dummy column position, in order of first appearance
        for col in self.encode_cols:
            vals = colvals[col]
            self.mapping[col] = OrderedDict(
                {val: i
                 for i, val in enumerate(vals)})

        # long-tail-distribution(in double cols)
        if self.long_tail_preproc is not None:
            skews = X[self.double_cols].skew(skipna=True)
            thres = self.kwargs.get('skew_threshold', 5)
            lt_cols = skews[abs(skews) > thres].index.tolist()
            print(
                colorize('long-tail-cols({})={}'.format(len(lt_cols), lt_cols),
                         'blue', True))

            if self.long_tail_preproc == 'discretize':
                drop_tail_cols = []
                self.tail_bins = {}
                self.tail_mapping = {}
                self.long_tail_cols = []
                self.buckets = self.kwargs.get('buckets', 5)
                # bucket labels 'a', 'b', 'c', ...
                self.labels = list(map(chr,
                                       ord('a') + np.arange(self.buckets)))
                for col in lt_cols:
                    if X[col].unique().size < self.buckets:
                        drop_tail_cols.append(col)
                        continue
                    _, bins = pd.qcut(X[col],
                                      q=self.buckets,
                                      labels=None,
                                      retbins=True,
                                      duplicates='drop')
                    if bins.size < 3:  # at least 2 bins
                        drop_tail_cols.append(col)
                        continue
                    self.tail_bins[col] = bins.tolist()
                    self.tail_mapping[col] = OrderedDict(
                        {chr(ord('a') + i): i
                         for i in range(bins.size - 1)})
                    # an extra 'null' category only when more than 1% of
                    # the rows are missing
                    if (X[col].isna().sum() / len(X)) > 0.01:
                        self.tail_mapping[col]['null'] = bins.size - 1
                    self.double_cols.remove(col)
                    self.long_tail_cols.append(col)

                self.drop_cols.extend(drop_tail_cols)
                self.encode_cols.extend(self.long_tail_cols)

                for col in drop_tail_cols:
                    self.double_cols.remove(col)
                print(
                    colorize(
                        'drop_tail_cols({})={}'.format(len(drop_tail_cols),
                                                       drop_tail_cols), 'blue',
                        True))
            else:
                # TODO(yuanyuqing163): implement boxcox transformation
                raise ValueError(
                    "boxcox transformation hasn't implemented yet")

        # mean is also the fill value for missing numerics in transform
        self.scaler['mean'] = X[self.double_cols].mean()
        self.scaler['std'] = X[self.double_cols].std()
        if 'minmax' in self.double_preproc:
            self.scaler['min'] = X[self.double_cols].min()
            self.scaler['max'] = X[self.double_cols].max()
        elif 'normal' not in self.double_preproc:
            raise ValueError(
                'double_process_type = {} not supported yet'.format(
                    self.double_preproc))

        return self
    def transform(self, X, y=None, dtype=None, inplace=False):
        """
        Apply the fitted rules: scale numeric columns, one-hot encode the rest.

        parameter
        -----------
        X: DataFrame with the same number of columns (label excluded) as the
            frame seen during fit
        y: name of the label column in X; when given, the column is appended
            unchanged to the result
        dtype: specifies the dtype of encoded value
        inplace: when False (default) X is copied first, otherwise the
            caller's frame is modified while transforming

        return
        -----------
        DataFrame made of the processed numeric columns, the dummy columns
        of every encoded column and, if requested, the label column.

        raise
        -----------
        ValueError if ``y`` is not in X, if a fitted column is missing from
        X, or if ``long_tail_preproc`` is 'boxcox'.
        AssertionError if X is not a DataFrame or its width differs from
        the fitted one.
        """
        print('transform....')
        assert isinstance(X, DataFrame), 'X shoule be DataFrame object'
        columns = X.columns.tolist()
        target_df = []
        if y is not None:
            if y not in columns:
                raise ValueError("'y label {}' not in X".format(y))
            columns.remove(y)
            target_df = [X.loc[:, [y]]]

        assert self._dim == len(columns)

        diff_cols = set(self.encode_cols +
                        self.double_cols).difference(columns)
        if len(diff_cols) > 0:
            raise ValueError(
                "X not includes encoded columns '{}'".format(diff_cols))

        if not inplace:
            X = X.copy()  # X=X.copy(deep=True)
            gc.collect()

        X.drop(self.drop_cols, axis=1, inplace=True)

        # missing numerics are replaced by the training mean
        X[self.double_cols] = X[self.double_cols].fillna(self.scaler['mean'])
        if 'normal' in self.double_preproc:
            X[self.double_cols] = (X[self.double_cols] - self.scaler['mean']
                                   ) / (self.scaler['std'] + self.epsilon)

        #TODO:truncate interval [min,max]
        if 'minmax' in self.double_preproc:
            lbound = self.kwargs.get('lbound', 0)
            # 'hbound' is the intended key; the historical misspelling
            # 'hbould' is still honoured so existing configs keep working.
            hbound = self.kwargs.get('hbound', self.kwargs.get('hbould', 1))
            # NOTE(review): a constant column (max == min) divides by zero
            # here; confirm whether such columns can reach this point.
            X[self.double_cols] = lbound + (
                X[self.double_cols] - self.scaler['min']
            ) / (self.scaler['max'] - self.scaler['min']) * (hbound - lbound)

        # long_tail_feature
        if self.long_tail_preproc == 'discretize':
            print('long_tail_discretize.....')
            for col in self.long_tail_cols:
                bins = self.tail_bins[col]
                buckets = len(bins) - 1
                # searchsorted returns 0..buckets+1; positions outside the
                # fitted range are folded into the first / last bucket
                idx = list(range(buckets + 2))
                val = [
                    self.labels[0], *self.labels[:buckets],
                    self.labels[buckets - 1]
                ]
                idx2val = dict(zip(idx, val))
                X[col] = X[col].map(lambda x: np.searchsorted(bins, x),
                                    na_action='ignore')
                X[col] = X[col].map(idx2val)
                if 'null' in self.tail_mapping[col]:
                    X[col] = X[col].fillna('null')
                else:
                    print(
                        colorize(
                            "'null' not long tail in '{}'=#{}".format(
                                col, X[col].isna().sum()), 'cyan', True))

        elif self.long_tail_preproc == 'boxcox':
            # the placeholder was missing, so the value was never reported
            raise ValueError('unsupported long_tail_preproc type {}'.format(
                self.long_tail_preproc))

        data_to_encode = X[self.encode_cols].fillna(
            self.na_sentinel, downcast='infer').astype(str)
        with_dummies = [X[self.double_cols]]
        print('encoding....')
        # items() instead of iteritems(), which was removed in pandas 2.0;
        # the columns of data_to_encode are exactly self.encode_cols
        for name, series in data_to_encode.items():
            dummy = self._encoder_column(series, name, '_', dtype=dtype)
            with_dummies.append(dummy)

        result = pd.concat(with_dummies + target_df, axis=1)

        return result