Exemplo n.º 1
0
    def binary_encoder(self, df, configger):
        """
        Binary-encode the configured columns of the training dataset.

        The features, the target and the columns to encode are all obtained
        from ``self.get_Xy(df, configger)``. The encoder is always built with
        ``verbose=1`` and ``return_df=True``; those two options are not read
        from ``configger`` by this method.

        :param df: the train dataset.
        :param configger: the json str of configger setting. The keys read here
            (each resolved through ``set_default_vale``, whose third argument is
            presumably the fallback value) are:
            drop_invariant: bool
                boolean for whether or not to drop columns with 0 variance. Fallback: False.
            handle_unknown: str
                options are 'error', 'return_nan', 'value', and 'indicator'. Fallback: 'value'.
                Warning: if indicator is used, an extra column will be added in if the
                transform matrix has unknown categories. This can cause unexpected
                changes in dimension in some cases.
            handle_missing: str
                options are 'error', 'return_nan', 'value', and 'indicator'. Fallback: 'value'.
                Warning: if indicator is used, an extra column will be added in if the
                transform matrix has nan values. This can cause unexpected changes in
                dimension in some cases.
        :return: the transform result of ``fit_transform`` on the features and target.
        """
        features, target, columns_to_encode = self.get_Xy(df, configger)

        # Options are resolved in the same order as before: drop_invariant,
        # handle_missing, handle_unknown.
        encoder_options = {
            "drop_invariant": set_default_vale("drop_invariant", configger, False, is_bool=True),
            "handle_missing": set_default_vale("handle_missing", configger, "value"),
            "handle_unknown": set_default_vale("handle_unknown", configger, "value"),
        }

        binary = BinaryEncoder(verbose=1, cols=columns_to_encode, return_df=True,
                               **encoder_options)

        return binary.fit_transform(features, target)
Exemplo n.º 2
0
class BinaryCipher:
    """Thin wrapper around a ``BinaryEncoder`` configured with ``return_df=False``."""

    def __init__(self, no_of_cols):
        """Create an encoder for the columns labelled ``0 .. no_of_cols - 1``.

        :param no_of_cols: number of leading columns to binary-encode.
        """
        self._binary_encoder = BinaryEncoder(cols=list(range(no_of_cols)),
                                             return_df=False)

    def encode(self, incoming_data, incoming_labels=None):
        """Encode ``incoming_data``, fitting first when labels are supplied.

        Shapes of the inputs and of the result are printed to stdout.

        :param incoming_data: data to encode; must expose ``.shape``.
        :param incoming_labels: when ``None`` the already-fitted encoder is
            only applied (predict path); otherwise the encoder is fitted on
            ``incoming_data`` / ``incoming_labels`` and then applied (fit
            path). Must expose ``.shape`` when given.
        :return: whatever the encoder's ``transform`` / ``fit_transform``
            returns; it must expose ``.shape``.
        """
        print('\nencode')
        print(f'incoming_data.shape: {incoming_data.shape}')

        if incoming_labels is None:  # predict
            result = self._binary_encoder.transform(incoming_data)
        else:  # fit
            print(f'incoming_labels.shape: {incoming_labels.shape}')
            result = self._binary_encoder.fit_transform(
                incoming_data, incoming_labels)

        print(f'result.shape: {result.shape}\n')
        return result
def ProcessTimestamp(dataframe, columns, method, mode):
    """Replace the ``timestamp`` column with calendar features.

    Four features are derived from ``dataframe['timestamp']`` and written onto
    the frame that was passed in: ``month`` (``.dt.month - 1``), ``mday``
    (``.dt.day - 1``), ``wday`` (``.dt.weekday``) and ``hour`` (``.dt.hour``).

    :param dataframe: frame with a datetime-like ``timestamp`` column. The
        four feature columns are added to this object. With
        ``method='ordinal'`` its ``timestamp`` column is also dropped in
        place; with ``method='binary'`` the drop happens on the concatenated
        copy instead.
    :param columns: column names to keep after the time features; any
        ``'timestamp'`` entry is filtered out.
    :param method: ``'ordinal'`` keeps the four integer features as they are;
        ``'binary'`` additionally binary-encodes them with ``BinaryEncoder``.
    :param mode: not used by this function.
    :return: tuple ``(frame, tcolumns)`` where ``frame`` holds the time
        feature columns followed by ``columns`` and ``tcolumns`` lists the
        time feature column names.
    :raises NotImplementedError: for any other ``method``; raised after the
        four feature columns have already been added to ``dataframe``.
    """
    time_fields = ['month', 'mday', 'wday', 'hour']
    kept_columns = [name for name in columns if name != 'timestamp']

    stamp = dataframe['timestamp']
    dataframe['month'] = stamp.dt.month - 1
    dataframe['mday'] = stamp.dt.day - 1
    dataframe['wday'] = stamp.dt.weekday
    dataframe['hour'] = stamp.dt.hour

    if method not in ('ordinal', 'binary'):
        raise NotImplementedError(
            f'must implement {method} for timestamp processing')

    if method == 'ordinal':
        tcolumns = list(time_fields)
    else:
        encoder = BinaryEncoder(cols=list(time_fields))
        # Reference frame the encoder is fitted on: 31 rows (one per possible
        # day of month); the shorter ranges are padded with 0.0 to match.
        reference = pd.DataFrame.from_records({
            'month': list(range(12)) + [0.0] * 19,
            'mday': np.arange(31),
            'wday': list(range(7)) + [0.0] * 24,
            'hour': list(range(24)) + [0.0] * 7,
        })
        reference.loc[:, time_fields] = (
            reference.loc[:, time_fields].astype('category'))

        encoder.fit(reference)
        encoded = encoder.transform(dataframe.loc[:, time_fields])
        dataframe = pd.concat([dataframe, encoded], axis=1)

        # Encoded column names are expected as <field>_1 .. <field>_<width>.
        bit_widths = (('month', 4), ('mday', 5), ('wday', 3), ('hour', 5))
        tcolumns = [
            f'{field}_{bit}'
            for field, width in bit_widths
            for bit in range(1, width + 1)
        ]

    dataframe.drop(columns=['timestamp'], inplace=True)
    return dataframe[tcolumns + kept_columns], tcolumns
Exemplo n.º 4
0
def create_preprocessing_pipe(X: pd.DataFrame,
                              y: pd.Series = None) -> Pipeline:
    """
    Build the preprocessing pipeline and fit it on the training features.

    The pipeline has two steps: ``bin_encoder`` (a ``BinaryEncoder`` applied
    to the ``brewery_name`` column) followed by ``scaler`` (a
    ``StandardScaler``).
    :param X: The dataframe of features
    :param y: The target series; accepted but not passed to ``fit``
    :return: the fitted pipeline object
    """
    steps = [
        ('bin_encoder', BinaryEncoder(cols=['brewery_name'])),
        ('scaler', StandardScaler()),
    ]
    preprocessing = Pipeline(steps)
    preprocessing.fit(X)
    return preprocessing
Exemplo n.º 5
0
# One-hot encoding with sklearn. The encoder needs a 2-D input (hence the
# double brackets) and produces one column per category, so the result is kept
# in its own frame instead of being assigned back to a single column.
enc = OneHotEncoder()
var_to_encode_ohe = pd.DataFrame(
    enc.fit_transform(df[['var_to_encode']]).toarray(),  # default output is sparse
    columns=[f'var_to_encode_{category}' for category in enc.categories_[0]],
    index=df.index,
)
# Use pandas get_dummies for categories encoded as strings.
# get_dummies returns a new frame, so the result has to be stored.
dummies_df = pd.get_dummies(df, columns=['col1', 'col2'])

# Ordinal encoding for categories which have an order (example: low/medium/high)
map_dict = {'low': 0, 'medium': 1, 'high': 2}
df['var_oe'] = df['var'].apply(lambda x: map_dict[x])
# We can also do it with sklearn's LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['var_oe'] = le.fit_transform(df['var'])

# BinaryEncoder: when a variable has many categories, one-hot encoding creates
# many columns. Binary encoding represents the categories as binary numbers and
# so needs far fewer columns. Use it only when the categorical variable has a
# high cardinality.
from category_encoders.binary import BinaryEncoder
be = BinaryEncoder(cols=['var'])
df = be.fit_transform(df)

# HashingEncoder
from category_encoders.hashing import HashingEncoder
he = HashingEncoder(cols=['var'])
df = he.fit_transform(df)

# Feature selection: drop attributes that provide no useful information for the task.
# Univariate feature scoring (chi-squared against the target) before training a model.
# chi2 has to be imported alongside SelectKBest, otherwise the next line raises NameError.
from sklearn.feature_selection import SelectKBest, chi2
bestfeatures = SelectKBest(score_func=chi2, k='all')
fit = bestfeatures.fit(X, Y)

dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(df.columns)
Exemplo n.º 6
0
    data[col].cat.set_categories(data[col].cat.categories,
                                 ordered=False,
                                 inplace=True)

print("Making Null values a seperate category...")
# Represent NaN as its own "NA" category in every column that has missing
# values. The result is assigned back explicitly: `inplace=` on
# `Series.cat.add_categories` was removed in pandas 2.0, and an inplace
# `fillna` on `data[col]` acts on a temporary under copy-on-write.
columns = data.columns
for col in columns:
    if data[col].isna().any():
        data[col] = data[col].cat.add_categories("NA").fillna("NA")

# Split the columns by cardinality: at most 15 distinct values -> one-hot,
# more than 15 -> binary encoding. Relies on describe() reporting a "unique"
# row, i.e. presumably every column is categorical at this point.
summary = data.describe().T
columns_one_hot = summary[summary["unique"] <= 15].index.tolist()
columns_binary = summary[summary["unique"] > 15].index.tolist()
# "HasDetections" is excluded from one-hot encoding (presumably the target).
columns_one_hot.remove("HasDetections")

print("One-hot encoding...")
data = pd.get_dummies(data, columns=columns_one_hot)

print("Binary encoding...")
binary_encoder = BinaryEncoder(cols=columns_binary, return_df=True)
data = binary_encoder.fit_transform(data)
# Shrink every int64 column (expected to be the 0/1 columns produced by the
# binary encoder) to uint8 to save memory.
added_cols = data.select_dtypes(include=["int64"]).columns.tolist()
data[added_cols] = data[added_cols].astype("uint8")

print("Saving back the processed data...")
dataset.save_as_feather(
    data,
    filename="./microsoft-malware-prediction/processed/train_preprocessed")
Exemplo n.º 7
0
# In[33]:


# Left-join the severity scores onto the main frame, then inner-join the
# vitals on ICUSTAY_ID.
df = (
    main_df2
    .merge(severity_scores_df, how='left',
           left_on='ICUSTAY_ID', right_on='icustay_id')
    .merge(vitals_df, how='inner', on='ICUSTAY_ID')
)


# In[34]:


# Binary encoding of the diagnosis codes (cast to str first)

from category_encoders.binary import BinaryEncoder

diagnosis_encoder = BinaryEncoder()
diagnosis_binary = diagnosis_encoder.fit_transform(df['ICD9_CODE'].astype(str))

# Dummy columns (first level dropped) for each remaining categorical feature;
# evaluated in the order listed.
gender_df, age_group_df, admtype_df, instype_df = (
    pd.get_dummies(df[source_column], prefix=dummy_prefix, drop_first=True)
    for source_column, dummy_prefix in (
        ('GENDER', 'gender_'),                   # gender
        ('ICUSTAY_AGE_GROUP', 'age_group_'),     # age group
        ('ADMISSION_TYPE', 'ADMISSION_TYPE_'),   # admission type
        ('INSURANCE', 'INSURANCE_TYPE_'),        # insurance type
    )
)

# Create dummies for car_unit
Exemplo n.º 8
0
 def __init__(self, no_of_cols):
     """Create the binary encoder for the columns labelled 0 .. no_of_cols - 1.

     The encoder is configured with ``return_df=False``.
     """
     column_labels = list(range(no_of_cols))
     self._binary_encoder = BinaryEncoder(cols=column_labels,
                                          return_df=False)