Exemplo n.º 1
0
def sector_filtering(portfolio):
    """Recommend market leads sharing sector/revenue clusters with a portfolio.

    NOTE(review): relies on module-level globals (df_reduced, df_dummy, df_id,
    num_components) defined elsewhere in the file — confirm they are populated
    before this is called.

    Parameters
    ----------
    portfolio : pd.DataFrame
        Must contain an 'id' column identifying current clients.

    Returns
    -------
    market : list
        Candidate lead ids, sorted by descending similarity score.
    test : list
        Portfolio ids held out of the 70% sample (for evaluation).
    """
    # Ordinal-encode sector and estimated-revenue-range as cluster labels.
    df_reduced['labels'] = OrdinalEncoder(cols=['setor']).fit_transform(df_dummy['setor'])
    df_reduced['labels2'] = OrdinalEncoder(cols=['de_faixa_faturamento_estimado_grupo']).fit_transform(df_dummy['de_faixa_faturamento_estimado_grupo'])

    X = pd.concat([df_id, df_reduced], axis='columns')

    # portfolio information
    pf_filled = X.loc[X['id'].isin(portfolio['id'].values)]

    # part of the market that shares the same clusters
    pf_out = X.loc[X['labels'].isin(list(pf_filled['labels'].unique()))]
    # BUG FIX: the second mask must be built from pf_out (already filtered),
    # not from X — a boolean Series indexed like X is unalignable against
    # the pf_out subset and raises an IndexingError in modern pandas.
    pf_out = pf_out.loc[pf_out['labels2'].isin(list(pf_filled['labels2'].unique()))]

    # customers that are not yet on the company's portfolio
    sample = pf_filled.iloc[:, :num_components-1].sample(frac=0.7, random_state=42)  # num_comp-1 for it not to account for the labels in the dot product

    pf_rec = pf_out.loc[~pf_out['id'].isin(sample['id'])]
    # .copy() so the 'score' assignment below does not hit SettingWithCopyWarning
    pf_rec = pf_rec.iloc[:, :num_components-1].copy()  # num_comp-1 for it not to account for the labels in the dot product

    cosine_sim = cosine_similarity(pf_rec.drop(['id'], axis='columns'), sample.drop(['id'], axis='columns'))
    cosine_sim = np.sum(cosine_sim, axis=1)  # best results with sum. amax and mean already tested

    pf_rec['score'] = list(cosine_sim)

    # list new leads to recommend
    market = list(pf_rec.sort_values('score', ascending=False)['id'])
    test = list(pf_filled.loc[~pf_filled['id'].isin(sample['id'])]['id'])

    return market, test
Exemplo n.º 2
0
    def test_display_dataset_analysis_3(self, mock_correlation_matrix):
        """
        Report generation must not fail when every feature is categorical,
        and the correlation matrix must not be computed in that case.
        """
        frame = self.df.copy()
        frame['x1'] = 'a'
        frame['x2'] = frame['x2'].astype(str)

        ordinal = OrdinalEncoder(
            cols=['x1', 'x2'],
            handle_unknown='ignore',
            return_df=True)
        frame = ordinal.fit(frame).transform(frame)

        model = cb.CatBoostClassifier(n_estimators=1).fit(frame[['x1', 'x2']], frame['y'])
        explainer = SmartExplainer()
        explainer.compile(model=model, x=frame[['x1', 'x2']])
        report = ProjectReport(
            explainer=explainer,
            project_info_file=os.path.join(current_path, '../../data/metadata.yaml'),
            x_train=frame[['x1', 'x2']],
        )

        report.display_dataset_analysis()

        self.assertEqual(mock_correlation_matrix.call_count, 0)
def categoryEncode(df, cols=None, mode="binary"):
    """Encode categorical columns of *df*.

    Parameters
    ----------
    df : pd.DataFrame
        Data to encode.
    cols : list[str] | None
        Columns to encode; None lets the encoder pick categorical columns.
    mode : str
        Either "ordinal" or "binary" (default).

    Returns
    -------
    pd.DataFrame
        Encoded copy of *df*.

    Raises
    ------
    ValueError
        If *mode* is not supported.  Previously an unknown mode fell through
        and raised a confusing NameError on the unbound `encoder` variable.
    """
    if mode == "ordinal":
        encoder = OrdinalEncoder(cols=cols, handle_missing="return_nan", handle_unknown="return_nan")
    elif mode == "binary":
        encoder = BinaryEncoder(cols=cols)
    else:
        raise ValueError(f"unsupported mode: {mode!r} (expected 'ordinal' or 'binary')")
    df_new = encoder.fit_transform(df)
    return df_new
Exemplo n.º 4
0
    def _fit_learn_categorical_variables(train_df: DataFrame):
        """
        Fit an ordinal encoder that maps the wine-review categorical
        columns to integer codes.

        Parameters
        ----------
        train_df: DataFrame
            Training dataframe containing the categorical columns listed below.

        Returns
        -------
            encoder: OrdinalEncoder
                Fitted encoder; apply its ``transform`` to turn the
                categorical variables into numerical ones.
        """
        categorical_columns = [
            'country', 'province', 'region_1', 'region_2',
            'taster_name', 'taster_twitter_handle', 'variety',
        ]

        # OrdinalEncoder.fit returns the encoder itself, so fit-and-return in one step.
        return OrdinalEncoder(cols=categorical_columns, return_df=True).fit(train_df)
Exemplo n.º 5
0
    def encode_cat_col(self):
        """Ordinal-encode self.categ_col in place (return_df=False yields an ndarray)."""
        ordinal = OrdinalEncoder(return_df=False)
        ordinal.fit(self.categ_col)
        self.categ_col = ordinal.transform(self.categ_col)

        # DEBUG
        print(self.DS)
        print(self.categ_col)
Exemplo n.º 6
0
def fit_label(input_df: pd.DataFrame, cols: List[str], na_value: Any = None):
    """
    Creates the label encoder by fitting it through the given DataFrame
    NaN values and Special value specified under `na_value` in the DataFrame will be encoded as unseen value.
    Args:
        input_df: DataFrame used to fit the encoder
        cols: List of categorical columns to be encoded
        na_value: Default null value for DataFrame
    Returns:
        result_df: encoded input_df DataFrame
        model : encoder model to be passed to `transform_label` method
    """
    df = input_df.copy()

    # Normalise the caller-supplied sentinel to NaN so it shares the NaN handling below.
    if na_value is not None:
        for col in cols:
            df[col] = df[col].replace({na_value: np.nan})

    encoder = OrdinalEncoder(cols=cols)
    encoder = encoder.fit(df)
    # category_encoders keeps one mapping Series per encoded column; force the
    # NaN entry to -2 so missing values get their own code (unknowns map to -1).
    # NOTE(review): this pokes the encoder's internal `mapping` attribute —
    # verify it still holds across category_encoders upgrades.
    for idx in range(len(encoder.mapping)):
        encoder.mapping[idx]["mapping"].loc[np.nan] = -2

    result_df = encoder.transform(df)

    # Collapse both "unknown" (-1) and "missing" (-2) codes onto 0, then cast to int.
    for col in cols:
        result_df[col] = result_df[col].replace({-1: 0, -2: 0})
        result_df[col] = result_df[col].astype(int)

    # Everything `transform_label` needs to reproduce this encoding later.
    model = {"encoder": encoder, "cols": cols, "na_value": na_value}
    return result_df, model
Exemplo n.º 7
0
    def __init__(self,
                 sparksess=None,
                 logdir='/encoder',
                 handle_unknown='-99999',
                 save_encoder=False):
        """Set up one encoder of each kind, all starting with empty feature lists.

        Parameters
        ----------
        sparksess : SparkSession or None
            Optional Spark session used elsewhere in the class.
        logdir : str
            Directory used when persisting encoders.
        handle_unknown : str
            Passed through to every category_encoders encoder.
        save_encoder : bool
            Whether fitted encoders should be saved.
        """
        self.spark = sparksess
        self.logdir = logdir
        # BUG FIX: this line was `self.save_encoder` — a bare attribute access
        # that raises AttributeError — instead of storing the constructor arg.
        self.save_encoder = save_encoder

        self.ordinal_encoder_features = []
        self.onehot_encoder_features = []
        self.count_encoder_features = []
        self.target_encoder_features = []
        self.ordinal_encoder = OrdinalEncoder(
            cols=self.ordinal_encoder_features,
            return_df=True,
            handle_unknown=handle_unknown)
        self.onehot_encoder = OneHotEncoder(cols=self.onehot_encoder_features,
                                            return_df=True,
                                            handle_unknown=handle_unknown)
        self.count_encoder = CountEncoder(cols=self.count_encoder_features,
                                          return_df=True,
                                          handle_unknown=handle_unknown)
        self.target_encoder = TargetEncoder(cols=self.target_encoder_features,
                                            return_df=True,
                                            handle_unknown=handle_unknown)
    def encode_cat_col(self):  # TODO pandas to numpy
        """Ordinal-encode self.categ_col in place; encoder output is a plain ndarray."""
        from category_encoders import OrdinalEncoder

        encoder = OrdinalEncoder(return_df=False)
        encoder.fit(self.categ_col)
        self.categ_col = encoder.transform(self.categ_col)

        # DEBUG
        print(self.DS)
        print(self.categ_col)
Exemplo n.º 9
0
 def models_to_compare(self) -> Dict[ModelName, Dict]:
     """Map each candidate model to its classification/regression pipelines
     (plus optional fit params for LightGBM's categorical features).

     CONSISTENCY FIX: sklearn Pipeline step names must be plain strings; the
     ordinal-encoder steps previously passed the enum member itself
     (ModelName.ORDINAL_ENCODER) while every other step used `.value`.
     """
     lightgbm_step_categorical_features_params = f"{ModelName.LIGHTGBM.value}__{CATEGORICAL_FEATURE}"
     return {
         ModelName.CATBOOST: {
             TaskName.CLASSIFICATION:
             Pipeline([(ModelName.CATBOOST.value,
                        CatBoostClassifier(
                            cat_features=self.categorical_features_indices,
                            verbose=0))]),
             TaskName.REGRESSION:
             Pipeline([(ModelName.CATBOOST.value,
                        CatBoostRegressor(
                            cat_features=self.categorical_features_indices,
                            verbose=0))])
         },
         ModelName.LIGHTGBM: {
             TaskName.CLASSIFICATION:
             Pipeline([(ModelName.ORDINAL_ENCODER.value, OrdinalEncoder()),
                       (ModelName.LIGHTGBM.value, LGBMClassifier())]),
             TaskName.REGRESSION:
             Pipeline([(ModelName.ORDINAL_ENCODER.value, OrdinalEncoder()),
                       (ModelName.LIGHTGBM.value, LGBMRegressor())]),
             FIT_PARAMS: {
                 lightgbm_step_categorical_features_params:
                 self.categorical_features
             }
         },
         ModelName.LIGHTGBM_WITH_CATBOOST_ENCODER: {
             TaskName.CLASSIFICATION:
             Pipeline([(ModelName.CATBOOST_ENCODER.value,
                        CatBoostEncoder()),
                       (ModelName.LIGHTGBM.value, LGBMClassifier())]),
             TaskName.REGRESSION:
             Pipeline([(ModelName.CATBOOST_ENCODER.value,
                        CatBoostEncoder()),
                       (ModelName.LIGHTGBM.value, LGBMRegressor())])
         },
         ModelName.XGBOOST_WITH_CATBOOST_ENCODER: {
             TaskName.CLASSIFICATION:
             Pipeline([(ModelName.CATBOOST_ENCODER.value,
                        CatBoostEncoder()),
                       (ModelName.XGBOOST.value, XGBClassifier())]),
             TaskName.REGRESSION:
             Pipeline([(ModelName.CATBOOST_ENCODER.value,
                        CatBoostEncoder()),
                       (ModelName.XGBOOST.value, XGBRegressor())])
         },
         ModelName.XGBOOST: {
             TaskName.CLASSIFICATION:
             Pipeline([(ModelName.ORDINAL_ENCODER.value, OrdinalEncoder()),
                       (ModelName.XGBOOST.value, XGBClassifier())]),
             TaskName.REGRESSION:
             Pipeline([(ModelName.ORDINAL_ENCODER.value, OrdinalEncoder()),
                       (ModelName.XGBOOST.value, XGBRegressor())])
         }
     }
Exemplo n.º 10
0
class df_OrdinalEncoder(TransformerMixin):
    """Wrapper around category_encoders' OrdinalEncoder that always returns
    a pandas DataFrame carrying the input's index and column names."""

    def __init__(self, handle_unknown='ignore'):
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        """Fit a fresh OrdinalEncoder on X; y is ignored."""
        self.enc = OrdinalEncoder(handle_unknown=self.handle_unknown)
        self.enc.fit(X)
        return self

    def transform(self, X):
        """Encode X and rewrap the result as a DataFrame aligned with X."""
        encoded = self.enc.transform(X)
        return pd.DataFrame(data=encoded, index=X.index, columns=X.columns)
Exemplo n.º 11
0
def encode_result(df_orig):
    """Return a deep copy of df_orig with an 'ordinal_result' column that
    maps the match outcome to fixed codes (hwin=1, draw=2, awin=3)."""
    df = df_orig.copy(deep=True)
    ord_enc = OrdinalEncoder(mapping=[{
        'col': 'result',
        'mapping': {
            'hwin': 1,
            'draw': 2,
            'awin': 3
        }
    }])
    df['ordinal_result'] = ord_enc.fit_transform(df[['result']])
    return df
Exemplo n.º 12
0
    def xgb_reg(X_train, y_train, X_test, y_test):
        """
        Simple pipeline Baseline model for using XGB Regressor including a 
        ordinal encoder, standard scaler, simple imputer. This function returns 
        Mean baseline, R^2, and RMSE. If R^2 is negative this means mean 
        baseline is a more effective model
        """
        # BUG FIX: pd.Series.append was removed in pandas 2.0 — concatenate
        # the two target series with pd.concat instead.
        s3 = pd.concat([pd.Series(y_train), pd.Series(y_test)])
        mean = np.mean(s3)

        model = make_pipeline(
            OrdinalEncoder(), StandardScaler(),
            SimpleImputer(strategy='median'),
            XGBRegressor(n_estimators=100, n_jobs=-1, max_depth=10))

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        print(f'Mean baseline of target = {mean}')
        print(f'Gradient Boosting R^2 = {r2}')
        print(f'Gradient Boosting RMSE = {rmse}')
        return
Exemplo n.º 13
0
def make_gridsearch(clf, param_grid, params):
    """Build a 3-fold GridSearchCV over an encode -> scale -> classify pipeline."""
    steps = [('encoder', OrdinalEncoder()),
             ('scaler', StandardScaler()),
             ('clf', clf)]
    pipe = Pipeline(steps)
    pipe.set_params(**params)
    return GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3)
Exemplo n.º 14
0
    def _encode_categories(self):
        """
        This private method stands for encoding categorical variables. Label encoding used for ordinal categories and
        one-hot encoding used for nominal categories.
        """

        logging.info(f'#{self._index()} - Encoding categorical columns...')
        # get column names for categorical and numerical features
        categorical_vars = self.X.select_dtypes(include='object').columns
        numerical_vars = self.X.columns.difference(categorical_vars)

        # quality/condition columns share the ordered NA..Ex scale below
        ordinal = pd.Index([
            'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
            'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC'
        ])
        nominal = categorical_vars.difference(ordinal)

        # shared code scale: NA (absent) through Ex (excellent)
        standard_mapping = {
            'NA': 0,
            'Po': 1,
            'Fa': 2,
            'TA': 3,
            'Gd': 4,
            'Ex': 5
        }
        mapping_for_ordinals = [{
            'col': column,
            'mapping': standard_mapping
        } for column in ordinal]

        x_num = self.X[numerical_vars]
        x_test_num = self.X_test[numerical_vars]

        # one hot encode categorical columns
        one_hot_encoder = OneHotEncoder(use_cat_names=True)
        label_encoder = OrdinalEncoder(drop_invariant=True,
                                       mapping=mapping_for_ordinals,
                                       handle_unknown='error')

        # fit on the training frame, reuse the fitted encoders on X_test
        x_cat_nom = one_hot_encoder.fit_transform(self.X[nominal])
        x_cat_ord = label_encoder.fit_transform(self.X[ordinal])
        x_test_cat_nom = one_hot_encoder.transform(self.X_test[nominal])
        x_test_cat_ord = label_encoder.transform(self.X_test[ordinal])

        # reassemble: numerics + ordinal codes + one-hot columns
        self.X = x_num.join(x_cat_ord).join(x_cat_nom)
        self.X_test = x_test_num.join(x_test_cat_ord).join(x_test_cat_nom)
        # NOTE(review): this logs `self._step_index` while the entry log above
        # calls `self._index()` — confirm the asymmetry is intended (calling
        # _index() twice might advance a step counter).
        logging.info(f'#{self._step_index} - DONE!')
    def _create_feature(cls, conf) -> pd.DataFrame:
        """Build LDA latent-vector features for the (cls._col1, cls._col2) pair.

        NOTE(review): first parameter is `cls`, suggesting a classmethod — the
        decorator is outside this view; confirm.
        """
        df = Application.get_df(conf)[[
            'SK_ID_CURR', 'TARGET', cls._col1, cls._col2
        ]]
        # Ordinal codes serve both as lda() input and, below, as the row index
        # into the latent-vector table.
        df = OrdinalEncoder(cols=[cls._col1, cls._col2]).fit_transform(df)
        latent_vectors = lda(cls._n_components, df, cls._col1, cls._col2)

        # One column per latent dimension, named "<col1>_LDA_<col2>_dim<i>".
        dic = defaultdict(list)
        for v in latent_vectors:
            for i, s in enumerate(v):
                dic[f"{cls._col1}_LDA_{cls._col2}_dim{i}"].append(s)
        df_latent_vectors = pd.DataFrame(dic)
        # Join the latent vectors by the encoded _col1 value, then drop the
        # raw columns and the target before returning the feature frame.
        return df.merge(df_latent_vectors,
                        how="left",
                        left_on=cls._col1,
                        right_index=True).drop(
                            ['TARGET', cls._col1, cls._col2], axis=1)
Exemplo n.º 16
0
    def __init__(self, df_train: pd.DataFrame, df_valid: pd.DataFrame, df_test: pd.DataFrame, use_columns, label_column):
        """Encode the categorical columns, turn each split into long tensors,
        and record per-field cardinalities (bumped by one for the zero code)."""
        encoder = OrdinalEncoder(cols=use_columns, handle_unknown='impute').fit(df_train)
        train_encoded = encoder.transform(df_train).astype('int64')
        valid_encoded = encoder.transform(df_valid).astype('int64')
        test_encoded = encoder.transform(df_test).astype('int64')

        self.train_X = torch.from_numpy(train_encoded[use_columns].values).long()
        self.train_y = df_train[label_column].values
        self.valid_X = torch.from_numpy(valid_encoded[use_columns].values).long()
        self.valid_y = df_valid[label_column].values
        self.test_X = torch.from_numpy(test_encoded[use_columns].values).long()
        self.test_y = df_test[label_column].values

        # largest code per field, each incremented via add_one
        self.field_dims = [add_one(dim) for dim in list(train_encoded[use_columns].max())]

        self.data_num = self.train_X.size()[0]
Exemplo n.º 17
0
def get_train_simple_pre_pipeline():
    """Ordinal-encode the property/location columns, then run the column pipeline."""
    steps = [
        ('ordinal_encoder',
         OrdinalEncoder(cols=['tipodepropiedad', 'provincia', 'ciudad'])),
        ('columns_pipe', get_columns_pipeline()),
    ]
    return Pipeline(steps=steps)
def predict(submit,input1, input2, input3, input4, input5, input6, input7, input8, input9, input10, input11, input12, input13, input14, input15, input16, input17, input18, input19, input20):
    """Fit the pipeline on the module-level X_train/y_train and predict the
    class for one team described by the twenty stat inputs.

    NOTE(review): X_train / y_train are module-level globals — confirm they
    are defined before this is called.
    """
    import pandas as pd  # local import: predict needs a one-row tabular frame

    team = {'CONF':input1, 'G':float(input2), 'ADJOE':float(input3), 'ADJDE':float(input4), 'BARTHAG':float(input5), 
        'EFG_O':float(input6), 'EFG_D':float(input7), 'TOR':float(input8), 'TORD':float(input9), 'ORB':float(input10),
        'DRB':float(input11), 'FTR':float(input12), 'FTRD':float(input13), '2P_O':float(input14), '2P_D':float(input15),
        '3P_O':float(input16), '3P_D':float(input17),'ADJ_T':float(input18), 'WAB':float(input19), 'SEED':float(input20)}
    # BUG FIX: objective was misspelled 'multi:sotmax'; the bogus 'critereon'
    # kwarg (not an XGBClassifier parameter) was dropped.
    model = make_pipeline(OrdinalEncoder(), XGBClassifier(max_depth=5, learning_rate=0.001, n_estimators=500, n_jobs=-1, objective='multi:softmax', eval_metric='merror', num_class=8))
    model.fit(X_train, y_train)
    # BUG FIX: predict expects tabular input, not a list containing a dict.
    return model.predict(pd.DataFrame([team]))
Exemplo n.º 19
0
 def _cat_encoder(self, df):
     """Fill missing values with 0 and ordinal-encode df, fitting a new
     encoder on the first (untrained) pass and reusing it afterwards."""
     df = df.fillna(0)
     from category_encoders import OrdinalEncoder
     if self.is_trained == False:
         # first call: fit and remember the encoder for later passes
         self.catEncoder.append(OrdinalEncoder().fit(df))
     return self.catEncoder[0].transform(df)
Exemplo n.º 20
0
def main():
    """Feature-engineering run: load, clean, ordinal-encode and save the
    train/test feature CSVs, logging everything to a Neptune experiment."""
    # NOTE(review): 'experimnent' typo lives in a runtime log string — left as-is.
    print('started experimnent')
    with neptune.create_experiment(
            name='feature engineering',
            tags=['feature-extraction', FEATURE_NAME],
            upload_source_files=get_filepaths(),
            properties={'feature_version': FEATURE_NAME}):
        print('loading data')
        # train carries the target column ('isFraud'); test does not
        train = load_and_merge(RAW_DATA_PATH, 'train',
                               NROWS)[ID_COLS + V1_COLS + ['isFraud']]
        test = load_and_merge(RAW_DATA_PATH, 'test', NROWS)[ID_COLS + V1_COLS]

        categorical_cols = set(V1_CAT_COLS)
        print('cleaning data')
        email_cols = ['P_emaildomain', 'R_emaildomain']
        train, new_email_cols = clean_email(train, email_cols)
        test, _ = clean_email(test, email_cols)

        # the cleaned email columns replace the raw ones in the categorical set
        categorical_cols.update(new_email_cols)
        for col in email_cols:
            categorical_cols.remove(col)
        categorical_cols = list(categorical_cols)
        neptune.set_property('categorical_columns', str(categorical_cols))

        print('encoding categoricals')
        # fit on train only, then apply the same mapping to both splits
        encoder = OrdinalEncoder(cols=categorical_cols).fit(
            train[ID_COLS + categorical_cols])
        train[ID_COLS + categorical_cols] = encoder.transform(
            train[ID_COLS + categorical_cols])
        test[ID_COLS + categorical_cols] = encoder.transform(
            test[ID_COLS + categorical_cols])

        # persist features and record the data version alongside the run
        train_features_path = os.path.join(
            FEATURES_DATA_PATH, 'train_features_{}.csv'.format(FEATURE_NAME))
        print('saving train to {}'.format(train_features_path))
        train.to_csv(train_features_path, index=None)
        log_data_version(train_features_path, prefix='train_features_')

        test_features_path = os.path.join(
            FEATURES_DATA_PATH, 'test_features_{}.csv'.format(FEATURE_NAME))
        print('saving test to {}'.format(test_features_path))
        test.to_csv(test_features_path, index=None)
        log_data_version(test_features_path, prefix='test_features_')
def get_categorical_pipeline():
    """Pipeline that encodes the categorical feature groups via a ColumnTransformer."""
    transformers = [
        ('binary', OrdinalEncoder(), 'ZoomInfo_Global_HQ_Country'),
        ('catboost', OrdinalEncoder(handle_unknown='value', handle_missing='value'), 'Adjusted_Industry'),
        ('ordinal', OrdinalEncoder(mapping=MAP_ORDINAL, handle_unknown='value'), ORDINAL_FEATURES),
    ]
    return Pipeline([
        ('cat_ct', ColumnTransformer(transformers)),
    ])
Exemplo n.º 22
0
def process_data(X_train, X_test, X_val, X):
    """Fit an encode+impute pipeline on X_train and transform all three splits.

    Returns the transformed splits, an index->column-name mapping, the column
    names, and the fitted pipeline itself.
    """
    processor = make_pipeline(OrdinalEncoder(), SimpleImputer())
    X_train = processor.fit_transform(X_train)
    X_val = processor.transform(X_val)
    X_test = processor.transform(X_test)
    column_names = list(X.columns)
    # position i of the encoded matrix corresponds to original column i
    features = dict(enumerate(column_names))
    return X_train, X_test, X_val, features, column_names, processor
Exemplo n.º 23
0
def out_of_folds_predict(X, y):
    """Return out-of-fold predictions for X using 4-fold CV (stratified when
    y contains at least two positives).

    BUG FIX: np.float was removed in NumPy 1.24 — cast with the builtin
    float instead.  The dead trailing `pipe = build_pipe()` (result unused)
    was also removed.
    """
    callbacks = [
        EarlyStopping(
            # Stop training when loss is no longer improving
            monitor="loss",
            # "no longer improving" being defined as "no better than 1e-2 less"
            min_delta=1e-5,
            # "no longer improving" being further defined as "for at least 2 epochs"
            patience=2,
            verbose=0,
        )
    ]

    preds = np.zeros(X.shape[0])

    n_splits = 4

    # StratifiedKFold needs enough positives per fold; fall back to KFold
    # when y has fewer than two positives.
    if y.sum() < 2:
        kfold = KFold(n_splits=n_splits)
    else:
        kfold = StratifiedKFold(n_splits=n_splits)

    for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
        print(f'Split {i+1} of {n_splits}...')
        pipe = build_pipe()

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # fit the encoder on the training fold only, then apply to the test fold
        encoder = OrdinalEncoder()
        X_train = encoder.fit_transform(X_train, y_train).astype(float)

        pipe.fit(X_train, y_train, epochs=20, callbacks=callbacks, verbose=0)

        X_test = encoder.transform(X_test).astype(float)
        pipe.evaluate(X_test, y_test, verbose=1)

        preds[test_index] = pipe.predict(X_test).flatten()

    return preds
Exemplo n.º 24
0
 def assign_cat_scaler(self,):
     """Pick the categorical encoder from self.cat_info: no method -> Empty(),
     otherwise Ordinal/OneHot; any other method name raises NotImplementedError."""
     method = self.cat_info.get("method", None)
     cols = self.cat_info.get("cols", [])
     self.cat_method = method
     self.cat_cols = cols
     if method is None:
         self.cat_encoder = Empty()
     elif method == "OrdinalEncoder":
         self.cat_encoder = OrdinalEncoder(cols=cols)
     elif method == "OneHotEncoder":
         self.cat_encoder = OneHotEncoder(cols=cols)
     else:
         raise NotImplementedError("아직 나머지 구현 안함")
Exemplo n.º 25
0
def create_label(df, test_df, topic):
    """Build TF-IDF + truncated-SVD title features and per-column ordinal
    labels, returning everything in one dict keyed by '<kind>_<topic>[...]'.

    NOTE: mutates test_df['title'] in place via text_process.
    """
    result_dict = {}
    feature_df = df[['title']].copy()
    # every column except identifiers/title/image is treated as a label column
    label_df = df.drop(columns=['itemid', 'title', 'image_path']).copy()

    # TF-IDF on preprocessed titles, reduced to 500 dims with truncated SVD
    feature_df['title'] = feature_df['title'].apply(lambda x: text_process(x))
    feature_array = feature_df['title'].values.tolist()
    feature_encoder = TfidfVectorizer()
    feature_encoder.fit(feature_array)
    feature_attr = feature_encoder.transform(feature_array)
    feature_decomposer = TruncatedSVD(500)
    feature_decomposer.fit(feature_attr)
    feature_attr = feature_decomposer.transform(feature_attr)

    # apply the train-fitted vectorizer/decomposer to the test titles
    test_df['title'] = test_df['title'].apply(lambda x: text_process(x))
    test_array = test_df['title'].values.tolist()
    test_attr = feature_encoder.transform(test_array)
    test_attr = feature_decomposer.transform(test_attr)

    train_itemid = df['itemid']
    test_itemid = test_df['itemid']

    result_dict['itemid_train_{}'.format(topic)] = train_itemid
    result_dict['itemid_test_{}'.format(topic)] = test_itemid
    result_dict['X_train_{}'.format(topic)] = feature_attr
    result_dict['X_encoder_{}'.format(topic)] = feature_encoder
    result_dict['X_decomposer_{}'.format(topic)] = feature_decomposer
    result_dict['X_test_{}'.format(topic)] = test_attr

    # one ordinal encoder per label column
    # NOTE(review): handle_unknown='impute' is deprecated/removed in newer
    # category_encoders releases — confirm the pinned version supports it.
    for column in label_df.columns:
        label_encoder = OrdinalEncoder(cols=[column], handle_unknown='impute')
        label_encoder.fit(label_df[[column]])
        label_attr = label_encoder.transform(label_df[[column]])

        result_dict['Y_train_{}_{}'.format(topic, column)] = label_attr
        result_dict['Y_encoder_{}_{}'.format(topic, column)] = label_encoder
        result_dict['Y_colname_{}_{}'.format(topic,
                                             column)] = label_attr.columns

    return result_dict
    def convert_meta_to_dict(self):
        """Encode product metadata columns to ints and return
        (meta_dict, ordered_meta_counts_dict): per-product metadata value
        tuples keyed by word id, and per-column cardinalities for embeddings."""
        meta = self.meta[['productid'] + self.META_COLS].copy()

        # Encode to int
        encoder = OrdinalEncoder(cols=self.META_COLS)
        meta = encoder.fit_transform(meta)
        save_model(encoder, '{}/encoder'.format(MODEL_PATH))

        # one collection of encoded metadata values per row
        meta['values'] = meta.apply(get_dict_values,
                                    args=(self.META_COLS, ),
                                    axis=1)
        meta_dict = meta.set_index('productid')['values'].to_dict()
        # re-key from product id to the internal word id
        meta_dict = {self.word2id[k]: v for k, v in meta_dict.items()}

        meta_counts_dict = (
            meta[self.META_COLS].max() +
            1).to_dict()  # Need to +1 to account for index starting from zero
        # Without +1 the embedding size will be insufficient by 1
        ordered_meta_counts_dict = OrderedDict()
        # 'product' has no entry in meta_counts_dict, so it defaults to 0
        for col in ['product'] + self.META_COLS:
            ordered_meta_counts_dict[col] = meta_counts_dict.get(col, 0)

        return meta_dict, ordered_meta_counts_dict
def encode_ordinal_df(dataframe, fit=False):
    """
    Encode ordinal features, preserving the notion of order and dropping invariant features
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with pre-processed data (i.e. renamed features), ordinal features only
        fit: boolean
            Indicates if we should train or load an encoder
    Returns
        dataframe: pd.DataFrame
            Dataframe with encoded data
    """
    # BUG FIX: the docstring was indented 3 spaces while the body used 4,
    # which is an IndentationError — the whole body is now uniformly indented.
    # Train or load an encoder
    if fit:
        encoder = OrdinalEncoder(cols=dataframe.columns.values, drop_invariant=True)
        encoder.fit(dataframe)

        pickle_obj(encoder, 'ordinal_encoder')
    else:
        encoder = unpickle_obj('ordinal_encoder')

    # transform data
    return encoder.transform(dataframe)
Exemplo n.º 28
0
    def __init__(self, file_):
        """Load the SNP dataset from *file_*, ordinal-encode every SNP column
        and fit a logistic-regression obesity classifier (accuracy printed)."""
        data_set = pd.read_csv(file_, index_col=0)
        target = np.array(data_set["is_obesity"]).reshape(-1, 1).ravel()
        del data_set['IID']
        del data_set['log_BMI']
        del data_set['is_obesity']
        # BUG FIX: this SNP list was split across two physical lines inside a
        # single double-quoted string (a SyntaxError); joined here via
        # parenthesized implicit string concatenation.
        columns = ("rs12620338,rs7559271,rs2234675,rs6436302,rs12053273,rs1430657,rs16863576,rs7589708,rs4674639,rs10932949,rs12995399,rs9768991,rs7809325,rs17879130,rs6964358,rs4724821,rs2410612,rs3816246,rs61734430,rs2651364,rs7963401,rs2733682,rs2651374,rs7132461,rs10771951,rs4931631,rs7299495,rs10844219,rs7311935,rs7963397,rs7295095,rs10844227,rs7977101,rs7966856,rs7967302,rs2088656,rs4931635,rs904582,rs10771966,rs6488068,rs7962152,rs4135048,rs4135060,rs3751209,rs140436257,rs4135113,rs4135126,rs2888805,rs2041794,rs2908792,rs12930428,rs2160290,rs4784311,rs13332406,rs76818213,rs1131220,rs3809634,rs3095631,rs17194040,rs1861556,rs16952304,rs7193898,rs1362572,rs12599436,rs1946155,rs4784320,rs12443767,rs3213758,rs17214955,rs8050354,rs139974543,rs2111119,rs2302677,rs9934800,rs5005161,rs7205986,rs1421084,rs7203521,rs6499640,rs4396532,rs1861868,rs1075440,rs13334933,rs9930333,rs9939973,rs9940128,rs1421085,rs16952520,rs1558902,rs10852521,rs1121980,rs7193144,rs17817449,rs11075987,rs8050136,rs9935401,rs9936385,rs9926289,rs76804286,rs9939609,rs9941349,rs7190492,rs9930506,rs9922708,rs9922619,rs8044769,rs12149832,rs10852523,rs3826169,rs10521307,rs17819033,rs7205009,rs2160481,rs4784329,rs7191718,rs9934504,rs9929152,rs12232391,rs9924072,rs12933996,rs17224310,rs17823199,rs7194907,rs6499662,rs12596210,rs8046658,rs7200972,rs9925908,rs12931859,rs7194243,rs4784351,rs2540781,rs856973,rs2003583,rs16953002,rs708258,rs1008400,rs11646512,rs11863548,rs2665271,rs2689264,rs8053279,rs8063722,rs879679,rs1610237,rs8054310,rs2542674,rs2689258,rs1033046,rs2010410,rs17835974,rs4783830,rs8060235,rs16953241,rs16953243,rs7200222,rs8049962,rs10521300,rs16953283,rs1126960,rs1868689,rs17176417,rs1079368,rs1004299,rs1004930,rs12930159,rs729633,rs8056104,rs2388632,rs7193399,rs11076030,rs12932839,rs7191827,rs8050506,rs11639567,rs17257349,rs7203944,rs1420303,rs1530793,rs4784379,rs7189231,rs9972796,rs1420285,rs4784390,rs12931301,rs12447674,rs9921518,rs4783845,rs17200070,rs11640012,rs12929998,rs733017,rs716083,"
                   "rs751214,rs1362437,rs749622,rs8059628,rs1211435,rs1201336,rs1186817,rs1874025,rs8045161,rs8051442,rs1882591,rs1151277,rs11861365,rs2388773,rs1493897,rs8044756,rs1861532,rs11639521,rs17205999,rs16953856,rs1420562,rs2388807,rs1420553,rs1861538,rs4784415,rs12444481,rs1548912,rs7499390,rs4622506,rs4257585,rs4440156,rs7198507,rs9924618,rs11076057,rs4591143,rs6499720,rs4435250,rs4383140,rs4784429,rs4555155,rs9932117,rs11076060,rs12447300,rs13336114,rs1133611,rs11076063,rs11076064,rs8060082,rs4238773,rs12927600,rs4238775,rs13331158,rs4783863,rs8055853,rs4784467,rs6499743,rs16954195,rs4784474,rs1352191,rs7197624,rs11076070,rs8050248,rs1825730,rs16954308,rs11076076,rs4270172,rs8060698,rs12917822,rs8064192,rs1486735,rs1552426,rs7187108,rs8054239,rs11076081,rs2200537,rs9922031,rs1486733,rs12934198,rs2588996,rs2171262,rs17291845,rs7204268,rs2397376,rs9928598,rs12050985,rs4784510,rs1437449,rs16954658,rs991057,rs30922,rs30923,rs11860394,rs31045,rs31046,rs6499755,rs893263,rs31064,rs4784523,rs31103,rs31104,rs360774,rs30905,rs12918370,rs7199709,rs1370385,rs9926841,rs1610101,rs1420227,rs8045690,rs2540707,rs2576542,rs11643666,rs7184310,rs9936365,rs837537,rs7187242,rs7187258,rs11859163,rs17301608,rs2287074,rs7201,rs837550,rs2287072,rs112426189,rs3744374,rs12602590,rs11654604,rs200805689,rs117651561,rs79742527,rs143040759")
        preprocessor = ColumnTransformer(transformers=[
            ('encoder', OrdinalEncoder(), columns.split(',')),
        ])

        pipe = Pipeline([('preprocessor', preprocessor),
                         ('model', LogisticRegression())])
        self.classifier = pipe.fit(data_set, target)
        print('LogisticRegression score(accuracy) for ' + file_ + ' : ' + str(self.classifier.score(data_set, target)))
Exemplo n.º 29
0
def get_pipeline(est, is_tree, is_regressor, params):
    """Return (name, pipeline, params) tailored to the estimator kind.

    Dummy estimators get a numeric passthrough; tree models get date +
    ordinal encoding with NaN fill; all others get a scaled-numeric /
    sparse-categorical feature union followed by selection and SVD.
    The *params* grid dict is mutated in place and also returned.
    """
    name = model_name(est)
    if name.startswith('Dummy'):
        ppl = Pipeline([
                       ('ft', FunctionTransformer()), 
                       ('mo', est)
                      ])
        params['ft__func'] = [lambda x:x[numeric_cols(x)]]
        params['ft__validate'] = [False]
    elif is_tree:
        ppl = Pipeline([
                       ('da', DateEncoder()),
                       ('du', OrdinalEncoder()),
                       ('ft', FunctionTransformer()),
                       ('se', SelectKBest2()),
                       ('mo', est)
                      ])
        params['da__ascategory'] = [False]
        params['du__drop_invariant'] = [True]
        params['ft__func'] = [lambda x:x.fillna(-999)]
        params['ft__validate'] = [False]
        params['se__score_func'] = get_selector(is_regressor, is_tree)
        # BUG FIX: the grid listed 1000 twice, wasting one redundant CV fit;
        # it now matches the non-tree branch's grid below.
        params['se__k'] = [0.2, 0.5, 0.8, 1000]
    else:
        ppl = Pipeline([
                ('da', DateEncoder()),
                ('en', FeatureUnion([
                       ('nu', Pipeline([('ft', FunctionTransformer()), ('in', Imputer()), ('sc', TransformerWrap(StandardScaler()))])),
                       ('ca', Pipeline([('ft', FunctionTransformer()), ('sc', SparseCatEncoder())]))
                       ])),
                ('fu', FeatureUnion([('se', SelectKBest2()), ('dr', TruncatedSVD2())])),
                ('mo', est)
                ])

        params['en__nu__ft__func'] = [lambda x:x[numeric_cols(x)]]
        params['en__nu__ft__validate'] = [False]
        params['en__ca__ft__func'] = [lambda x:x[object_cols(x)]]
        params['en__ca__ft__validate'] = [False]
        params['fu__se__score_func'] = get_selector(is_regressor, is_tree)
        params['fu__se__k'] = [0.2, 0.5, 0.8, 1000]
        params['fu__dr__k'] = [0.2, 0.5, 0.8, 1000]

    return name, ppl, params
Exemplo n.º 30
0
    def __init__(self, categorical_features, numerical_features, data, **kwargs):
        """Record the feature lists, set up encoder/scalers, and build the network."""
        self.categorical_features = categorical_features
        self.numerical_features = numerical_features
        self.data = data

        # preprocessing: ordinal codes for categoricals, standard scaling
        # for the inputs and (separately) for the target
        self.encoder = OrdinalEncoder(
            cols=self.categorical_features, return_df=True, handle_unknown="value"
        )
        self.scaler = StandardScaler()
        self.target_scaler = StandardScaler()

        # model artefacts populated later by build_full_network
        self.model = None
        self.inputs = []
        self.embeddings = []

        self.build_full_network(data, **kwargs)