class ClusteredXGBModel(BaseEstimator, KaggleSubmitMixin):
    """
    Combination of a cluster classifier and bagging on samples
    """
    _SPLITS = 5  # folds per cluster; assumed to match KFoldModel._SPLITS

    def __init__(self, n_estimators=500, split_feature='card6'):
        self.n_estimators = n_estimators
        self.classifier = None
        self.split_estimators = defaultdict(list)
        self.split_transformers = defaultdict(list)
        self.split_feature = split_feature
        self.fraud_transformer = FraudFeatureExtractor()
        self.estimators = []

    def fit(self, xtrain=None, ytrain=None):
        """
        Fit one KFold bag of XGBoost estimators per value of ``split_feature``

        :param xtrain: pandas data_frame containing training data
        :param ytrain: numpy array with model targets
        :return: ClusteredXGBModel object
        """
        xtrain = xtrain.copy()  # avoid mutating the caller's data_frame
        xtrain['target'] = ytrain

        for name, group in xtrain.groupby(self.split_feature):
            group_target = group['target'].values
            group_features = group.drop('target', axis=1)
            group_transformer = FraudFeatureExtractor()
            group_transformer.fit(group_features, group_target)
            group_features = group_transformer.transform(group_features)

            folds = KFold(n_splits=self._SPLITS, shuffle=True)
            for fold_n, (train_index, valid_index) in enumerate(folds.split(group_features)):
                x_train_, x_valid = group_features[train_index, :], group_features[valid_index, :]
                y_train_, y_valid = group_target[train_index], group_target[valid_index]
                eval_set = [(x_valid, y_valid)]
                estimator = XGBClassifier(n_estimators=1500,
                                          max_depth=9,
                                          learning_rate=0.048,
                                          subsample=0.85,
                                          colsample_bytree=0.85,
                                          reg_alpha=0.15,
                                          reg_lambda=0.85,
                                          n_jobs=-1)
                estimator.fit(x_train_, y_train_, eval_metric=["error", "logloss"],
                              early_stopping_rounds=100, eval_set=eval_set, verbose=True)
                self.split_estimators[name].append(estimator)
                self.split_transformers[name].append(group_transformer)
        return self
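
# Illustrative usage sketch (variable names assumed, not part of the original
# module): one bag of fitted estimators is kept per observed 'card6' value.
#
#     model = ClusteredXGBModel(split_feature='card6')
#     model.fit(xtrain, ytrain)
#     print({name: len(ests) for name, ests in model.split_estimators.items()})
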
class CatBoostModel(BaseEstimator, KaggleSubmitMixin):
    """
    Estimator using CatBoost
    """
    _SEED = 1

    def __init__(self):
        self.fraud_transformer = FraudFeatureExtractor()
        self.classifier = None

    def fit(self, xtrain=None, ytrain=None, xval=None, yval=None):
        """
        Fit Catboost model

        :param xtrain: pandas data_frame for training
        :param ytrain: train target variable
        :param xval: pandas data_frame for model validation
        :param yval: target variable
        :return: CatBoostModel object
        """
        # If no validation data is specified, fall back to the training data
        # (truth-testing a DataFrame raises, so check for None explicitly)
        if xval is None or yval is None:
            xval = xtrain
            yval = ytrain

        self.fraud_transformer.fit(xtrain, ytrain)
        xtrain = self.fraud_transformer.transform(xtrain)
        xval = self.fraud_transformer.transform(xval)

        # count the binary (max == 1) columns; assumes the transformer places
        # the one-hot encoded categorical columns first
        nr_cat_features = int(numpy.sum(xtrain.max(axis=0) == 1))
        cat_features = list(range(nr_cat_features))
        train_data = Pool(data=xtrain,
                          label=ytrain,
                          cat_features=cat_features)
        valid_data = Pool(data=xval,
                          label=yval,
                          cat_features=cat_features)
        params = {'loss_function': 'Logloss',
                  'eval_metric': 'AUC',
                  'cat_features': cat_features,
                  'iterations': 2000,
                  'verbose': 10,
                  'max_depth': 7,
                  'random_seed': self._SEED,
                  'od_type': "Iter",
                  'od_wait': 100,
                  }
        self.classifier = CatBoostClassifier(**params)
        self.classifier.fit(train_data, eval_set=valid_data)
        return self

    def predict(self, xtest=None):
        """
        Apply trained classifier on test data

        :param xtest:  pandas data_frame for test data
        :return: array of predicted probability values
        """
        xtest = self.fraud_transformer.transform(xtest)
        return self.classifier.predict_proba(xtest)
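
# Illustrative usage sketch (variable names assumed): ``predict`` returns
# CatBoost's two-column probability array, so column 1 is the fraud probability.
#
#     model = CatBoostModel().fit(xtrain, ytrain, xval, yval)
#     fraud_probability = model.predict(xtest)[:, 1]
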
class XGBModel(BaseEstimator, KaggleSubmitMixin):
    """
    Estimator using XGBoost
    """

    def __init__(self):
        self.fraud_transformer = FraudFeatureExtractor()
        self.classifier = None

    def fit(self, xtrain=None, ytrain=None, xval=None, yval=None):
        """
        Fit XGBoost-based model

        :param xtrain: pandas data_frame for training
        :param ytrain: train target variable
        :param xval: pandas data_frame for model validation
        :param yval: target variable
        :return: XGBModel object
        """
        # If no validation data is specified, fall back to copies of the
        # training data (truth-testing a DataFrame raises, so check for None)
        if xval is None or yval is None:
            xval = xtrain.copy()
            yval = ytrain.copy()

        self.fraud_transformer.fit(xtrain, ytrain)
        xtrain = self.fraud_transformer.transform(xtrain)
        xval = self.fraud_transformer.transform(xval)
        eval_set = [(xval, yval)]
        self.classifier = XGBClassifier(n_estimators=2000,
                                        max_depth=9,
                                        learning_rate=0.048,
                                        subsample=0.9,
                                        colsample_bytree=0.9,
                                        reg_alpha=0.5,
                                        reg_lambda=0.5,
                                        n_jobs=-1)
        self.classifier.fit(xtrain, ytrain, eval_metric=["error", "logloss"],
                            early_stopping_rounds=100, eval_set=eval_set, verbose=True)
        return self

    def predict(self, xtest=None):
        """
        Apply trained classifier on test data

        :param xtest: pandas data_frame for test data
        :return: array of predicted probability values
        """
        xtest = self.fraud_transformer.transform(xtest)
        return self.classifier.predict_proba(xtest)
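
# Illustrative usage sketch (variable names assumed): without an explicit
# validation set the model early-stops against the training data itself.
#
#     model = XGBModel().fit(xtrain, ytrain)              # validates on train
#     model = XGBModel().fit(xtrain, ytrain, xval, yval)  # proper hold-out
#     fraud_probability = model.predict(xtest)[:, 1]
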
class BaggedRFModel(BaseEstimator, KaggleSubmitMixin):
    """
    Bagging model with Random Forest as base estimator
    """
    rf_params = {'n_estimators': 500,
                 'max_depth': 9,
                 'n_jobs': -1}

    def __init__(self, n_estimators_bagging=10):
        self.n_estimators_bagging = n_estimators_bagging
        self.fraud_transformer = FraudFeatureExtractor()
        self.classifier = None

    def fit(self, xtrain=None, ytrain=None):
        """
        Fit bagging model with Random Forest based estimator

        :param xtrain: pandas data_frame for training
        :param ytrain: train target variable
        :return: BaggedRFModel object
        """
        self.fraud_transformer.fit(xtrain, ytrain)
        xtrain = self.fraud_transformer.transform(xtrain)
        estimator = RandomForestClassifier(**self.rf_params)

        self.classifier = BaggingClassifier(base_estimator=estimator, n_estimators=self.n_estimators_bagging)
        self.classifier.fit(xtrain, ytrain)
        return self

    def predict(self, xtest=None):
        """
        Apply trained classifier on test data

        :param xtest: pandas data_frame for test data
        :return: array of predicted probability values

        """
        xtest = self.fraud_transformer.transform(xtest)
        return self.classifier.predict_proba(xtest)
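
# Illustrative usage sketch (variable names assumed): each of the
# n_estimators_bagging bags fits one RandomForest on a bootstrap sample.
#
#     model = BaggedRFModel(n_estimators_bagging=10).fit(xtrain, ytrain)
#     fraud_probability = model.predict(xtest)[:, 1]
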
class KFoldModel(BaseEstimator, KaggleSubmitMixin):
    """
    Apply models on different fold splits of the data

    """
    _SEED = 1
    _SPLITS = 5

    def __init__(self, n_estimators=500, split_feature='card6'):
        self.n_estimators = n_estimators
        self.classifier = None
        self.split_estimators = defaultdict(list)
        self.split_transformers = defaultdict(list)
        self.split_feature = split_feature
        self.fraud_transformer = FraudFeatureExtractor()
        self.estimators = []

    def fit(self, xtrain=None, ytrain=None):
        """
        Fits _SPLITS number of models

        :param xtrain: pandas data_frame for training
        :param ytrain: train target variable
        :return: KFoldModel object
        """
        self.fraud_transformer.fit(xtrain, ytrain)
        xtrain = self.fraud_transformer.transform(xtrain)

        # seed the shuffled folds with the class _SEED for reproducibility
        folds = KFold(n_splits=self._SPLITS, shuffle=True, random_state=self._SEED)
        for fold_n, (train_index, valid_index) in enumerate(folds.split(xtrain)):
            self.classifier = XGBClassifier(n_estimators=10,
                                            max_depth=9,
                                            learning_rate=0.048,
                                            subsample=0.85,
                                            colsample_bytree=0.85,
                                            reg_alpha=0.15,
                                            reg_lambda=0.85,
                                            n_jobs=-1)
            x_train_, x_valid = xtrain[train_index, :], xtrain[valid_index, :]
            y_train_, y_valid = ytrain[train_index], ytrain[valid_index]
            eval_set = [(x_valid, y_valid)]
            fit_params = {'eval_metric': ["error", "logloss"],
                          'early_stopping_rounds': 100,
                          'eval_set': eval_set,
                          'verbose': True}
            self.classifier.fit(x_train_, y_train_, **fit_params)
            self.estimators.append(self.classifier)
        return self

    def predict(self, xtest=None):
        """
        Apply trained classifier on test data

        :param xtest: Test data
        :return: array of predicted probability values

        """
        xtest = self.fraud_transformer.transform(xtest)
        pred = numpy.zeros((xtest.shape[0], 2))
        for clf in self.estimators:
            pred += clf.predict_proba(xtest)/self._SPLITS
        return pred
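
# Note on KFoldModel.predict: accumulating predict_proba / _SPLITS over the
# _SPLITS fold estimators is the arithmetic mean of their probabilities; e.g.
# five folds predicting 0.2, 0.3, 0.1, 0.4 and 0.5 for the positive class give
# pred[:, 1] == (0.2 + 0.3 + 0.1 + 0.4 + 0.5) / 5 == 0.3.
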
class EmbeddingBasedClassifier:
    """
    Model based on embedding layers for categorical features

    """

    EPOCHS = 10
    OPTIMIZER = optimizers.SGD(lr=0.03, nesterov=True)
    EMBEDDING_RATIO = 0.25
    MAX_EMBEDDING = 50
    BATCH_SIZE = 256
    VERBOSE = 0

    ES_PARAMS = {
        'monitor': 'val_loss',
        'mode': 'min',
        'verbose': 1,
        'patience': 20
    }

    MC_PARAMS = {
        'filepath': '../data/saved_models/best_model.h5',
        'monitor': 'val_loss',
        'mode': 'min',
        'verbose': 1,
        'save_best_only': True
    }

    def __init__(self):
        self.early_stopping = EarlyStopping(**self.ES_PARAMS)
        self.model_checkpoint = ModelCheckpoint(**self.MC_PARAMS)
        self.transformer = FraudFeatureExtractor(with_embedding=True)
        self.inputs = []
        self.layers = []
        self.classifier = None
        self.model = None
        self.feature_category_mapper = None
        self.feature_mode = None
        self.embedding_output_mapper = None
        self.nr_cat_features = None

    def embedding_mapper(self, data_frame=None):
        """
        Extract candidate embedding features and create mapper of number unique entries and mapper for output dimension

        :param data_frame: data frame
        :return: tuple (list of embedding features, dictionary with nr unique entries, output dimension)
        """
        cat_data_frame = TypeSelector(type='object').transform(data_frame)

        numeric_features = list(
            data_frame.dtypes[data_frame.dtypes == 'float64'].index)
        categoric_features = list(cat_data_frame.columns)
        embedding_features = list(
            cat_data_frame.loc[:, cat_data_frame.nunique() > 2].columns)

        # remove embedding features from categoric features
        categoric_features = set(categoric_features) - set(embedding_features)

        feature_mode = {
            feature: cat_data_frame[feature].nunique() + 1
            for feature in embedding_features
        }
        embedding_output_mapper = {
            feature: min(
                int(feature_mode[feature] * self.EMBEDDING_RATIO) + 2,
                self.MAX_EMBEDDING)
            for feature in embedding_features
        }

        feature_category_mapper = {
            'numeric_features': numeric_features,
            'categoric_features': list(categoric_features),
            'embedding_features': embedding_features
        }

        return feature_category_mapper, feature_mode, embedding_output_mapper
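
    # Worked example for the mappers above (values illustrative): a feature
    # with 30 unique entries gets feature_mode 30 + 1 == 31 and output
    # dimension min(int(31 * 0.25) + 2, MAX_EMBEDDING) == 9, while a feature
    # with 400 unique entries is capped at MAX_EMBEDDING == 50.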

    @staticmethod
    def preproc_embedding_layer(data_frame=None, feature_category_mapper=None):
        """
        Creates a per-feature ordinal mapping for the embedding layers and
        arranges the data in the input format the Keras model expects

        :param data_frame: pandas data frame
        :param feature_category_mapper: dict with numeric, categoric and embedding feature lists
        :return: list of arrays in the model's expected input order
        """
        unique_values = {
            feature: data_frame[feature].unique()
            for feature in feature_category_mapper['embedding_features']
        }
        value_mapper = {
            ek: {value: index for index, value in enumerate(unique_values[ek])}
            for ek in feature_category_mapper['embedding_features']
        }

        preproc_embedding = [
            data_frame[c].map(value_mapper[c]).values
            for c in feature_category_mapper['embedding_features']
        ]
        preproc_categorical = data_frame.loc[:, feature_category_mapper[
            'categoric_features']].values
        preproc_numerical = data_frame.loc[:, feature_category_mapper[
            'numeric_features']].values

        # assemble the Keras input list: one array per embedding feature,
        # then the categorical block, then the numerical block
        train_data = [*preproc_embedding, preproc_categorical, preproc_numerical]

        return train_data
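
    # Mini example of the mapping above (values illustrative): a column
    # ['visa', 'master', 'visa'] yields value_mapper {'visa': 0, 'master': 1}
    # and is fed to its embedding input as the array [0, 1, 0].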

    @staticmethod
    def create_embedding_layer(n_unique=None, output_dim=None, input_length=1):
        """
        Creates embedding layers

        :param n_unique: dimension of unique labels
        :param output_dim: dimension of embedding matrix
        :param input_length: default 1
        :return: input data info, embedding layer info
        """
        _input = Input(shape=(1, ))
        _embedding = Embedding(n_unique, output_dim,
                               input_length=input_length)(_input)
        _embedding = Reshape(target_shape=(output_dim, ))(_embedding)
        return _input, _embedding
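
    # Shape sketch for the helper above: the Input is (batch, 1), Embedding
    # expands it to (batch, 1, output_dim), and Reshape flattens it to
    # (batch, output_dim) so it can be concatenated with the dense branches.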

    def load_data(self):
        """
        Loads data

        :return:
        """

    def build_network(self,
                      feature_category_mapper=None,
                      feature_mode=None,
                      embedding_output_mapper=None):
        """
        Build up the network from the embedding, categorical and numeric input branches

        :param feature_category_mapper: dict with numeric, categoric and embedding feature lists
        :param feature_mode: nr of unique entries per embedding feature
        :param embedding_output_mapper: embedding output dimension per feature
        :return: compiled keras model
        """
        # add embedding layers
        if feature_category_mapper['embedding_features']:
            for feature in feature_category_mapper['embedding_features']:
                embedding_input, embedding_layer = self.create_embedding_layer(
                    feature_mode[feature], embedding_output_mapper[feature])
                self.inputs.append(embedding_input)
                self.layers.append(embedding_layer)

        # add layer for other categoric features that are not embedding features
        if feature_category_mapper['categoric_features']:
            categorical_input = Input(
                shape=(len(feature_category_mapper['categoric_features']), ))
            categoric_layer = Dense(50)(categorical_input)
            self.inputs.append(categorical_input)
            self.layers.append(categoric_layer)

        # add layer for other numeric features
        if feature_category_mapper['numeric_features']:
            numeric_input = Input(
                shape=(len(feature_category_mapper['numeric_features']), ))
            numeric_layer = Dense(50)(numeric_input)
            self.inputs.append(numeric_input)
            self.layers.append(numeric_layer)

        x = Concatenate()(self.layers)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.05)(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.1)(x)
        x = Dense(10, activation='relu')(x)
        output = Dense(1, activation='sigmoid')(x)

        model = Model(self.inputs, output)
        model.compile(loss='binary_crossentropy',
                      optimizer=self.OPTIMIZER,
                      metrics=['acc'])
        return model
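
    # The order of self.inputs (one input per embedding feature, then the
    # categorical block, then the numeric block) matches the list built by
    # preproc_embedding_layer, so Keras can pair them positionally in fit().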

    def fit(self, x_train=None, y_train=None, x_val=None, y_val=None):
        """
        Fit neural net model

        :param x_train: pandas data_frame
        :param y_train: training targets
        :param x_val: pandas data_frame
        :param y_val: validation targets
        :return:
        """
        self.transformer.fit(x_train, y_train)
        x_train = self.transformer.transform(x_train)
        x_val = self.transformer.transform(x_val)

        # the transform returns numpy, so count the categorical columns here
        # and cast back to a typed pandas frame below
        onehot_pipe = self.transformer.mapper.transformer_list[0][1][1]
        embedding_pipe = self.transformer.mapper.transformer_list[1][1][1]
        self.nr_cat_features = sum([len(elem) for elem in onehot_pipe.categories_]) + \
                               len(embedding_pipe.embedding_candidates)

        x_train, x_val = cast_to_type(x_train, x_val, self.nr_cat_features)

        self.feature_category_mapper, self.feature_mode, self.embedding_output_mapper = self.embedding_mapper(
            x_train)

        self.classifier = self.build_network(self.feature_category_mapper,
                                             self.feature_mode,
                                             self.embedding_output_mapper)

        x_train_preproc = self.preproc_embedding_layer(
            x_train, self.feature_category_mapper)
        x_val_preproc = self.preproc_embedding_layer(
            x_val, self.feature_category_mapper)

        params = {
            'x': x_train_preproc,
            'y': y_train,
            'validation_data': (x_val_preproc, y_val),
            'batch_size': self.BATCH_SIZE,
            'epochs': self.EPOCHS,
            'verbose': self.VERBOSE,
            'callbacks': [self.early_stopping, self.model_checkpoint]
        }

        self.classifier.fit(**params)

    def predict(self, x_test=None):
        """
        Loads the best checkpointed model and applies it to test data

        :param x_test: pandas data_frame for test data
        :return: array of predicted probability values
        """
        x_test = self.transformer.transform(x_test)
        x_test, _ = cast_to_type(x_test, x_test, self.nr_cat_features)

        x_test_preproc = self.preproc_embedding_layer(
            x_test, self.feature_category_mapper)
        saved_model = load_model(self.MC_PARAMS['filepath'])
        return saved_model.predict(x_test_preproc)

    def submit(self, xtest=None, trans_ids=None):
        """
        Saves prediction outputs in kaggle submission file format

        :param xtest: competition test data
        :param trans_ids: transaction ids
        :return: None; writes the submission file to ../data/output_data/submission.csv
        """
        test_predictions = self.predict(xtest)
        my_submission_file = pandas.DataFrame()
        my_submission_file['TransactionID'] = trans_ids
        my_submission_file['isFraud'] = test_predictions
        my_submission_file.to_csv('../data/output_data/submission.csv',
                                  index=False)
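
# Illustrative end-to-end sketch (variable names assumed):
#
#     model = EmbeddingBasedClassifier()
#     model.fit(x_train, y_train, x_val, y_val)
#     model.submit(x_test, trans_ids=test_frame['TransactionID'])
#     # writes ../data/output_data/submission.csv with TransactionID, isFraud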