Example #1
def test():
    df = pd.DataFrame(np.random.randint(0, 1000, size=(1000000, 10)),
                      columns=['c{}'.format(x) for x in range(10)])
    profiler = cProfile.Profile(subcalls=True, builtins=True, timeunit=.001)
    lbe = LabelEncoder(min_obs=100)
    profiler.enable()
    lbe.fit(df)
    X_new = lbe.transform(df)
    profiler.disable()
    profiler.print_stats()
Example #2
def test():
    df = pd.DataFrame(
        np.random.randint(0, N_CATEGORY, size=(N_OBS, N_FEATURE)),
        columns=["c{}".format(x) for x in range(N_FEATURE)],
    )
    profiler = cProfile.Profile(subcalls=True, builtins=True, timeunit=0.001)
    lbe = LabelEncoder(min_obs=100)
    profiler.enable()
    lbe.fit(df)
    _ = lbe.transform(df)
    profiler.disable()
    profiler.print_stats()
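Examples #1 and #2 profile the same encoder; the second just parameterizes the data size. Both assume imports and constants defined elsewhere in the file. A minimal setup that makes Example #2 runnable, with the constant values taken from Example #1 and the import path for the min_obs encoder assumed to be the kaggler package:

import cProfile

import numpy as np
import pandas as pd
from kaggler.preprocessing import LabelEncoder  # assumed import path

N_OBS = 1000000    # matches Example #1's row count
N_CATEGORY = 1000  # matches Example #1's randint upper bound
N_FEATURE = 10     # matches Example #1's column count

test()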
Example #3
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    trn['date'] = pd.to_datetime(trn.date, format='%m%d%Y')
    tst['date'] = pd.to_datetime(tst.date, format='%m%d%Y')

    trn['year_2017'] = trn.date.dt.year - 2016
    tst['year_2017'] = tst.date.dt.year - 2016

    trn['month'] = trn.date.dt.month
    tst['month'] = tst.date.dt.month

    y = trn.target.values

    n_trn = trn.shape[0]

    trn.drop(['target', 'date', 'f_19'], axis=1, inplace=True)
    tst.drop(['id', 'date', 'f_19'], axis=1, inplace=True)

    cat_cols = ['customer_id'] + [x for x in trn.columns if trn[x].dtype == object]
    num_cols = [x for x in trn.columns if trn[x].dtype != object]

    logging.info('categorical: {}, numerical: {}'.format(
        len(cat_cols), len(num_cols)))

    logging.info('label encoding categorical variables')
    lbe = LabelEncoder(min_obs=10)
    trn.loc[:, cat_cols] = lbe.fit_transform(trn[cat_cols].values)
    tst.loc[:, cat_cols] = lbe.transform(tst[cat_cols].values)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(trn.columns):
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(trn.values.astype(float), y, train_feature_file)
    save_data(tst.values.astype(float), None, test_feature_file)
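The LabelEncoder used throughout these examples is not scikit-learn's: it takes a min_obs argument and, following the kaggler package's convention, groups categories observed fewer than min_obs times under a single label. A simplified, hypothetical stand-in sketching that behavior (the real implementation may differ):

import numpy as np
import pandas as pd

class MinObsLabelEncoder:
    """Hypothetical sketch of a min_obs label encoder: frequent categories
    get labels 1..k per column; rare and unseen values fall back to 0."""

    def __init__(self, min_obs=10):
        self.min_obs = min_obs

    def fit(self, X):
        # Build one mapping per column from category value to label.
        self.maps_ = []
        for j in range(X.shape[1]):
            counts = pd.Series(X[:, j]).value_counts()
            frequent = counts[counts >= self.min_obs].index
            self.maps_.append({v: i + 1 for i, v in enumerate(frequent)})
        return self

    def transform(self, X):
        out = np.zeros(X.shape, dtype=np.int64)
        for j, mapping in enumerate(self.maps_):
            out[:, j] = pd.Series(X[:, j]).map(mapping).fillna(0).astype(np.int64)
        return out

    def fit_transform(self, X):
        return self.fit(X).transform(X)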
Example #4
class Model:
    def __init__(self, datainfo, timeinfo):
        '''
        This constructor is supposed to initialize data members.
        Use triple quotes for function documentation.
        '''
        # Log some info from the datainfo variable
        logging.info("The budget for this data set is: %d seconds" %
                     datainfo['time_budget'])

        logging.info(
            "Loaded %d time features, %d numerical features, %d categorical features and %d multi-valued categorical variables"
            % (datainfo['loaded_feat_types'][0],
               datainfo['loaded_feat_types'][1],
               datainfo['loaded_feat_types'][2],
               datainfo['loaded_feat_types'][3]))
        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]
        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)
        self.num_train_samples = 0
        self.num_feat = 1
        self.num_labels = 1
        self.is_trained = False
        self.clf = LGBMClassifier(**params)
        # Here you may have parameters and hyper-parameters

    def fit(self, F, y, datainfo, timeinfo):
        '''
        This function should train the model parameters.
        Args:
            F: Training data, a dict of arrays keyed by feature type
               (e.g. F['numerical'], F['CAT']).
            y: Training label matrix of dim num_train_samples * num_labels.
        If fit is called multiple times on incremental data (train, test1, test2, etc.)
        you should warm-start your training from the pre-trained model. Past data will
        NOT be available for re-training.
        '''

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]

        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        date_cols = datainfo['loaded_feat_types'][0]
        numeric_cols = datainfo['loaded_feat_types'][1]
        categorical_cols = datainfo['loaded_feat_types'][2]
        multicategorical_cols = datainfo['loaded_feat_types'][3]

        # Get numerical variables and replace NaNs with 0s
        self.X = np.nan_to_num(F['numerical'])
        self.y = y

        # Frequency encode categorical variables and concatenate them with numerical variables
        if categorical_cols > 0:
            self.cat_encs = LabelEncoder()
            X_cat = self.cat_encs.fit_transform(F['CAT']).values
            self.X = np.concatenate((self.X, X_cat), axis=1)
            del X_cat

        self.num_train_samples = self.X.shape[0]
        self.num_feat = self.X.shape[1]
        num_train_samples = y.shape[0]

        logging.info("The whole available data is: ")
        logging.info("Real-FIT: dim(X)= [{:d}, {:d}]".format(self.X.shape[0], self.X.shape[1]))
        logging.info("Real-FIT: dim(y)= [{:d}, {:d}]".format(self.y.shape[0], self.num_labels))

        self.is_trained = True

    def predict(self, F, datainfo, timeinfo):
        '''
        This function should provide predictions of labels on (test) data.
        Make sure that the predicted values are in the correct format for the scoring
        metric. For example, binary classification problems often expect predictions
        in the form of a discriminant value (if the area under the ROC curve is the metric)
        rather than predictions of the class labels themselves.
        The function predict can return probabilities or continuous values.
        '''

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]

        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        date_cols = datainfo['loaded_feat_types'][0]
        numeric_cols = datainfo['loaded_feat_types'][1]
        categorical_cols = datainfo['loaded_feat_types'][2]
        multicategorical_cols = datainfo['loaded_feat_types'][3]

        # Get numerical variables and replace NaNs with 0s
        X = np.nan_to_num(F['numerical'])

        # Frequency encode categorical variables and concatenate them with numerical variables
        if categorical_cols > 0:
            X_cat = self.cat_encs.transform(F['CAT']).values
            X = np.concatenate((X, X_cat), axis=1)
            del X_cat

        # Adversarial validation
        logging.info('AV: starting adversarial validation...')

        np.random.seed(SEED)
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

        n_trn = self.X.shape[0]
        n_tst = X.shape[0]
        n_feature = X.shape[1]

        X_all = np.vstack((self.X, X))
        y_all = np.concatenate((np.zeros(n_trn,), np.ones(n_tst,)))
        logging.info(f'AV: {X_all.shape}, {y_all.shape}')
        logging.info(f'AV: {np.unique(y_all)}')

        av_auc = 1.
        cols = np.arange(n_feature)
        count = 0
        av_auc_threshold = .8
        while av_auc > av_auc_threshold:
            model_av = RandomForestClassifier(min_samples_leaf=20,
                                              min_impurity_decrease=.01,
                                              random_state=SEED)
            model_av.fit(X_all[:, cols], y_all)

            ps_all = model_av.predict_proba(X_all[:, cols])[:, 1]
            av_auc = roc_auc_score(y_all, ps_all)
            logging.info(f'AV #{count}: AUC={av_auc * 100: 3.2f}')

            imp = pd.DataFrame({'feature': cols,
                                'importance': model_av.feature_importances_})
            imp = imp.sort_values('importance', ascending=False)
            logging.info(f'AV #{count}: feature importance\n{imp.head(10)}')

            # Select features
            cols_to_drop = imp.loc[imp.importance > GINI_THRESHOLD,
                                   'feature'].values[:int(np.ceil(len(cols) * .1))]
            logging.info(f'AV #{count}: columns to drop: {cols_to_drop}')
            if av_auc <= av_auc_threshold or len(cols_to_drop) == 0:
                break

            cols = [x for x in cols if x not in cols_to_drop]
            logging.info(f'AV #{count}: columns to keep: {cols}')
            count += 1

        X = X[:, cols]
        self.X = self.X[:, cols]
        logging.info(f'AV: # of features after selection: {X.shape[1]}')

        # Training
        X_trn, X_val, y_trn, y_val = train_test_split(self.X, self.y, test_size=.25, random_state=SEED)
        self.clf.fit(X_trn, y_trn,
                     eval_set=(X_val, y_val),
                     early_stopping_rounds=10,
                     verbose=10)

        num_test_samples = X.shape[0]
        num_feat = X.shape[1] if X.ndim > 1 else 1
        logging.info("PREDICT: dim(X)= [{:d}, {:d}]".format(num_test_samples, num_feat))
        logging.info("PREDICT: dim(y)= [{:d}, {:d}]".format(num_test_samples, self.num_labels))
        y = self.clf.predict_proba(X)[:, 1]
        y = np.transpose(y)
        return y

    def save(self, path="./"):
        pickle.dump(self, open(path + '_model.pickle', "w"))

    def load(self, path="./"):
        modelfile = path + '_model.pickle'
        if isfile(modelfile):
            with open(modelfile) as f:
                self = pickle.load(f)
            logging.info("Model reloaded from: " + modelfile)
        return self
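These Model classes follow the ChaLearn AutoML challenge interface: the harness constructs the model with datainfo/timeinfo, then calls fit and predict with a feature bundle F. A sketch of how a driver might exercise the class above, with illustrative inputs only; just the datainfo keys actually read by the code are populated, and module-level names such as params, SEED, and GINI_THRESHOLD must already be defined:

import time

import numpy as np
import pandas as pd

datainfo = {
    'time_budget': 300,                 # assumed value
    'loaded_feat_types': [0, 5, 2, 0],  # (date, numerical, categorical, multi-categorical)
}
timeinfo = (time.time(), time.time())

F = {
    'numerical': np.random.rand(100, 5),
    'CAT': pd.DataFrame(np.random.choice(list('abc'), size=(100, 2))),
}
y = np.random.randint(0, 2, size=100)

model = Model(datainfo, timeinfo)
model.fit(F, y, datainfo, timeinfo)
ps = model.predict(F, datainfo, timeinfo)  # probabilities of the positive class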
Example #5
class Model:
    def __init__(self, datainfo, timeinfo):
        '''
        This constructor is supposed to initialize data members.
        Use triple quotes for function documentation.
        '''
        # Log some info from the datainfo variable
        logging.info("The budget for this data set is: %d seconds" %
                     datainfo['time_budget'])

        logging.info(
            "Loaded %d time features, %d numerical features, %d categorical features and %d multi-valued categorical variables"
            % (datainfo['loaded_feat_types'][0],
               datainfo['loaded_feat_types'][1],
               datainfo['loaded_feat_types'][2],
               datainfo['loaded_feat_types'][3]))
        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]
        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)
        self.num_train_samples = 0
        self.num_feat = 1
        self.num_labels = 1
        self.is_trained = False
        self.clf = LGBMClassifier(**params)
        # Here you may have parameters and hyper-parameters

    def fit(self, F, y, datainfo, timeinfo):
        '''
        This function should train the model parameters.
        Args:
            F: Training data, a dict of arrays keyed by feature type
               (e.g. F['numerical'], F['CAT']).
            y: Training label matrix of dim num_train_samples * num_labels.
        If fit is called multiple times on incremental data (train, test1, test2, etc.)
        you should warm-start your training from the pre-trained model. Past data will
        NOT be available for re-training.
        '''

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]

        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        date_cols = datainfo['loaded_feat_types'][0]
        numeric_cols = datainfo['loaded_feat_types'][1]
        categorical_cols = datainfo['loaded_feat_types'][2]
        multicategorical_cols = datainfo['loaded_feat_types'][3]

        # Get numerical variables and replace NaNs with 0s
        X = np.nan_to_num(F['numerical'])

        # Frequency encode categorical variables and concatenate them with numerical variables
        if categorical_cols > 0:
            self.cat_encs = LabelEncoder()
            X_cat = self.cat_encs.fit_transform(F['CAT']).values
            X = np.concatenate((X, X_cat), axis=1)
            del X_cat

        self.num_train_samples = X.shape[0]
        self.num_feat = X.shape[1]
        num_train_samples = y.shape[0]

        self.DataX = X
        self.DataY = y
        logging.info("The whole available data is: ")
        logging.info("Real-FIT: dim(X)= [{:d}, {:d}]".format(self.DataX.shape[0], self.DataX.shape[1]))
        logging.info("Real-FIT: dim(y)= [{:d}, {:d}]".format(self.DataY.shape[0], self.num_labels))

        X_trn, X_val, y_trn, y_val = train_test_split(X, y, test_size=.25, random_state=SEED)
        self.clf.fit(X_trn, y_trn,
                     eval_set=(X_val, y_val),
                     early_stopping_rounds=10,
                     verbose=10)

        if self.num_train_samples != num_train_samples:
            logging.info("ARRGH: number of samples in X and y do not match!")
        self.is_trained = True

    def predict(self, F, datainfo, timeinfo):
        '''
        This function should provide predictions of labels on (test) data.
        Make sure that the predicted values are in the correct format for the scoring
        metric. For example, binary classification problems often expect predictions
        in the form of a discriminant value (if the area under the ROC curve is the metric)
        rather than predictions of the class labels themselves.
        The function predict can return probabilities or continuous values.
        '''

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]

        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        date_cols = datainfo['loaded_feat_types'][0]
        numeric_cols = datainfo['loaded_feat_types'][1]
        categorical_cols = datainfo['loaded_feat_types'][2]
        multicategorical_cols = datainfo['loaded_feat_types'][3]

        # Get numerical variables and replace NaNs with 0s
        X = np.nan_to_num(F['numerical'])

        # Frequency encode categorical variables and concatenate them with numerical variables
        if categorical_cols > 0:
            X_cat = self.cat_encs.transform(F['CAT']).values
            X = np.concatenate((X, X_cat), axis=1)
            del X_cat

        num_test_samples = X.shape[0]
        num_feat = X.shape[1] if X.ndim > 1 else 1
        logging.info("PREDICT: dim(X)= [{:d}, {:d}]".format(num_test_samples, num_feat))
        if self.num_feat != num_feat:
            logging.info("ARRGH: number of features in X does not match training data!")
        logging.info("PREDICT: dim(y)= [{:d}, {:d}]".format(num_test_samples, self.num_labels))
        y = self.clf.predict_proba(X)[:, 1]
        y = np.transpose(y)
        return y

    def save(self, path="./"):
        with open(path + '_model.pickle', "wb") as f:
            pickle.dump(self, f)

    def load(self, path="./"):
        modelfile = path + '_model.pickle'
        if isfile(modelfile):
            with open(modelfile, "rb") as f:
                self = pickle.load(f)
            logging.info("Model reloaded from: " + modelfile)
        return self
Example #6
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file, cv_id_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    y = trn.target.values

    logging.info('converting the date column into datetime')
    trn['date'] = pd.to_datetime(trn.date, format='%m%d%Y')
    tst['date'] = pd.to_datetime(tst.date, format='%m%d%Y')

    logging.info('add year and month features')
    trn['year_2017'] = trn.date.dt.year - 2016
    tst['year_2017'] = tst.date.dt.year - 2016

    logging.info('add the month feature')
    trn['month'] = trn.date.dt.month
    tst['month'] = tst.date.dt.month

    logging.info('splitting customer_ids into first 5 and next 3 digits')
    trn['cid_5'] = trn.customer_id // 1e7
    tst['cid_5'] = tst.customer_id // 1e7
    trn['cid_3'] = (trn.customer_id // 1e4) % 1e3
    tst['cid_3'] = (tst.customer_id // 1e4) % 1e3

    logging.info(
        'adding a flag to indicate if a customer_id exists in both training and test data'
    )
    trn['cid_both'] = trn.customer_id.isin(tst.customer_id.tolist()).astype(
        np.int64)
    tst['cid_both'] = tst.customer_id.isin(trn.customer_id.tolist()).astype(
        np.int64)

    logging.info('combining cid_5, month, and market')
    trn['cid_5_month_market'] = (trn.cid_5 * 1e4 + trn.month * 100 +
                                 trn.market.str[1:].astype(int))
    tst['cid_5_month_market'] = (tst.cid_5 * 1e4 + tst.month * 100 +
                                 tst.market.str[1:].astype(int))

    logging.info('combining cid_3, month, and market')
    trn['cid_3_month_market'] = (trn.cid_3 * 1e4 + trn.month * 100 +
                                 trn.market.str[1:].astype(int))
    tst['cid_3_month_market'] = (tst.cid_3 * 1e4 + tst.month * 100 +
                                 tst.market.str[1:].astype(int))

    logging.info('drop unused columns')
    trn.drop(COLS_TO_DROP, axis=1, inplace=True)
    tst.drop(['id'] + COLS_TO_DROP, axis=1, inplace=True)

    cat_cols = [
        'customer_id', 'cid_5', 'cid_3', 'cid_5_month_market',
        'cid_3_month_market'
    ]
    cat_cols += [x for x in trn.columns if trn[x].dtype == object]
    float_cols = [x for x in trn.columns if trn[x].dtype == np.float64]
    int_cols = [
        x for x in trn.columns if x not in ['target'] + cat_cols + float_cols
    ]

    logging.info('categorical: {}, float: {}, int: {}'.format(
        len(cat_cols), len(float_cols), len(int_cols)))

    logging.info('label encoding categorical variables')
    lbe = LabelEncoder(min_obs=10)
    trn.loc[:, cat_cols] = lbe.fit_transform(trn[cat_cols].values)
    tst.loc[:, cat_cols] = lbe.transform(tst[cat_cols].values)

    logging.info('min-max scaling float columns')
    scaler = MinMaxScaler()
    trn.loc[:, float_cols] = scaler.fit_transform(trn[float_cols].values)
    tst.loc[:, float_cols] = scaler.transform(tst[float_cols].values)

    logging.info('adding interactions with f_5')
    interaction_cols = [
        'f_8', 'f_12', 'f_18', 'f_11', 'f_13', 'f_21', 'f_15', 'f_26'
    ]

    feature_cols = cat_cols + float_cols + int_cols
    for col in interaction_cols:
        trn['f_5+{}'.format(col)] = trn.f_5 * 10 + trn[col]
        tst['f_5+{}'.format(col)] = tst.f_5 * 10 + tst[col]
        feature_cols.append('f_5+{}'.format(col))

    for col1, col2 in combinations(interaction_cols, 2):
        logging.info('adding interactions between {} and {}'.format(
            col1, col2))
        trn['{}+{}'.format(col1, col2)] = trn[col1] + trn[col2]
        tst['{}+{}'.format(col1, col2)] = tst[col1] + tst[col2]

        trn['{}-{}'.format(col1, col2)] = trn[col1] - trn[col2]
        tst['{}-{}'.format(col1, col2)] = tst[col1] - tst[col2]

        trn['{}x{}'.format(col1, col2)] = (trn[col1].apply(np.log1p) +
                                           trn[col2].apply(np.log1p))
        tst['{}x{}'.format(col1, col2)] = (tst[col1].apply(np.log1p) +
                                           tst[col2].apply(np.log1p))

        trn['{}/{}'.format(col1, col2)] = (trn[col1].apply(np.log1p) -
                                           trn[col2].apply(np.log1p))
        tst['{}/{}'.format(col1, col2)] = (tst[col1].apply(np.log1p) -
                                           tst[col2].apply(np.log1p))

        feature_cols += [
            '{}+{}'.format(col1, col2), '{}-{}'.format(col1, col2),
            '{}x{}'.format(col1, col2), '{}/{}'.format(col1, col2)
        ]

    logging.info('generate CV features')
    feature_name, feature_ext = os.path.splitext(train_feature_file)
    feature_name = os.path.splitext(feature_name)[0]

    logging.info('Loading CV Ids')
    cv_id = np.loadtxt(cv_id_file)

    for i in range(1, N_FOLD + 1):
        i_trn = np.where(cv_id != i)[0]
        i_val = np.where(cv_id == i)[0]

        cv_feature_cols = []
        logging.info('mean-target encoding for categorical columns for CV #{}'.format(i))
        cv_trn = trn[cat_cols + [TARGET]].copy()
        cv_tst = tst[cat_cols].copy()
        for col in cat_cols:
            mean_target = cv_trn.iloc[i_trn][[col, TARGET]].groupby(col).mean()
            mapping = mean_target.to_dict()[TARGET]
            cv_trn[col] = cv_trn[col].map(mapping)
            cv_tst[col] = cv_tst[col].map(mapping)

        cv_feature_cols += cat_cols

        logging.info('adding min, max, median of mean-target encodings of categorical columns')
        cv_trn['min_target_encoding'] = cv_trn[cat_cols].min(axis=1)
        cv_trn['max_target_encoding'] = cv_trn[cat_cols].max(axis=1)
        cv_trn['median_target_encoding'] = cv_trn[cat_cols].median(axis=1)
        cv_tst['min_target_encoding'] = cv_tst[cat_cols].min(axis=1)
        cv_tst['max_target_encoding'] = cv_tst[cat_cols].max(axis=1)
        cv_tst['median_target_encoding'] = cv_tst[cat_cols].median(axis=1)

        cv_feature_cols += [
            'min_target_encoding', 'max_target_encoding',
            'median_target_encoding'
        ]

        logging.info('saving features for CV #{}'.format(i))
        save_data(cv_trn[cv_feature_cols].values.astype(float), y,
                  '{}.trn{}{}'.format(feature_name, i, feature_ext))
        save_data(cv_tst[cv_feature_cols].values.astype(float), None,
                  '{}.tst{}{}'.format(feature_name, i, feature_ext))

    logging.info('saving non-CV features')
    save_data(trn[feature_cols].values.astype(float), y, train_feature_file)
    save_data(tst[feature_cols].values.astype(float), None, test_feature_file)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(feature_cols + cv_feature_cols):
            f.write('{}\t{}\tq\n'.format(i, col))
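The cv_id_file read by np.loadtxt above is assumed to hold one fold id per training row, numbered 1 through N_FOLD. A sketch of a helper that would produce a compatible file, assuming stratified folds:

import numpy as np
from sklearn.model_selection import StratifiedKFold

def generate_cv_id(y, cv_id_file, n_fold=5, seed=42):
    # Assign each training row a fold id in 1..n_fold (assumed format).
    cv_id = np.zeros(len(y), dtype=int)
    cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
    for i, (_, i_val) in enumerate(cv.split(np.zeros(len(y)), y), start=1):
        cv_id[i_val] = i
    np.savetxt(cv_id_file, cv_id, fmt='%d')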
Example #7
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    logging.info('converting the date column into datetime')
    trn['date'] = pd.to_datetime(trn.date, format='%m%d%Y')
    tst['date'] = pd.to_datetime(tst.date, format='%m%d%Y')

    logging.info('add year and month features')
    trn['year_2017'] = trn.date.dt.year - 2016
    tst['year_2017'] = tst.date.dt.year - 2016

    trn['month'] = trn.date.dt.month
    tst['month'] = tst.date.dt.month

    y = trn.target.values

    n_trn = trn.shape[0]

    logging.info('splitting customer ids into first 8 digits')
    trn['cid_8'] = trn.customer_id // 10000
    tst['cid_8'] = tst.customer_id // 10000

    logging.info('drop unused columns')
    trn.drop(['target', 'date', 'f_19', 'customer_id'], axis=1, inplace=True)
    tst.drop(['id', 'date', 'f_19', 'customer_id'], axis=1, inplace=True)

    cat_cols = ['cid_8'] + [x for x in trn.columns if trn[x].dtype == object]
    float_cols = [x for x in trn.columns if trn[x].dtype == np.float64]
    int_cols = [
        x for x in trn.columns if trn[x].dtype == np.int64 and x != 'cid_8'
    ]

    logging.info('categorical: {}, float: {}, int: {}'.format(
        len(cat_cols), len(float_cols), len(int_cols)))

    logging.info('label encoding categorical variables')
    lbe = LabelEncoder(min_obs=10)
    trn.loc[:, cat_cols] = lbe.fit_transform(trn[cat_cols].values)
    tst.loc[:, cat_cols] = lbe.transform(tst[cat_cols].values)

    logging.info('min-max scaling float columns')
    scaler = MinMaxScaler()
    trn.loc[:, float_cols] = scaler.fit_transform(trn[float_cols].values)
    tst.loc[:, float_cols] = scaler.transform(tst[float_cols].values)

    logging.info('adding interactions')
    trn['f_5+f_21'] = trn.f_5 + trn.f_21
    tst['f_5+f_21'] = tst.f_5 + tst.f_21
    float_cols.append('f_5+f_21')

    interaction_cols = ['f_13', 'f_21', 'f_15', 'f_26']
    for col1, col2 in combinations(interaction_cols, 2):
        logging.info('adding interactions between {} and {}'.format(
            col1, col2))
        trn['{}+{}'.format(col1, col2)] = trn[col1] + trn[col2]
        tst['{}+{}'.format(col1, col2)] = tst[col1] + tst[col2]

        trn['{}-{}'.format(col1, col2)] = trn[col1] - trn[col2]
        tst['{}-{}'.format(col1, col2)] = tst[col1] - tst[col2]

        trn['{}x{}'.format(col1, col2)] = (trn[col1].apply(np.log1p) +
                                           trn[col2].apply(np.log1p))
        tst['{}x{}'.format(col1, col2)] = (tst[col1].apply(np.log1p) +
                                           tst[col2].apply(np.log1p))

        trn['{}/{}'.format(col1, col2)] = (trn[col1].apply(np.log1p) -
                                           trn[col2].apply(np.log1p))
        tst['{}/{}'.format(col1, col2)] = (tst[col1].apply(np.log1p) -
                                           tst[col2].apply(np.log1p))

        float_cols += [
            '{}+{}'.format(col1, col2), '{}-{}'.format(col1, col2),
            '{}x{}'.format(col1, col2), '{}/{}'.format(col1, col2)
        ]

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(trn.columns):
            if col in cat_cols + int_cols:
                f.write('{}\t{}\tint\n'.format(i, col))
            else:
                f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(trn.values.astype(float), y, train_feature_file)
    save_data(tst.values.astype(float), None, test_feature_file)
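A note on the interaction names used above: since log1p(a) + log1p(b) = log((1 + a)(1 + b)), the '{}x{}' and '{}/{}' columns are products and ratios computed in the log domain. A quick numerical check:

import numpy as np

a, b = np.array([0.5, 2.0]), np.array([1.5, 3.0])
assert np.allclose(np.log1p(a) + np.log1p(b), np.log((1 + a) * (1 + b)))
assert np.allclose(np.log1p(a) - np.log1p(b), np.log((1 + a) / (1 + b)))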
Example #8
class Model:
    def __init__(self, datainfo, timeinfo):
        '''
        This constructor is supposed to initialize data members.
        Use triple quotes for function documentation.
        '''
        # Log some info from the datainfo variable
        logging.info("The budget for this data set is: %d seconds" %
                     datainfo['time_budget'])

        logging.info(
            "Loaded %d time features, %d numerical features, %d categorical features and %d multi-valued categorical variables"
            % (datainfo['loaded_feat_types'][0],
               datainfo['loaded_feat_types'][1],
               datainfo['loaded_feat_types'][2],
               datainfo['loaded_feat_types'][3]))
        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]
        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)
        self.num_train_samples = 0
        self.num_feat = 1
        self.num_labels = 1
        self.is_trained = False
        self.clf = LGBMClassifier(**params)
        # Here you may have parameters and hyper-parameters

    def fit(self, F, y, datainfo, timeinfo):
        '''
        This function should train the model parameters.
        Args:
            F: Training data, a dict of arrays keyed by feature type
               (e.g. F['numerical'], F['CAT']).
            y: Training label matrix of dim num_train_samples * num_labels.
        If fit is called multiple times on incremental data (train, test1, test2, etc.)
        you should warm-start your training from the pre-trained model. Past data will
        NOT be available for re-training.
        '''

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]

        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        date_cols = datainfo['loaded_feat_types'][0]
        numeric_cols = datainfo['loaded_feat_types'][1]
        categorical_cols = datainfo['loaded_feat_types'][2]
        multicategorical_cols = datainfo['loaded_feat_types'][3]

        # Get numerical variables and replace NaNs with 0s
        self.X = np.nan_to_num(F['numerical'])
        self.y = y

        # Frequency encode categorical variables and concatenate them with numerical variables
        if categorical_cols > 0:
            self.cat_encs = LabelEncoder()
            X_cat = self.cat_encs.fit_transform(F['CAT']).values
            self.X = np.concatenate((self.X, X_cat), axis=1)
            del X_cat

        self.num_train_samples = self.X.shape[0]
        self.num_feat = self.X.shape[1]
        num_train_samples = y.shape[0]

        logging.info("The whole available data is: ")
        logging.info(
            ("Real-FIT: dim(X)= [{:d}, {:d}]").format(self.X.shape[0],
                                                      self.X.shape[1]))
        logging.info(
            ("Real-FIT: dim(y)= [{:d}, {:d}]").format(self.y.shape[0],
                                                      self.num_labels))

        self.is_trained = True

    def predict(self, F, datainfo, timeinfo):
        '''
        This function should provide predictions of labels on (test) data.
        Make sure that the predicted values are in the correct format for the scoring
        metric. For example, binary classification problems often expect predictions
        in the form of a discriminant value (if the area under the ROC curve is the metric)
        rather than predictions of the class labels themselves.
        The function predict can return probabilities or continuous values.
        '''

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]

        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        date_cols = datainfo['loaded_feat_types'][0]
        numeric_cols = datainfo['loaded_feat_types'][1]
        categorical_cols = datainfo['loaded_feat_types'][2]
        multicategorical_cols = datainfo['loaded_feat_types'][3]

        # Get numerical variables and replace NaNs with 0s
        X = np.nan_to_num(F['numerical'])

        # Frequency encode categorical variables and concatenate them with numerical variables
        if categorical_cols > 0:
            X_cat = self.cat_encs.transform(F['CAT']).values
            X = np.concatenate((X, X_cat), axis=1)
            del X_cat

        # Adversarial validation
        logging.info('AV: starting adversarial validation...')

        np.random.seed(SEED)
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

        n_trn = self.X.shape[0]
        n_tst = X.shape[0]
        n_feature = X.shape[1]

        X_all = np.vstack((self.X, X))
        y_all = np.concatenate((np.zeros(n_trn), np.ones(n_tst)))
        logging.info(f'AV: {X_all.shape}, {y_all.shape}')

        # Train an adversarial validation classifier
        ps_all = np.zeros_like(y_all, dtype=float)
        for i, (i_trn, i_val) in enumerate(cv.split(X_all, y_all)):

            model_av = LGBMClassifier(**params)
            model_av.fit(X_all[i_trn],
                         y_all[i_trn],
                         eval_set=(X_all[i_val], y_all[i_val]),
                         early_stopping_rounds=10,
                         verbose=10)

            ps_all[i_val] = model_av.predict_proba(X_all[i_val])[:, 1]

        av_score = roc_auc_score(y_all, ps_all)
        logging.info(f'AV: AUC={av_score * 100: 3.2f}')

        ps_all = np.clip(calibrate(ps_all, y_all), .1, .9)
        w_all = ps_all / (1 - ps_all)
        logging.info(
            f'AV: propensity score deciles: {np.percentile(ps_all, np.linspace(0, 100, 11))}'
        )

        # Training
        X_trn, X_val, y_trn, y_val, w_trn, w_val = train_test_split(
            self.X, self.y, w_all[:n_trn], test_size=.25, random_state=SEED)
        self.clf.fit(X_trn,
                     y_trn,
                     eval_set=(X_val, y_val),
                     early_stopping_rounds=10,
                     verbose=10,
                     sample_weight=w_trn)

        num_test_samples = X.shape[0]
        num_feat = X.shape[1]
        logging.info(
            ("PREDICT: dim(X)= [{:d}, {:d}]").format(num_test_samples,
                                                     num_feat))
        logging.info(
            ("PREDICT: dim(y)= [{:d}, {:d}]").format(num_test_samples,
                                                     self.num_labels))
        y = self.clf.predict_proba(X)[:, 1]
        y = np.transpose(y)
        return y

    def save(self, path="./"):
        with open(path + '_model.pickle', "wb") as f:
            pickle.dump(self, f)

    def load(self, path="./"):
        modelfile = path + '_model.pickle'
        if isfile(modelfile):
            with open(modelfile, "rb") as f:
                self = pickle.load(f)
            logging.info("Model reloaded from: " + modelfile)
        return self
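calibrate() is not defined anywhere in these snippets. Since its output is clipped to [.1, .9] and converted into importance weights w = p / (1 - p), it presumably rescales the adversarial-validation probabilities; the stand-in below is only an assumption about that contract, not the original code:

import numpy as np

def calibrate(ps, y):
    # Hypothetical stand-in: rescale predicted probabilities so their mean
    # matches the observed positive rate, preserving the ranking.
    return ps * (y.mean() / ps.mean())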
Example #9
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    y = trn.target.values

    n_trn = trn.shape[0]

    logging.info(
        'adding a flag to indicate if a customer_id exists in both training and test data'
    )
    trn['cid_both'] = trn.customer_id.isin(tst.customer_id.tolist()).astype(
        np.int64)
    tst['cid_both'] = tst.customer_id.isin(trn.customer_id.tolist()).astype(
        np.int64)
    num_cols = ['cid_both']

    logging.info('converting the date column into datetime')
    trn['date'] = pd.to_datetime(trn.date, format='%m%d%Y')
    tst['date'] = pd.to_datetime(tst.date, format='%m%d%Y')

    logging.info('add the month feature')
    trn['month'] = trn.date.dt.month
    tst['month'] = tst.date.dt.month

    logging.info('combining cid_5, month, and market')
    trn['cid_5_month_market'] = ((trn.customer_id // 1e7) * 1e4 +
                                 trn.month * 100 + trn.market.str[1:].astype(int))
    tst['cid_5_month_market'] = ((tst.customer_id // 1e7) * 1e4 +
                                 tst.month * 100 + tst.market.str[1:].astype(int))

    logging.info('combining cid_3, month, and market')
    trn['cid_3_month_market'] = (((trn.customer_id // 1e4) % 1e3) * 1e4 +
                                 trn.month * 100 + trn.market.str[1:].astype(int))
    tst['cid_3_month_market'] = (((tst.customer_id // 1e4) % 1e3) * 1e4 +
                                 tst.month * 100 + tst.market.str[1:].astype(int))

    cat_cols = ['cid_5_month_market', 'cid_3_month_market']

    logging.info('label encoding categorical variables')
    lbe = LabelEncoder(min_obs=10)
    trn.loc[:, cat_cols] = lbe.fit_transform(trn[cat_cols].values)
    tst.loc[:, cat_cols] = lbe.transform(tst[cat_cols].values)

    logging.info('mean-target encoding for categorical columns')
    for col in cat_cols:
        colname = 'mt_{}'.format(col)
        mean_target = trn[[col, 'target']].groupby(col).mean()
        mapping = mean_target.to_dict()['target']
        trn[colname] = trn[col].map(mapping)
        tst[colname] = tst[col].map(mapping)
        num_cols.append(colname)

    feature_cols = num_cols + cat_cols

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(feature_cols):
            if col in num_cols:
                f.write('{}\t{}\tq\n'.format(i, col))
            else:
                f.write('{}\t{}\tint\n'.format(i, col))

    logging.info('saving features')
    save_data(trn[feature_cols].values.astype(float), y, train_feature_file)
    save_data(tst[feature_cols].values.astype(float), None, test_feature_file)
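save_data() is another project helper not shown here. The feature-map lines written alongside it (index, name, and a 'q' or 'int' type, tab-separated) match XGBoost's fmap format, where 'q' marks quantitative and 'int' integer features, so the features are presumably stored in a format xgboost can load. A plausible stand-in using svmlight files (an assumption, not the original helper):

import numpy as np
from sklearn.datasets import dump_svmlight_file

def save_data(X, y, path):
    # Hypothetical stand-in: write features (and labels, when given) in
    # svmlight/libsvm format, which xgboost reads natively.
    if y is None:
        y = np.zeros(X.shape[0])
    dump_svmlight_file(X, y, path, zero_based=False)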