def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col='id')
    tst = pd.read_csv(test_file, index_col='id')

    y = trn.loss.values
    n_trn = trn.shape[0]
    trn.drop('loss', axis=1, inplace=True)

    # np.object is removed in recent NumPy; plain `object` is equivalent
    cat_cols = [x for x in trn.columns if trn[x].dtype == object]
    num_cols = [x for x in trn.columns if trn[x].dtype != object]
    logging.info('categorical: {}, numerical: {}'.format(len(cat_cols), len(num_cols)))

    df = pd.concat([trn, tst], axis=0)

    logging.info('label encoding categorical variables')
    lbe = LabelEncoder(min_obs=10)
    # .ix was removed from pandas; use .loc instead
    df.loc[:, cat_cols] = lbe.fit_transform(df[cat_cols].values)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(df.columns):
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(df.values[:n_trn, ], y, train_feature_file)
    save_data(df.values[n_trn:, ], None, test_feature_file)
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_header_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col=ID_COL)
    tst = pd.read_csv(test_file, index_col=ID_COL)

    y = trn[TARGET_COL].values
    n_trn = trn.shape[0]
    trn.drop(TARGET_COL, axis=1, inplace=True)

    cat_cols = [x for x in trn.columns if trn[x].dtype == object]
    num_cols = [x for x in trn.columns if trn[x].dtype != object]
    logging.info(f'categorical: {len(cat_cols)}, numerical: {len(num_cols)}')

    df = pd.concat([trn, tst], axis=0)

    logging.info('label encoding categorical variables')
    lbe = LabelEncoder(min_obs=10)
    df[cat_cols] = lbe.fit_transform(df[cat_cols])
    df[num_cols] = df[num_cols].fillna(-1)

    with open(feature_header_file, 'w') as f:
        for col in df.columns:
            f.write(f'{col}\n')

    logging.info('saving features')
    save_data(df.values[:n_trn, ], y, train_feature_file)
    save_data(df.values[n_trn:, ], None, test_feature_file)
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    y = trn[TARGET_COL]
    n_trn = trn.shape[0]

    features = [x for x in trn.columns if x not in [ID_COL, TARGET_COL]]
    df = pd.concat([trn.drop([TARGET_COL, ID_COL], axis=1),
                    tst.drop(ID_COL, axis=1)], axis=0)

    logging.info('label encoding')
    lbe = LabelEncoder(min_obs=50)
    df[features] = lbe.fit_transform(df[features])

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(features):
            f.write('{}\t{}\tint\n'.format(i, col))

    logging.info('saving features')
    save_data(df.values[:n_trn], y.values, train_feature_file)
    save_data(df.values[n_trn:], None, test_feature_file)
def test():
    df = pd.DataFrame(np.random.randint(0, 1000, size=(1000000, 10)),
                      columns=['c{}'.format(x) for x in range(10)])

    profiler = cProfile.Profile(subcalls=True, builtins=True, timeunit=.001)

    lbe = LabelEncoder(min_obs=100)
    profiler.enable()
    lbe.fit(df)
    X_new = lbe.transform(df)
    profiler.disable()
    profiler.print_stats()
def fit(self, F, y, datainfo, timeinfo):
    '''Train the model parameters.

    Args:
        X: Training data matrix of dim num_train_samples * num_feat.
        y: Training label matrix of dim num_train_samples * num_labels.

    Both inputs are numpy arrays.
    If fit is called multiple times on incremental data (train, test1, test2, etc.)
    you should warm-start your training from the pre-trained model. Past data will
    NOT be available for re-training.
    '''
    overall_spenttime = time.time() - timeinfo[0]
    dataset_spenttime = time.time() - timeinfo[1]
    logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
    logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

    date_cols = datainfo['loaded_feat_types'][0]
    numeric_cols = datainfo['loaded_feat_types'][1]
    categorical_cols = datainfo['loaded_feat_types'][2]
    multicategorical_cols = datainfo['loaded_feat_types'][3]

    # Get numerical variables and replace NaNs with 0s
    X = np.nan_to_num(F['numerical'])

    # Encode categorical variables and concatenate them with numerical variables
    if categorical_cols > 0:
        self.cat_encs = LabelEncoder()
        X_cat = self.cat_encs.fit_transform(F['CAT']).values
        X = np.concatenate((X, X_cat), axis=1)
        del X_cat

    self.num_train_samples = X.shape[0]
    self.num_feat = X.shape[1]
    num_train_samples = y.shape[0]

    self.DataX = X
    self.DataY = y
    logging.info("The whole available data is: ")
    logging.info("Real-FIT: dim(X)= [{:d}, {:d}]".format(self.DataX.shape[0], self.DataX.shape[1]))
    logging.info("Real-FIT: dim(y)= [{:d}, {:d}]".format(self.DataY.shape[0], self.num_labels))

    X_trn, X_val, y_trn, y_val = train_test_split(X, y, test_size=.25, random_state=SEED)
    self.clf.fit(X_trn, y_trn, eval_set=(X_val, y_val),
                 early_stopping_rounds=10, verbose=10)

    if self.num_train_samples != num_train_samples:
        logging.info("ARRGH: number of samples in X and y do not match!")
    self.is_trained = True
def test():
    df = pd.DataFrame(
        np.random.randint(0, N_CATEGORY, size=(N_OBS, N_FEATURE)),
        columns=["c{}".format(x) for x in range(N_FEATURE)],
    )

    profiler = cProfile.Profile(subcalls=True, builtins=True, timeunit=0.001)

    lbe = LabelEncoder(min_obs=100)
    profiler.enable()
    lbe.fit(df)
    _ = lbe.transform(df)
    profiler.disable()
    profiler.print_stats()
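# A minimal driver sketch (not in the original module): assuming the module-level
# constants N_OBS, N_FEATURE, and N_CATEGORY used above are defined, the profiling
# test can be run directly from the command line.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    test()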
def fit(self, F, y, datainfo, timeinfo):
    '''Train the model parameters.

    Args:
        X: Training data matrix of dim num_train_samples * num_feat.
        y: Training label matrix of dim num_train_samples * num_labels.

    Both inputs are numpy arrays.
    If fit is called multiple times on incremental data (train, test1, test2, etc.)
    you should warm-start your training from the pre-trained model. Past data will
    NOT be available for re-training.
    '''
    overall_spenttime = time.time() - timeinfo[0]
    dataset_spenttime = time.time() - timeinfo[1]
    logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
    logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

    date_cols = datainfo['loaded_feat_types'][0]
    numeric_cols = datainfo['loaded_feat_types'][1]
    categorical_cols = datainfo['loaded_feat_types'][2]
    multicategorical_cols = datainfo['loaded_feat_types'][3]

    # Get numerical variables
    self.X = F['numerical']
    self.y = y

    # Encode categorical variables and concatenate them with numerical variables
    if categorical_cols > 0:
        self.cat_encs = LabelEncoder()
        X_cat = self.cat_encs.fit_transform(F['CAT']).values
        self.X = np.concatenate((self.X, X_cat), axis=1)
        del X_cat

    self.num_train_samples = self.X.shape[0]
    self.num_feat = self.X.shape[1]
    num_train_samples = y.shape[0]

    logging.info("The whole available data is: ")
    logging.info("Real-FIT: dim(X)= [{:d}, {:d}]".format(self.X.shape[0], self.X.shape[1]))
    logging.info("Real-FIT: dim(y)= [{:d}, {:d}]".format(self.y.shape[0], self.num_labels))

    self.is_trained = True
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    trn['date'] = trn.date.apply(lambda x: pd.to_datetime(x, format='%m%d%Y'))
    tst['date'] = tst.date.apply(lambda x: pd.to_datetime(x, format='%m%d%Y'))

    trn['year_2017'] = trn.date.apply(lambda x: x.year - 2016)
    tst['year_2017'] = tst.date.apply(lambda x: x.year - 2016)

    trn['month'] = trn.date.apply(lambda x: x.month)
    tst['month'] = tst.date.apply(lambda x: x.month)

    y = trn.target.values
    n_trn = trn.shape[0]
    trn.drop(['target', 'date', 'f_19'], axis=1, inplace=True)
    tst.drop(['id', 'date', 'f_19'], axis=1, inplace=True)

    cat_cols = ['customer_id'] + [x for x in trn.columns if trn[x].dtype == object]
    num_cols = [x for x in trn.columns if trn[x].dtype != object]
    logging.info('categorical: {}, numerical: {}'.format(len(cat_cols), len(num_cols)))

    logging.info('label encoding categorical variables')
    lbe = LabelEncoder(min_obs=10)
    trn.loc[:, cat_cols] = lbe.fit_transform(trn[cat_cols].values)
    tst.loc[:, cat_cols] = lbe.transform(tst[cat_cols].values)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(trn.columns):
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(trn.values.astype(float), y, train_feature_file)
    save_data(tst.values.astype(float), None, test_feature_file)
class Model:
    def __init__(self, datainfo, timeinfo):
        '''Initialize data members.'''
        # Log some info from the datainfo variable
        logging.info("The Budget for this data set is: %d seconds" % datainfo['time_budget'])
        logging.info("Loaded %d time features, %d numerical features, %d categorical features "
                     "and %d multi-valued categorical variables"
                     % (datainfo['loaded_feat_types'][0], datainfo['loaded_feat_types'][1],
                        datainfo['loaded_feat_types'][2], datainfo['loaded_feat_types'][3]))

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]
        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        self.num_train_samples = 0
        self.num_feat = 1
        self.num_labels = 1
        self.is_trained = False
        self.clf = LGBMClassifier(**params)  # Here you may have parameters and hyper-parameters

    def fit(self, F, y, datainfo, timeinfo):
        '''Train the model parameters.

        Args:
            X: Training data matrix of dim num_train_samples * num_feat.
            y: Training label matrix of dim num_train_samples * num_labels.

        Both inputs are numpy arrays.
        If fit is called multiple times on incremental data (train, test1, test2, etc.)
        you should warm-start your training from the pre-trained model. Past data will
        NOT be available for re-training.
        '''
        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]
        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        date_cols = datainfo['loaded_feat_types'][0]
        numeric_cols = datainfo['loaded_feat_types'][1]
        categorical_cols = datainfo['loaded_feat_types'][2]
        multicategorical_cols = datainfo['loaded_feat_types'][3]

        # Get numerical variables and replace NaNs with 0s
        self.X = np.nan_to_num(F['numerical'])
        self.y = y

        # Encode categorical variables and concatenate them with numerical variables
        if categorical_cols > 0:
            self.cat_encs = LabelEncoder()
            X_cat = self.cat_encs.fit_transform(F['CAT']).values
            self.X = np.concatenate((self.X, X_cat), axis=1)
            del X_cat

        self.num_train_samples = self.X.shape[0]
        self.num_feat = self.X.shape[1]
        num_train_samples = y.shape[0]

        logging.info("The whole available data is: ")
        logging.info("Real-FIT: dim(X)= [{:d}, {:d}]".format(self.X.shape[0], self.X.shape[1]))
        logging.info("Real-FIT: dim(y)= [{:d}, {:d}]".format(self.y.shape[0], self.num_labels))

        self.is_trained = True

    def predict(self, F, datainfo, timeinfo):
        '''Provide predictions of labels on (test) data.

        Make sure that the predicted values are in the correct format for the scoring
        metric. For example, binary classification problems often expect predictions in
        the form of a discriminant value (if the area under the ROC curve is the metric)
        rather than predictions of the class labels themselves. The function predict can
        return probabilities or continuous values.
        '''
        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]
        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        date_cols = datainfo['loaded_feat_types'][0]
        numeric_cols = datainfo['loaded_feat_types'][1]
        categorical_cols = datainfo['loaded_feat_types'][2]
        multicategorical_cols = datainfo['loaded_feat_types'][3]

        # Get numerical variables and replace NaNs with 0s
        X = np.nan_to_num(F['numerical'])

        # Encode categorical variables and concatenate them with numerical variables
        if categorical_cols > 0:
            X_cat = self.cat_encs.transform(F['CAT']).values
            X = np.concatenate((X, X_cat), axis=1)
            del X_cat

        # Adversarial validation
        logging.info('AV: starting adversarial validation...')
        np.random.seed(SEED)
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

        n_trn = self.X.shape[0]
        n_tst = X.shape[0]
        n_feature = X.shape[1]

        X_all = np.vstack((self.X, X))
        y_all = np.concatenate((np.zeros(n_trn,), np.ones(n_tst,)))
        logging.info(f'AV: {X_all.shape}, {y_all.shape}')
        logging.info(f'AV: {np.unique(y_all)}')

        av_auc = 1.
        cols = np.arange(n_feature)
        count = 0
        av_auc_threshold = .8
        while av_auc > av_auc_threshold:
            # Train a classifier to distinguish training rows from test rows
            model_av = RandomForestClassifier(min_samples_leaf=20,
                                              min_impurity_decrease=.01,
                                              random_state=SEED)
            model_av.fit(X_all[:, cols], y_all)
            ps_all = model_av.predict_proba(X_all[:, cols])[:, 1]
            av_auc = roc_auc_score(y_all, ps_all)
            logging.info(f'AV #{count}: AUC={av_auc * 100: 3.2f}')

            imp = pd.DataFrame({'feature': cols,
                                'importance': model_av.feature_importances_})
            imp = imp.sort_values('importance', ascending=False)
            logging.info(f'AV #{count}: feature importance\n{imp.head(10)}')

            # Select features: drop the columns that best separate train from test
            cols_to_drop = imp.loc[imp.importance > GINI_THRESHOLD,
                                   'feature'].values[:int(np.ceil(len(cols) * .1))]
            logging.info(f'AV #{count}: columns to drop: {cols_to_drop}')
            if av_auc <= av_auc_threshold or len(cols_to_drop) == 0:
                break

            cols = [x for x in cols if x not in cols_to_drop]
            logging.info(f'AV #{count}: columns to keep: {cols}')
            count += 1

        X = X[:, cols]
        self.X = self.X[:, cols]
        logging.info(f'AV: # of features after selection: {X.shape[1]}')

        # Training
        X_trn, X_val, y_trn, y_val = train_test_split(self.X, self.y,
                                                      test_size=.25,
                                                      random_state=SEED)
        self.clf.fit(X_trn, y_trn, eval_set=(X_val, y_val),
                     early_stopping_rounds=10, verbose=10)

        num_test_samples = X.shape[0]
        if X.ndim > 1:
            num_feat = X.shape[1]
        logging.info("PREDICT: dim(X)= [{:d}, {:d}]".format(num_test_samples, num_feat))
        logging.info("PREDICT: dim(y)= [{:d}, {:d}]".format(num_test_samples, self.num_labels))

        y = self.clf.predict_proba(X)[:, 1]
        y = np.transpose(y)
        return y

    def save(self, path="./"):
        # Pickle files must be written in binary mode
        with open(path + '_model.pickle', "wb") as f:
            pickle.dump(self, f)

    def load(self, path="./"):
        modelfile = path + '_model.pickle'
        if isfile(modelfile):
            with open(modelfile, "rb") as f:
                self = pickle.load(f)
            logging.info("Model reloaded from: " + modelfile)
        return self
class Model:
    def __init__(self, datainfo, timeinfo):
        '''Initialize data members.'''
        # Log some info from the datainfo variable
        logging.info("The Budget for this data set is: %d seconds" % datainfo['time_budget'])
        logging.info("Loaded %d time features, %d numerical features, %d categorical features "
                     "and %d multi-valued categorical variables"
                     % (datainfo['loaded_feat_types'][0], datainfo['loaded_feat_types'][1],
                        datainfo['loaded_feat_types'][2], datainfo['loaded_feat_types'][3]))

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]
        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        self.num_train_samples = 0
        self.num_feat = 1
        self.num_labels = 1
        self.is_trained = False
        self.clf = LGBMClassifier(**params)  # Here you may have parameters and hyper-parameters

    def fit(self, F, y, datainfo, timeinfo):
        '''Train the model parameters.

        Args:
            X: Training data matrix of dim num_train_samples * num_feat.
            y: Training label matrix of dim num_train_samples * num_labels.

        Both inputs are numpy arrays.
        If fit is called multiple times on incremental data (train, test1, test2, etc.)
        you should warm-start your training from the pre-trained model. Past data will
        NOT be available for re-training.
        '''
        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]
        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        date_cols = datainfo['loaded_feat_types'][0]
        numeric_cols = datainfo['loaded_feat_types'][1]
        categorical_cols = datainfo['loaded_feat_types'][2]
        multicategorical_cols = datainfo['loaded_feat_types'][3]

        # Get numerical variables and replace NaNs with 0s
        X = np.nan_to_num(F['numerical'])

        # Encode categorical variables and concatenate them with numerical variables
        if categorical_cols > 0:
            self.cat_encs = LabelEncoder()
            X_cat = self.cat_encs.fit_transform(F['CAT']).values
            X = np.concatenate((X, X_cat), axis=1)
            del X_cat

        self.num_train_samples = X.shape[0]
        self.num_feat = X.shape[1]
        num_train_samples = y.shape[0]

        self.DataX = X
        self.DataY = y
        logging.info("The whole available data is: ")
        logging.info("Real-FIT: dim(X)= [{:d}, {:d}]".format(self.DataX.shape[0], self.DataX.shape[1]))
        logging.info("Real-FIT: dim(y)= [{:d}, {:d}]".format(self.DataY.shape[0], self.num_labels))

        X_trn, X_val, y_trn, y_val = train_test_split(X, y, test_size=.25, random_state=SEED)
        self.clf.fit(X_trn, y_trn, eval_set=(X_val, y_val),
                     early_stopping_rounds=10, verbose=10)

        if self.num_train_samples != num_train_samples:
            logging.info("ARRGH: number of samples in X and y do not match!")
        self.is_trained = True

    def predict(self, F, datainfo, timeinfo):
        '''Provide predictions of labels on (test) data.

        Make sure that the predicted values are in the correct format for the scoring
        metric. For example, binary classification problems often expect predictions in
        the form of a discriminant value (if the area under the ROC curve is the metric)
        rather than predictions of the class labels themselves. The function predict can
        return probabilities or continuous values.
        '''
        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]
        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        date_cols = datainfo['loaded_feat_types'][0]
        numeric_cols = datainfo['loaded_feat_types'][1]
        categorical_cols = datainfo['loaded_feat_types'][2]
        multicategorical_cols = datainfo['loaded_feat_types'][3]

        # Get numerical variables and replace NaNs with 0s
        X = np.nan_to_num(F['numerical'])

        # Encode categorical variables and concatenate them with numerical variables
        if categorical_cols > 0:
            X_cat = self.cat_encs.transform(F['CAT']).values
            X = np.concatenate((X, X_cat), axis=1)
            del X_cat

        num_test_samples = X.shape[0]
        if X.ndim > 1:
            num_feat = X.shape[1]
        logging.info("PREDICT: dim(X)= [{:d}, {:d}]".format(num_test_samples, num_feat))
        if self.num_feat != num_feat:
            logging.info("ARRGH: number of features in X does not match training data!")
        logging.info("PREDICT: dim(y)= [{:d}, {:d}]".format(num_test_samples, self.num_labels))

        y = self.clf.predict_proba(X)[:, 1]
        y = np.transpose(y)
        return y

    def save(self, path="./"):
        # Pickle files must be written in binary mode
        with open(path + '_model.pickle', "wb") as f:
            pickle.dump(self, f)

    def load(self, path="./"):
        modelfile = path + '_model.pickle'
        if isfile(modelfile):
            with open(modelfile, "rb") as f:
                self = pickle.load(f)
            logging.info("Model reloaded from: " + modelfile)
        return self
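# A minimal local driver sketch (not part of the original code): it fabricates the
# F / datainfo / timeinfo structures that the Model class above reads (F['numerical'],
# F['CAT'], datainfo['time_budget'], datainfo['loaded_feat_types'], timeinfo[0:2]) so
# the fit/predict cycle can be exercised outside the AutoML harness. The sizes, column
# counts, and random data below are arbitrary assumptions for illustration only, and
# the module-level `params` and `SEED` globals are assumed to be defined.
def run_local_smoke_test():
    rng = np.random.RandomState(0)
    n, n_num, n_cat = 1000, 5, 3
    F = {
        'numerical': rng.randn(n, n_num),
        'CAT': pd.DataFrame(rng.randint(0, 10, size=(n, n_cat)).astype(str)),
    }
    y = rng.randint(0, 2, size=n)
    datainfo = {'time_budget': 300, 'loaded_feat_types': [0, n_num, n_cat, 0]}
    timeinfo = [time.time(), time.time()]

    model = Model(datainfo, timeinfo)
    model.fit(F, y, datainfo, timeinfo)
    p = model.predict(F, datainfo, timeinfo)
    logging.info('smoke test predictions: %s', p[:5])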
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file, cv_id_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    y = trn.target.values

    logging.info('converting the date column into datetime')
    trn['date'] = pd.to_datetime(trn.date, format='%m%d%Y')
    tst['date'] = pd.to_datetime(tst.date, format='%m%d%Y')

    logging.info('add the year feature')
    trn['year_2017'] = trn.date.dt.year - 2016
    tst['year_2017'] = tst.date.dt.year - 2016

    logging.info('add the month feature')
    trn['month'] = trn.date.dt.month
    tst['month'] = tst.date.dt.month

    logging.info('splitting customer_ids into first 5 and next 3 digits')
    trn['cid_5'] = trn.customer_id // 1e7
    tst['cid_5'] = tst.customer_id // 1e7
    trn['cid_3'] = (trn.customer_id // 1e4) % 1e3
    tst['cid_3'] = (tst.customer_id // 1e4) % 1e3

    logging.info('adding a flag to indicate if a customer_id exists in both training and test data')
    trn['cid_both'] = trn.customer_id.isin(tst.customer_id.tolist()).astype(np.int64)
    tst['cid_both'] = tst.customer_id.isin(trn.customer_id.tolist()).astype(np.int64)

    logging.info('combining cid_5, month, and market')
    trn['cid_5_month_market'] = trn.cid_5 * 1e4 + trn.month * 100 + trn.market.str[1:].astype(int)
    tst['cid_5_month_market'] = tst.cid_5 * 1e4 + tst.month * 100 + tst.market.str[1:].astype(int)

    logging.info('combining cid_3, month, and market')
    trn['cid_3_month_market'] = trn.cid_3 * 1e4 + trn.month * 100 + trn.market.str[1:].astype(int)
    tst['cid_3_month_market'] = tst.cid_3 * 1e4 + tst.month * 100 + tst.market.str[1:].astype(int)

    logging.info('drop unused columns')
    trn.drop(COLS_TO_DROP, axis=1, inplace=True)
    tst.drop(['id'] + COLS_TO_DROP, axis=1, inplace=True)

    cat_cols = ['customer_id', 'cid_5', 'cid_3',
                'cid_5_month_market', 'cid_3_month_market']
    cat_cols += [x for x in trn.columns if trn[x].dtype == object]
    float_cols = [x for x in trn.columns if trn[x].dtype == np.float64]
    int_cols = [x for x in trn.columns
                if x not in ['target'] + cat_cols + float_cols]
    logging.info('categorical: {}, float: {}, int: {}'.format(
        len(cat_cols), len(float_cols), len(int_cols)))

    logging.info('label encoding categorical variables')
    lbe = LabelEncoder(min_obs=10)
    trn.loc[:, cat_cols] = lbe.fit_transform(trn[cat_cols].values)
    tst.loc[:, cat_cols] = lbe.transform(tst[cat_cols].values)

    logging.info('min-max scaling float columns')
    scaler = MinMaxScaler()
    trn.loc[:, float_cols] = scaler.fit_transform(trn[float_cols].values)
    tst.loc[:, float_cols] = scaler.transform(tst[float_cols].values)

    logging.info('adding interactions with f_5')
    interaction_cols = ['f_8', 'f_12', 'f_18', 'f_11', 'f_13', 'f_21', 'f_15', 'f_26']
    feature_cols = cat_cols + float_cols + int_cols
    for col in interaction_cols:
        trn['f_5+{}'.format(col)] = trn.f_5 * 10 + trn[col]
        tst['f_5+{}'.format(col)] = tst.f_5 * 10 + tst[col]
        feature_cols.append('f_5+{}'.format(col))

    for col1, col2 in combinations(interaction_cols, 2):
        logging.info('adding interactions between {} and {}'.format(col1, col2))
        trn['{}+{}'.format(col1, col2)] = trn[col1] + trn[col2]
        tst['{}+{}'.format(col1, col2)] = tst[col1] + tst[col2]
        trn['{}-{}'.format(col1, col2)] = trn[col1] - trn[col2]
        tst['{}-{}'.format(col1, col2)] = tst[col1] - tst[col2]
        # products and ratios are computed in log space via log1p
        trn['{}x{}'.format(col1, col2)] = trn[col1].apply(np.log1p) + trn[col2].apply(np.log1p)
        tst['{}x{}'.format(col1, col2)] = tst[col1].apply(np.log1p) + tst[col2].apply(np.log1p)
        trn['{}/{}'.format(col1, col2)] = trn[col1].apply(np.log1p) - trn[col2].apply(np.log1p)
        tst['{}/{}'.format(col1, col2)] = tst[col1].apply(np.log1p) - tst[col2].apply(np.log1p)
        feature_cols += ['{}+{}'.format(col1, col2), '{}-{}'.format(col1, col2),
                         '{}x{}'.format(col1, col2), '{}/{}'.format(col1, col2)]

    logging.info('generate CV features')
    feature_name, feature_ext = os.path.splitext(train_feature_file)
    feature_name = os.path.splitext(feature_name)[0]

    logging.info('Loading CV Ids')
    cv_id = np.loadtxt(cv_id_file)
    for i in range(1, N_FOLD + 1):
        i_trn = np.where(cv_id != i)[0]
        i_val = np.where(cv_id == i)[0]
        cv_feature_cols = []

        logging.info('mean-target encoding for categorical columns for CV #{}'.format(i))
        cv_trn = trn[cat_cols + [TARGET]].copy()
        cv_tst = tst[cat_cols].copy()
        for col in cat_cols:
            mean_target = cv_trn.iloc[i_trn][[col, 'target']].groupby(col).mean()
            mapping = mean_target.to_dict()['target']
            cv_trn[col] = cv_trn[col].map(mapping)
            cv_tst[col] = cv_tst[col].map(mapping)
        cv_feature_cols += cat_cols

        logging.info('adding min, max, median of mean-target encodings of categorical columns')
        cv_trn['min_target_encoding'] = cv_trn[cat_cols].min(axis=1)
        cv_trn['max_target_encoding'] = cv_trn[cat_cols].max(axis=1)
        cv_trn['median_target_encoding'] = cv_trn[cat_cols].median(axis=1)
        cv_tst['min_target_encoding'] = cv_tst[cat_cols].min(axis=1)
        cv_tst['max_target_encoding'] = cv_tst[cat_cols].max(axis=1)
        cv_tst['median_target_encoding'] = cv_tst[cat_cols].median(axis=1)
        cv_feature_cols += ['min_target_encoding', 'max_target_encoding',
                            'median_target_encoding']

        logging.info('saving features for CV #{}'.format(i))
        save_data(cv_trn[cv_feature_cols].values.astype(float), y,
                  '{}.trn{}{}'.format(feature_name, i, feature_ext))
        save_data(cv_tst[cv_feature_cols].values.astype(float), None,
                  '{}.tst{}{}'.format(feature_name, i, feature_ext))

    logging.info('saving non-CV features')
    save_data(trn[feature_cols].values.astype(float), y, train_feature_file)
    save_data(tst[feature_cols].values.astype(float), None, test_feature_file)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(feature_cols + cv_feature_cols):
            f.write('{}\t{}\tq\n'.format(i, col))
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    logging.info('converting the date column into datetime')
    trn['date'] = trn.date.apply(lambda x: pd.to_datetime(x, format='%m%d%Y'))
    tst['date'] = tst.date.apply(lambda x: pd.to_datetime(x, format='%m%d%Y'))

    logging.info('add year and month features')
    trn['year_2017'] = trn.date.dt.year - 2016
    tst['year_2017'] = tst.date.dt.year - 2016
    trn['month'] = trn.date.dt.month
    tst['month'] = tst.date.dt.month

    y = trn.target.values
    n_trn = trn.shape[0]

    logging.info('splitting customer ids into first 8 digits')
    trn['cid_8'] = trn.customer_id // 10000
    tst['cid_8'] = tst.customer_id // 10000

    logging.info('drop unused columns')
    trn.drop(['target', 'date', 'f_19', 'customer_id'], axis=1, inplace=True)
    tst.drop(['id', 'date', 'f_19', 'customer_id'], axis=1, inplace=True)

    cat_cols = ['cid_8'] + [x for x in trn.columns if trn[x].dtype == object]
    float_cols = [x for x in trn.columns if trn[x].dtype == np.float64]
    int_cols = [x for x in trn.columns
                if (trn[x].dtype == np.int64) & (x != 'cid_8')]
    logging.info('categorical: {}, float: {}, int: {}'.format(
        len(cat_cols), len(float_cols), len(int_cols)))

    logging.info('label encoding categorical variables')
    lbe = LabelEncoder(min_obs=10)
    trn.loc[:, cat_cols] = lbe.fit_transform(trn[cat_cols].values)
    tst.loc[:, cat_cols] = lbe.transform(tst[cat_cols].values)

    logging.info('min-max scaling float columns')
    scaler = MinMaxScaler()
    trn.loc[:, float_cols] = scaler.fit_transform(trn[float_cols].values)
    tst.loc[:, float_cols] = scaler.transform(tst[float_cols].values)

    logging.info('adding interactions')
    trn['f_5+f_21'] = trn.f_5 + trn.f_21
    tst['f_5+f_21'] = tst.f_5 + tst.f_21
    float_cols.append('f_5+f_21')

    interaction_cols = ['f_13', 'f_21', 'f_15', 'f_26']
    for col1, col2 in combinations(interaction_cols, 2):
        logging.info('adding interactions between {} and {}'.format(col1, col2))
        trn['{}+{}'.format(col1, col2)] = trn[col1] + trn[col2]
        tst['{}+{}'.format(col1, col2)] = tst[col1] + tst[col2]
        trn['{}-{}'.format(col1, col2)] = trn[col1] - trn[col2]
        tst['{}-{}'.format(col1, col2)] = tst[col1] - tst[col2]
        # products and ratios are computed in log space via log1p
        trn['{}x{}'.format(col1, col2)] = trn[col1].apply(np.log1p) + trn[col2].apply(np.log1p)
        tst['{}x{}'.format(col1, col2)] = tst[col1].apply(np.log1p) + tst[col2].apply(np.log1p)
        trn['{}/{}'.format(col1, col2)] = trn[col1].apply(np.log1p) - trn[col2].apply(np.log1p)
        tst['{}/{}'.format(col1, col2)] = tst[col1].apply(np.log1p) - tst[col2].apply(np.log1p)
        float_cols += ['{}+{}'.format(col1, col2), '{}-{}'.format(col1, col2),
                       '{}x{}'.format(col1, col2), '{}/{}'.format(col1, col2)]

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(trn.columns):
            if col in cat_cols + int_cols:
                f.write('{}\t{}\tint\n'.format(i, col))
            else:
                f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(trn.values.astype(float), y, train_feature_file)
    save_data(tst.values.astype(float), None, test_feature_file)
class Model:
    def __init__(self, datainfo, timeinfo):
        '''Initialize data members.'''
        # Log some info from the datainfo variable
        logging.info("The Budget for this data set is: %d seconds" % datainfo['time_budget'])
        logging.info("Loaded %d time features, %d numerical features, %d categorical features "
                     "and %d multi-valued categorical variables"
                     % (datainfo['loaded_feat_types'][0], datainfo['loaded_feat_types'][1],
                        datainfo['loaded_feat_types'][2], datainfo['loaded_feat_types'][3]))

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]
        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        self.num_train_samples = 0
        self.num_feat = 1
        self.num_labels = 1
        self.is_trained = False
        self.clf = LGBMClassifier(**params)  # Here you may have parameters and hyper-parameters

    def fit(self, F, y, datainfo, timeinfo):
        '''Train the model parameters.

        Args:
            X: Training data matrix of dim num_train_samples * num_feat.
            y: Training label matrix of dim num_train_samples * num_labels.

        Both inputs are numpy arrays.
        If fit is called multiple times on incremental data (train, test1, test2, etc.)
        you should warm-start your training from the pre-trained model. Past data will
        NOT be available for re-training.
        '''
        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]
        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        date_cols = datainfo['loaded_feat_types'][0]
        numeric_cols = datainfo['loaded_feat_types'][1]
        categorical_cols = datainfo['loaded_feat_types'][2]
        multicategorical_cols = datainfo['loaded_feat_types'][3]

        # Get numerical variables and replace NaNs with 0s
        self.X = np.nan_to_num(F['numerical'])
        self.y = y

        # Encode categorical variables and concatenate them with numerical variables
        if categorical_cols > 0:
            self.cat_encs = LabelEncoder()
            X_cat = self.cat_encs.fit_transform(F['CAT']).values
            self.X = np.concatenate((self.X, X_cat), axis=1)
            del X_cat

        self.num_train_samples = self.X.shape[0]
        self.num_feat = self.X.shape[1]
        num_train_samples = y.shape[0]

        logging.info("The whole available data is: ")
        logging.info("Real-FIT: dim(X)= [{:d}, {:d}]".format(self.X.shape[0], self.X.shape[1]))
        logging.info("Real-FIT: dim(y)= [{:d}, {:d}]".format(self.y.shape[0], self.num_labels))

        self.is_trained = True

    def predict(self, F, datainfo, timeinfo):
        '''Provide predictions of labels on (test) data.

        Make sure that the predicted values are in the correct format for the scoring
        metric. For example, binary classification problems often expect predictions in
        the form of a discriminant value (if the area under the ROC curve is the metric)
        rather than predictions of the class labels themselves. The function predict can
        return probabilities or continuous values.
        '''
        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]
        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        date_cols = datainfo['loaded_feat_types'][0]
        numeric_cols = datainfo['loaded_feat_types'][1]
        categorical_cols = datainfo['loaded_feat_types'][2]
        multicategorical_cols = datainfo['loaded_feat_types'][3]

        # Get numerical variables and replace NaNs with 0s
        X = np.nan_to_num(F['numerical'])

        # Encode categorical variables and concatenate them with numerical variables
        if categorical_cols > 0:
            X_cat = self.cat_encs.transform(F['CAT']).values
            X = np.concatenate((X, X_cat), axis=1)
            del X_cat

        # Adversarial validation
        logging.info('AV: starting adversarial validation...')
        np.random.seed(SEED)
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

        n_trn = self.X.shape[0]
        n_tst = X.shape[0]
        n_feature = X.shape[1]

        X_all = np.vstack((self.X, X))
        y_all = np.concatenate((np.zeros(n_trn, ), np.ones(n_tst, )))
        logging.info(f'AV: {X_all.shape}, {y_all.shape}')

        # Train an adversarial validation classifier
        ps_all = np.zeros_like(y_all, dtype=float)
        for i, (i_trn, i_val) in enumerate(cv.split(X_all, y_all)):
            model_av = LGBMClassifier(**params)
            model_av.fit(X_all[i_trn], y_all[i_trn],
                         eval_set=(X_all[i_val], y_all[i_val]),
                         early_stopping_rounds=10, verbose=10)
            ps_all[i_val] = model_av.predict_proba(X_all[i_val])[:, 1]

        av_score = roc_auc_score(y_all, ps_all)
        logging.info(f'AV: AUC={av_score * 100: 3.2f}')

        # Calibrate and clip propensity scores, then turn them into sample weights
        ps_all = np.clip(calibrate(ps_all, y_all), .1, .9)
        w_all = ps_all / (1 - ps_all)
        # np.percentile expects percentiles in [0, 100]
        logging.info(f'AV: propensity score deciles: {np.percentile(ps_all, np.linspace(0, 100, 11))}')

        # Training
        X_trn, X_val, y_trn, y_val, w_trn, w_val = train_test_split(
            self.X, self.y, w_all[:n_trn], test_size=.25, random_state=SEED)
        self.clf.fit(X_trn, y_trn, eval_set=(X_val, y_val),
                     early_stopping_rounds=10, verbose=10,
                     sample_weight=w_trn)

        num_test_samples = X.shape[0]
        num_feat = X.shape[1]
        logging.info("PREDICT: dim(X)= [{:d}, {:d}]".format(num_test_samples, num_feat))
        logging.info("PREDICT: dim(y)= [{:d}, {:d}]".format(num_test_samples, self.num_labels))

        y = self.clf.predict_proba(X)[:, 1]
        y = np.transpose(y)
        return y

    def save(self, path="./"):
        # Pickle files must be written in binary mode
        with open(path + '_model.pickle', "wb") as f:
            pickle.dump(self, f)

    def load(self, path="./"):
        modelfile = path + '_model.pickle'
        if isfile(modelfile):
            with open(modelfile, "rb") as f:
                self = pickle.load(f)
            logging.info("Model reloaded from: " + modelfile)
        return self
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    y = trn.target.values
    n_trn = trn.shape[0]

    logging.info('adding a flag to indicate if a customer_id exists in both training and test data')
    trn['cid_both'] = trn.customer_id.isin(tst.customer_id.tolist()).astype(np.int64)
    tst['cid_both'] = tst.customer_id.isin(trn.customer_id.tolist()).astype(np.int64)
    num_cols = ['cid_both']

    logging.info('converting the date column into datetime')
    trn['date'] = pd.to_datetime(trn.date, format='%m%d%Y')
    tst['date'] = pd.to_datetime(tst.date, format='%m%d%Y')

    logging.info('add the month feature')
    trn['month'] = trn.date.dt.month
    tst['month'] = tst.date.dt.month

    logging.info('combining cid_5, month, and market')
    trn['cid_5_month_market'] = (trn.customer_id // 1e7) * 1e4 + trn.month * 100 + trn.market.str[1:].astype(int)
    tst['cid_5_month_market'] = (tst.customer_id // 1e7) * 1e4 + tst.month * 100 + tst.market.str[1:].astype(int)

    logging.info('combining cid_3, month, and market')
    trn['cid_3_month_market'] = ((trn.customer_id // 1e4) % 1e3) * 1e4 + trn.month * 100 + trn.market.str[1:].astype(int)
    tst['cid_3_month_market'] = ((tst.customer_id // 1e4) % 1e3) * 1e4 + tst.month * 100 + tst.market.str[1:].astype(int)
    cat_cols = ['cid_5_month_market', 'cid_3_month_market']

    logging.info('label encoding categorical variables')
    lbe = LabelEncoder(min_obs=10)
    trn.loc[:, cat_cols] = lbe.fit_transform(trn[cat_cols].values)
    tst.loc[:, cat_cols] = lbe.transform(tst[cat_cols].values)

    logging.info('mean-target encoding for categorical columns')
    for col in cat_cols:
        colname = 'mt_{}'.format(col)
        mean_target = trn[[col, 'target']].groupby(col).mean()
        mapping = mean_target.to_dict()['target']
        trn[colname] = trn[col].map(mapping)
        tst[colname] = tst[col].map(mapping)
        num_cols.append(colname)

    feature_cols = num_cols + cat_cols
    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(feature_cols):
            if col in num_cols:
                f.write('{}\t{}\tq\n'.format(i, col))
            else:
                f.write('{}\t{}\tint\n'.format(i, col))

    logging.info('saving features')
    save_data(trn[feature_cols].values.astype(float), y, train_feature_file)
    save_data(tst[feature_cols].values.astype(float), None, test_feature_file)
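# A standalone toy sketch (illustration only, not part of the pipeline above) of the
# mean-target encoding used in generate_feature(): each category is replaced by the
# mean of the target among training rows with that category, and categories unseen
# in training become NaN in the test set via map(). The tiny DataFrames and column
# names here are made up for the example.
def mean_target_encoding_demo():
    trn_demo = pd.DataFrame({'market': ['a', 'a', 'b', 'b', 'c'],
                             'target': [1, 0, 1, 1, 0]})
    tst_demo = pd.DataFrame({'market': ['a', 'b', 'd']})

    mapping = trn_demo.groupby('market')['target'].mean().to_dict()
    trn_demo['mt_market'] = trn_demo['market'].map(mapping)
    tst_demo['mt_market'] = tst_demo['market'].map(mapping)  # 'd' -> NaN (unseen)
    return trn_demo, tst_demo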
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col=ID_COL)
    tst = pd.read_csv(test_file, index_col=ID_COL)

    logging.info('label encoding categorical variables')
    y = trn.loc[:, TARGET_COL]
    n_trn = trn.shape[0]
    trn = trn.drop(TARGET_COL, axis=1)
    df = pd.concat([trn, tst], axis=0)

    # build features
    features_bin = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4']
    features_cat = ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4']
    features_hex = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']
    features_ord = ['ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']
    features_cyc = ['day', 'month']

    logging.info("bin 0 to 4")
    # convert bins 0, 1, 2 to object so that
    # get_dummies recognizes them and creates missing indicators
    bin_012 = ['bin_0', 'bin_1', 'bin_2']
    df[bin_012] = df[bin_012].astype(object)
    dummies = pd.get_dummies(df[features_bin], dummy_na=True)
    df = df.drop(features_bin, axis=1)
    df = pd.concat([df, dummies], axis=1)

    logging.info("nom 0 to nom 4")
    le = LabelEncoder(min_obs=10)
    df.loc[:, features_cat] = le.fit_transform(df.loc[:, features_cat])

    logging.info("nom 5 to 9")
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
    te = TargetEncoder(cv=cv)
    te.fit(trn.loc[:, features_hex], y)
    df.loc[:, features_hex] = te.transform(df.loc[:, features_hex])

    logging.info("ord 0 to 5")
    map_ord_0 = None  # already a numeric column
    map_ord_1 = {'Novice': 1, 'Contributor': 2, 'Expert': 3,
                 'Master': 4, 'Grandmaster': 5}
    map_ord_2 = {'Freezing': 1, 'Cold': 2, 'Warm': 3,
                 'Hot': 4, 'Boiling Hot': 5, 'Lava Hot': 6}
    map_ord_3 = dict(zip(df['ord_3'].value_counts().sort_index().keys(),
                         range(1, len(df['ord_3'].value_counts()) + 1)))
    map_ord_4 = dict(zip(df['ord_4'].value_counts().sort_index().keys(),
                         range(1, len(df['ord_4'].value_counts()) + 1)))

    temp_ord_5 = pd.DataFrame(df['ord_5'].value_counts().sort_index().keys(),
                              columns=['ord_5'])
    temp_ord_5['First'] = temp_ord_5['ord_5'].astype(str).str[0].str.upper()
    temp_ord_5['Second'] = temp_ord_5['ord_5'].astype(str).str[1].str.upper()
    temp_ord_5['First'] = temp_ord_5['First'].replace(map_ord_4)
    temp_ord_5['Second'] = temp_ord_5['Second'].replace(map_ord_4)
    temp_ord_5['Add'] = temp_ord_5['First'] + temp_ord_5['Second']
    temp_ord_5['Mul'] = temp_ord_5['First'] * temp_ord_5['Second']
    map_ord_5 = dict(zip(temp_ord_5['ord_5'], temp_ord_5['Mul']))

    maps = [map_ord_0, map_ord_1, map_ord_2, map_ord_3, map_ord_4, map_ord_5]
    for i, m in zip(range(0, 6), maps):
        if i != 0:
            df[f'ord_{i}'] = df[f'ord_{i}'].map(m)
        df[f'ord_{i}'] = df[f'ord_{i}'].fillna(df[f'ord_{i}'].median())

    logging.info("cyclical features")
    df[features_cyc] = df[features_cyc].astype(object)
    dummies_cyc = pd.get_dummies(df[features_cyc], dummy_na=True)
    df = df.drop(features_cyc, axis=1)
    df = pd.concat([df, dummies_cyc], axis=1)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(df.columns):
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(df.values[:n_trn, ], y.values, train_feature_file)
    save_data(df.values[n_trn:, ], None, test_feature_file)