def train(dataset, features, reg_metric, algo='lightgbm', n_folds=5, config=None):
    """Fit one model per group-wise fold and gather the out-of-fold results.

    Args:
        dataset: Table holding the ``features`` columns plus the
            ``accuracy_group`` target and the ``installation_id`` column
            used as the fold grouping key.
        features: Names of the columns used as model inputs.
        reg_metric: Callable invoked as ``reg_metric(y_true, y_pred)`` for the
            per-fold score; its attribute named after ``algo`` is handed to
            ``model.fit`` as the training metric.
        algo: Key used to look up the model class, the default config and
            the algorithm-specific metric.
        n_folds: Number of ``GroupKFold`` splits.
        config: Model configuration; when falsy, the default configuration
            for ``algo`` is fetched for every fold.

    Returns:
        A ``Result`` named tuple with ``models`` (one per fold), ``cv``
        (ordered per-fold scores), ``oof`` (out-of-fold predictions) and
        ``fi`` (feature importances averaged over ``n_folds``).
    """
    fitted = []
    splitter = GroupKFold(n_splits=n_folds)
    group_ids = dataset['installation_id']
    inputs = dataset[features].copy()
    targets = dataset['accuracy_group']
    oof = np.zeros(inputs.shape[0], dtype=np.float32)
    scores = OrderedDict()
    model_cls = get_model_class(algo)
    fit_metric = getattr(reg_metric, algo)
    importance = np.zeros(len(features), dtype=np.float32)

    splits = splitter.split(inputs, targets, group_ids)
    for fold_no, (trn_idx, val_idx) in enumerate(splits, start=1):
        U.log(f'Running k-fold {fold_no} of {n_folds}')
        train_part = (inputs.iloc[trn_idx], targets.iloc[trn_idx])
        x_val, y_val = inputs.iloc[val_idx], targets.iloc[val_idx]

        model = model_cls(config or get_default_config(algo))
        model.fit(train_data=train_part,
                  valid_data=(x_val, y_val),
                  metric=fit_metric)

        # Score the predictions as stored in the float32 out-of-fold buffer.
        oof[val_idx] = model.predict(x_val)
        scores[f'cv_cappa_{fold_no}'] = np.mean(reg_metric(y_val, oof[val_idx]))

        fitted.append(model)
        importance += model.feature_importances.values

    importance /= n_folds
    importance = pd.Series(OrderedDict(zip(features, importance)))
    return U.named_tuple('Result', models=fitted, cv=scores, oof=oof, fi=importance)
def attempt_outcomes(session, meta):
    """Count the successful and unsuccessful attempts recorded in a session.

    The attempt rows are those whose ``event_code`` equals the win code
    registered in ``meta.win_codes`` for the session's first title (4100 when
    the title has no entry). A row counts as positive when its ``event_data``
    contains the substring ``true`` and as negative when it contains
    ``false``.

    Returns:
        A ``Trial`` named tuple with ``pos``, ``neg`` and ``total``
        (``pos + neg``).
    """
    first_title = session.title.iloc[0]
    event_code = meta.win_codes.get(first_title, 4100)
    payloads = session.query(f'event_code == {event_code}').event_data
    n_pos = payloads.str.contains('true').sum()
    n_neg = payloads.str.contains('false').sum()
    return U.named_tuple('Trial', pos=n_pos, neg=n_neg, total=(n_pos + n_neg))
def train(self, dataset, features, fold, target='accuracy_group',
          grouping='installation_id', config=None):
    """Train one model per cross-validation fold and collect the results.

    Args:
        dataset: Table holding the feature columns, the target column and,
            when ``grouping`` is given, the grouping column.
        features: Names of the columns used as model inputs; must not
            contain ``target``.
        fold: Splitter exposing ``get_n_splits()`` and
            ``split(X, y, groups)``.
        target: Name of the column to predict.
        grouping: Name of the column passed to ``fold.split`` as the groups,
            or ``None`` to split without groups.
        config: Model configuration; when falsy, the default configuration
            for ``self.algo`` is fetched for every fold.

    Returns:
        A ``Result`` named tuple with ``models`` (one per fold), ``cv``
        (ordered ``cv_<metric>_<fold>`` scores), ``oof`` (out-of-fold
        predictions) and ``fi`` (feature importances summed over the models
        that provide them and divided by the number of folds).

    Raises:
        AssertionError: If ``target`` is listed in ``features`` or
            ``grouping`` is neither ``None`` nor present in ``dataset``.
    """
    assert target not in features
    assert grouping is None or grouping in dataset

    # ``grouping=None`` is explicitly allowed by the assertion above, so it
    # must not be used as a column key; hand ``None`` to the splitter instead.
    groups = None if grouping is None else dataset[grouping]
    X = dataset[features]
    y = dataset[target]

    model_cls = get_model_class(self.algo)
    n_folds = fold.get_n_splits()
    models = []
    feat_imp = np.zeros(len(features), dtype=np.float32)
    oof = np.zeros(X.shape[0], dtype=np.float32)
    cv = OrderedDict()

    for i, (trn_idx, val_idx) in enumerate(fold.split(X, y, groups), 1):
        U.log(f'Running k-fold {i} of {n_folds}')
        x_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
        x_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = model_cls(config or get_default_config(self.algo))
        model.fit(train_data=(x_trn, y_trn),
                  valid_data=(x_val, y_val),
                  metric=self.eval_metric)

        oof[val_idx] = model.predict(x_val)
        for name, metric in self.cv_metrics.items():
            cv[f'cv_{name}_{i}'] = metric(y_val, oof[val_idx])

        models.append(model)
        # Models without importances contribute nothing to the average.
        if model.has_feature_importance:
            feat_imp += model.feature_importances.values

    if cv:
        U.log('Fold evaluation results:')
        U.log(U.dict_format(cv))

    feat_imp /= n_folds
    feat_imp = pd.Series(OrderedDict(zip(features, feat_imp)))
    return U.named_tuple('Result', models=models, cv=cv, oof=oof, fi=feat_imp)
def compute_meta_data(dataset, *datasets):
    """Collect unique column values and reference data across datasets.

    For each of a fixed list of columns, ``U.unique`` is applied over all
    given datasets. The result also carries the unique titles of the rows
    whose ``type`` is ``"Assessment"``, a title -> win event code mapping
    (4100 for every title, 4110 for ``'Bird Measurer (Assessment)'``) and the
    minimum ``timestamp`` of the first dataset.

    Args:
        dataset: Primary dataset; the only one used for ``ref_ts``.
        *datasets: Additional datasets included in the unique-value scans.

    Returns:
        A ``Meta`` named tuple with ``win_codes``, ``ref_ts`` and one field
        per scanned column plus ``assessment_titles``.
    """
    everything = [dataset, *datasets]
    scanned_columns = (
        'title_event_code',
        'title',
        'event_code',
        'event_id',
        'world',
        'type',
        'title_world',
        'title_type',
        'world_type',
    )

    uniq = OrderedDict()
    for column in scanned_columns:
        uniq[column] = U.unique(everything, column=column)

    assessments_only = [ds.query('type == "Assessment"') for ds in everything]
    uniq['assessment_titles'] = U.unique(assessments_only, column='title')

    # Every title defaults to 4100; one assessment uses a different code.
    win_codes = dict.fromkeys(uniq['title'], 4100)
    win_codes['Bird Measurer (Assessment)'] = 4110

    return U.named_tuple(
        'Meta',
        win_codes=win_codes,
        ref_ts=dataset['timestamp'].min(),
        **uniq,
    )
def session_info(session, meta, test):
    """Computes information about user's session.

    Args:
        session: Non-empty table of one session's events; the first row
            supplies the type, title, installation and session identifiers,
            and the first and last ``timestamp`` values give the duration.
        meta: Metadata forwarded to ``attempt_outcomes`` for assessment
            sessions.
        test: Truthy to include every assessment session regardless of its
            length or recorded attempts.

    Returns:
        An ``Info`` named tuple. ``outcomes`` is ``None`` for non-assessment
        sessions. ``should_include`` is truthy only for assessments that are
        either requested via ``test`` or have more than one row and at least
        one counted attempt. ``duration_seconds`` is the whole number of
        seconds between the first and the last timestamp.

    Raises:
        AssertionError: If ``session`` is empty.
    """
    assert not session.empty, 'Session cannot be empty!'

    session_type = session['type'].iloc[0]
    assessment = session_type == 'Assessment'
    outcomes = attempt_outcomes(session, meta) if assessment else None

    # ``outcomes`` is only dereferenced when ``assessment`` is true, so the
    # short-circuit keeps non-assessment sessions away from ``None.total``.
    should_include = assessment and (
        test or (len(session) > 1 and outcomes.total > 0))

    duration = session.timestamp.iloc[-1] - session.timestamp.iloc[0]
    # ``.seconds`` alone is only the sub-day component of a timedelta, so a
    # session spanning more than 24 hours would wrap around; fold the days in.
    duration_seconds = duration.days * 86400 + duration.seconds

    return U.named_tuple(
        name='Info',
        installation_id=session['installation_id'].iloc[0],
        game_session=session['game_session'].iloc[0],
        session_title=session['title'].iloc[0],
        session_type=session_type,
        is_assessment=assessment,
        should_include=should_include,
        outcomes=outcomes,
        duration_seconds=duration_seconds)
def load_meta(key):
    """Load the mapping stored under ``key`` and wrap it in a ``Meta`` named tuple."""
    fields = load(key)
    return U.named_tuple('Meta', **fields)