示例#1
0
def train(dataset,
          features,
          reg_metric,
          algo='lightgbm',
          n_folds=5,
          config=None):
    """Fit one model per group-wise fold and gather out-of-fold results.

    Rows are split with ``GroupKFold`` keyed on the ``installation_id``
    column; the prediction target is the ``accuracy_group`` column.

    Args:
        dataset: Table holding the feature columns together with the
            ``installation_id`` and ``accuracy_group`` columns.
        features: Names of the columns used as model inputs.
        reg_metric: Callable invoked as ``reg_metric(y_true, y_pred)`` to
            score each fold; its attribute named after ``algo`` is handed
            to ``model.fit`` as the training metric.
        algo: Key passed to ``get_model_class`` and ``get_default_config``.
        n_folds: Number of folds to run.
        config: Model configuration; when falsy, the default configuration
            for ``algo`` is requested anew for every fold.

    Returns:
        A ``Result`` named tuple with ``models`` (one fitted model per
        fold), ``cv`` (ordered mapping ``cv_cappa_<fold>`` -> mean score),
        ``oof`` (float32 out-of-fold predictions) and ``fi`` (a Series of
        feature importances summed over folds and divided by ``n_folds``).
    """
    splitter = GroupKFold(n_splits=n_folds)
    group_ids = dataset['installation_id']
    inputs = dataset[features].copy()
    target = dataset['accuracy_group']
    build_model = get_model_class(algo)
    fit_metric = getattr(reg_metric, algo)

    fitted = []
    scores = OrderedDict()
    oof_preds = np.zeros(inputs.shape[0], dtype=np.float32)
    importance_sum = np.zeros(len(features), dtype=np.float32)

    splits = splitter.split(inputs, target, group_ids)
    for fold_no, (train_rows, valid_rows) in enumerate(splits, start=1):
        U.log(f'Running k-fold {fold_no} of {n_folds}')

        train_pair = (inputs.iloc[train_rows], target.iloc[train_rows])
        valid_inputs = inputs.iloc[valid_rows]
        valid_target = target.iloc[valid_rows]

        params = config or get_default_config(algo)
        estimator = build_model(params)
        estimator.fit(train_data=train_pair,
                      valid_data=(valid_inputs, valid_target),
                      metric=fit_metric)

        # Score the predictions as stored (i.e. after the float32 cast).
        oof_preds[valid_rows] = estimator.predict(valid_inputs)
        fold_score = reg_metric(valid_target, oof_preds[valid_rows])
        scores[f'cv_cappa_{fold_no}'] = np.mean(fold_score)

        fitted.append(estimator)
        importance_sum += estimator.feature_importances.values

    importance_sum /= n_folds
    importance_by_feature = OrderedDict(zip(features, importance_sum))
    return U.named_tuple('Result',
                         models=fitted,
                         cv=scores,
                         oof=oof_preds,
                         fi=pd.Series(importance_by_feature))
示例#2
0
def attempt_outcomes(session, meta):
    """Count the successful and failed attempts recorded in a session.

    The attempt rows are those whose ``event_code`` equals the win code
    registered in ``meta.win_codes`` for the session's first title
    (4100 when the title has no entry). An attempt counts as positive when
    its ``event_data`` contains ``'true'`` and as negative when it
    contains ``'false'``.

    Returns:
        A ``Trial`` named tuple with fields ``pos``, ``neg`` and ``total``.
    """
    title = session.title.iloc[0]
    win_code = meta.win_codes.get(title, 4100)
    attempts = session.query(f'event_code == {win_code}')
    payload = attempts.event_data
    n_correct = payload.str.contains('true').sum()
    n_wrong = payload.str.contains('false').sum()
    return U.named_tuple('Trial',
                         pos=n_correct,
                         neg=n_wrong,
                         total=(n_correct + n_wrong))
示例#3
0
    def train(self,
              dataset,
              features,
              fold,
              target='accuracy_group',
              grouping='installation_id',
              config=None):
        """Cross-validate ``self.algo`` models over the splits from ``fold``.

        Args:
            dataset: Table holding the feature columns, the ``target``
                column and, when given, the ``grouping`` column.
            features: Names of the columns used as model inputs; must not
                include ``target``.
            fold: Splitter exposing ``get_n_splits()`` and
                ``split(X, y, groups)``.
            target: Name of the column to predict.
            grouping: Name of the column whose values are passed to the
                splitter as ``groups``, or ``None`` to pass no groups.
            config: Model configuration; when falsy, the default
                configuration for ``self.algo`` is requested per fold.

        Returns:
            A ``Result`` named tuple with ``models`` (one fitted model per
            fold), ``cv`` (ordered mapping ``cv_<metric>_<fold>`` -> score
            for every entry of ``self.cv_metrics``), ``oof`` (float32
            out-of-fold predictions) and ``fi`` (a Series of feature
            importances summed over folds and divided by the fold count;
            models without feature importance contribute zeros).

        Raises:
            AssertionError: If ``target`` is listed in ``features`` or
                ``grouping`` is neither ``None`` nor a dataset column.
        """
        assert target not in features
        assert grouping is None or grouping in dataset

        # ``grouping=None`` is explicitly allowed by the check above, so the
        # dataset must not be indexed with it; the splitter simply receives
        # ``groups=None`` in that case.
        groups = None if grouping is None else dataset[grouping]
        X = dataset[features]
        y = dataset[target]
        model_cls = get_model_class(self.algo)
        n_folds = fold.get_n_splits()

        models = []
        feat_imp = np.zeros(len(features), dtype=np.float32)
        oof = np.zeros(X.shape[0], dtype=np.float32)
        cv = OrderedDict()

        for i, (trn_idx, val_idx) in enumerate(fold.split(X, y, groups), 1):
            U.log(f'Running k-fold {i} of {n_folds}')
            x_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
            x_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
            model = model_cls(config or get_default_config(self.algo))
            model.fit(train_data=(x_trn, y_trn),
                      valid_data=(x_val, y_val),
                      metric=self.eval_metric)
            oof[val_idx] = model.predict(x_val)
            # Every configured metric is scored on this fold's predictions.
            for name, metric in self.cv_metrics.items():
                cv[f'cv_{name}_{i}'] = metric(y_val, oof[val_idx])
            models.append(model)
            if model.has_feature_importance:
                feat_imp += model.feature_importances.values

        if cv:
            U.log('Fold evaluation results:')
            U.log(U.dict_format(cv))

        feat_imp /= n_folds
        feat_imp = pd.Series(OrderedDict(zip(features, feat_imp)))
        return U.named_tuple('Result',
                             models=models,
                             cv=cv,
                             oof=oof,
                             fi=feat_imp)
示例#4
0
文件: meta.py 项目: devforfu/bowl2019
def compute_meta_data(dataset, *datasets):
    """Collect lookup metadata shared by one or more datasets.

    For each of a fixed list of columns, the result of
    ``U.unique(all_datasets, column=...)`` is stored under the column's
    name. ``assessment_titles`` is the same lookup on ``title``, restricted
    to rows whose ``type`` is ``"Assessment"``.

    ``win_codes`` maps every title to 4100, except
    ``'Bird Measurer (Assessment)'`` which maps to 4110. ``ref_ts`` is the
    minimum ``timestamp`` of the first dataset only.

    Returns:
        A ``Meta`` named tuple with ``win_codes``, ``ref_ts`` and one field
        per collected column.
    """
    frames = [dataset, *datasets]
    plain_columns = (
        'title_event_code',
        'title',
        'event_code',
        'event_id',
        'world',
        'type',
        'title_world',
        'title_type',
        'world_type',
    )

    uniq = OrderedDict()
    for column in plain_columns:
        uniq[column] = U.unique(frames, column=column)

    assessments_only = [frame.query('type == "Assessment"') for frame in frames]
    uniq['assessment_titles'] = U.unique(assessments_only, column='title')

    win_codes = dict.fromkeys(uniq['title'], 4100)
    win_codes['Bird Measurer (Assessment)'] = 4110

    earliest = dataset['timestamp'].min()
    fields = dict(win_codes=win_codes, ref_ts=earliest)
    fields.update(uniq)
    return U.named_tuple('Meta', **fields)
示例#5
0
def session_info(session, meta, test):
    """Summarise a single user session.

    Args:
        session: Non-empty table of the session's events; the first row
            supplies ``installation_id``, ``game_session``, ``title`` and
            ``type``, and the first/last ``timestamp`` give the duration.
        meta: Metadata forwarded to ``attempt_outcomes`` for assessment
            sessions.
        test: Truthy when the session comes from the test set, in which
            case every assessment session is included.

    Returns:
        An ``Info`` named tuple. ``outcomes`` is ``None`` for
        non-assessment sessions. ``should_include`` is truthy for
        assessment sessions that are either from the test set or have
        more than one row and at least one counted attempt.
        ``duration_seconds`` is the whole number of seconds between the
        first and last timestamp.

    Raises:
        AssertionError: If ``session`` is empty.
    """
    assert not session.empty, 'Session cannot be empty!'
    session_type = session['type'].iloc[0]
    assessment = session_type == 'Assessment'
    outcomes = attempt_outcomes(session, meta) if assessment else None
    should_include = (
        (assessment and test) or
        (assessment and (len(session) > 1) and outcomes.total > 0))
    duration = session.timestamp.iloc[-1] - session.timestamp.iloc[0]
    return U.named_tuple(
        name='Info',
        installation_id=session['installation_id'].iloc[0],
        game_session=session['game_session'].iloc[0],
        session_title=session['title'].iloc[0],
        session_type=session_type,
        is_assessment=assessment,
        should_include=should_include,
        outcomes=outcomes,
        # ``.seconds`` is only the sub-day component of a timedelta and
        # silently drops whole days; ``total_seconds()`` covers the full
        # span and is identical for durations shorter than 24 hours.
        duration_seconds=int(duration.total_seconds()))
示例#6
0
def load_meta(key):
    """Return the mapping produced by ``load(key)`` as a ``Meta`` named tuple."""
    fields = load(key)
    return U.named_tuple('Meta', **fields)