Пример #1
0
def main():
    """Fit or restore the SC02 CatBoost classifier and export SC03 predictions.

    Steps:
      1. Load the SC02 dataset through ``utils.load_10x``.
      2. Reuse ``sc02-best-model.cbm`` from ``CUR_DIR`` when it exists;
         otherwise fit a fresh classifier and save it to that path.
      3. Run ``predict`` on the SC03 dataset and write ``sc03-preds.csv``.
      4. Write the strictly positive feature importances, sorted in
         descending order, to ``sc02-model-features.csv``.
    """
    X, y = utils.load_10x(path('SC02'), 'SC02')

    hyperparams = dict(
        l2_leaf_reg=3,
        learning_rate=0.445,
        depth=10,
        iterations=200,
        random_seed=42,
        logging_level='Silent',
        loss_function='MultiClass',
        eval_metric='TotalF1',
        thread_count=20,
    )
    best_model = catboost.CatBoostClassifier(**hyperparams)

    checkpoint = os.path.join(CUR_DIR, 'sc02-best-model.cbm')
    if not os.path.exists(checkpoint):
        # Third positional argument of ``fit`` is CatBoost's ``cat_features``:
        # only the last column of X is declared categorical.
        last_column = X.shape[1] - 1
        best_model.fit(X, y, [last_column])
        best_model.save_model(checkpoint)
    else:
        best_model.load_model(checkpoint)

    # The SC03 labels are not needed for prediction, so they are discarded.
    target_cells, _ = utils.load_10x(path('SC03'), 'SC03')
    predicted = predict(best_model, X.columns, target_cells)
    predicted.to_csv(os.path.join(CUR_DIR, 'sc03-preds.csv'))

    # NOTE(review): ``_feature_importance`` is a private CatBoost attribute;
    # confirm it is populated when the model was loaded instead of fitted.
    scores = pd.DataFrame(best_model._feature_importance, X.columns)
    informative = scores[scores[0] > 0]
    ranked = informative.sort_values(0, ascending=False)
    ranked.to_csv(os.path.join(CUR_DIR, 'sc02-model-features.csv'))
Пример #2
0
def process():
    """Tune and train a CatBoost regressor on the SC01 prediction scores.

    The regression target of every row in X is the row-wise maximum of the
    values stored in ``sc01-best-preds.csv`` (presumably per-class
    prediction scores -- confirm against the script that writes that file).

    Side effects:
      * prints the best search score and the best parameters;
      * saves the trained model to ``sc01-pred-score.cbm``;
      * writes the strictly positive feature importances, sorted in
        descending order, to ``sc01-pred-score-features.csv``.
    """
    X, _ = utils.load_10x(os.path.join(ROOT, 'SC01'), 'SC01v2')
    # 'Batch' is removed in place, so it is not used as a feature.
    X.drop(columns=['Batch'], inplace=True)

    preds_file = os.path.join(CUR_DIR, 'sc01-best-preds.csv')
    predictions = pd.read_csv(preds_file, index_col=0)
    # Remove every '-1' from the row labels so that they can be looked up
    # with X.index (presumably a barcode suffix -- TODO confirm).
    predictions.index = predictions.index.str.replace('-1', '')
    aligned = predictions.loc[X.index, :]
    y = aligned.max(axis=1)

    params_space = dict(
        l2_leaf_reg=[1, 3, 5, 7, 9],
        learning_rate=np.linspace(0.001, 0.8, num=10),
        depth=[6, 8, 10],
    )

    score, best_params = find_best_params(X, y, params_space)
    for template, value in (
        ('Best params score: {}', score),
        ('Best params: {}', repr(best_params)),
    ):
        print(template.format(value))

    regressor_kwargs = dict(
        iterations=500,
        learning_rate=best_params['learning_rate'],
        depth=best_params['depth'],
        l2_leaf_reg=best_params['l2_leaf_reg'],
        random_seed=42,
        logging_level='Silent',
        thread_count=20,
        loss_function='MAE',
    )
    model = catboost.CatBoostRegressor(**regressor_kwargs)
    model.fit(X, y)
    model.save_model(os.path.join(CUR_DIR, 'sc01-pred-score.cbm'))

    # NOTE(review): ``_feature_importance`` is a private CatBoost attribute.
    scores = pd.DataFrame(model._feature_importance, X.columns)
    informative = scores[scores[0] > 0]
    ranked = informative.sort_values(0, ascending=False)
    ranked.to_csv(os.path.join(CUR_DIR, 'sc01-pred-score-features.csv'))
Пример #3
0
def main():
    """Apply the saved SC02v2 classifier to SC01 and write the predictions.

    The model is always restored from ``sc02v2-best-model.cbm`` in
    ``CUR_DIR``; this function never trains. The result of ``predict`` is
    written to ``sc01-preds.csv``.
    """
    # SC02v2 is loaded only for its column names, which are forwarded to
    # ``predict``; its labels are discarded.
    X, _ = utils.load_10x(path('SC02'), 'SC02v2')

    hyperparams = dict(
        l2_leaf_reg=7,
        learning_rate=0.622,
        depth=10,
        iterations=200,
        random_seed=42,
        logging_level='Silent',
        loss_function='MultiClass',
        eval_metric='TotalF1',
        thread_count=20,
    )
    best_model = catboost.CatBoostClassifier(**hyperparams)
    checkpoint = os.path.join(CUR_DIR, 'sc02v2-best-model.cbm')
    best_model.load_model(checkpoint)

    target_cells, _ = utils.load_10x(path('SC01'), 'SC01v2')
    predicted = predict(best_model, X.columns, target_cells)
    predicted.to_csv(os.path.join(CUR_DIR, 'sc01-preds.csv'))
Пример #4
0
def main():
    """Build the SC03 classifier from the best grid-search run and apply it.

    Steps:
      1. Concatenate ``search-1-of-5.csv`` .. ``search-5-of-5.csv`` from
         ``CUR_DIR`` and take the parameters of the highest-scoring row.
      2. Reuse ``sc03-model.cbm`` when it exists; otherwise fit on SC03,
         save the model and write the strictly positive feature
         importances to ``sc03-features.csv`` (only done on a fresh fit).
      3. Write predictions for SC01v2 and SC02v2 to ``sc01-preds.csv`` and
         ``sc02-preds.csv``.
    """
    runs = pd.concat(
        [
            pd.read_csv(
                os.path.join(CUR_DIR, './search-{}-of-5.csv'.format(i)),
                index_col=0)
            for i in range(1, 6)
        ],
        ignore_index=True)
    best_row = runs['score'].idxmax(axis=0)
    # NOTE(review): ``eval`` executes whatever text is stored in the CSV; this
    # is only acceptable while those files come from our own search runs.
    best_params = eval(runs.loc[best_row, 'params'])

    X, y = utils.load_10x(path('SC03'), 'SC03')
    hyperparams = dict(
        l2_leaf_reg=best_params['l2_leaf_reg'],
        learning_rate=best_params['learning_rate'],
        depth=best_params['depth'],
        iterations=200,
        random_seed=42,
        logging_level='Silent',
        loss_function='MultiClass',
        eval_metric='TotalF1',
        thread_count=20,
    )
    best_model = catboost.CatBoostClassifier(**hyperparams)

    checkpoint = os.path.join(CUR_DIR, 'sc03-model.cbm')
    if not os.path.exists(checkpoint):
        # Third positional argument of ``fit`` is CatBoost's ``cat_features``:
        # only the last column of X is declared categorical.
        best_model.fit(X, y, [X.shape[1] - 1])
        best_model.save_model(checkpoint)
        scores = pd.DataFrame(best_model._feature_importance, X.columns)
        informative = scores[scores[0] > 0]
        ranked = informative.sort_values(0, ascending=False)
        ranked.to_csv(os.path.join(CUR_DIR, 'sc03-features.csv'))
    else:
        best_model.load_model(checkpoint)

    # Same export for both target datasets, in the original order.
    targets = (
        ('SC01', 'SC01v2', 'sc01-preds.csv'),
        ('SC02', 'SC02v2', 'sc02-preds.csv'),
    )
    for folder, dataset, out_name in targets:
        cells, _ = utils.load_10x(path(folder), dataset)
        predicted = predict(best_model, X.columns, cells)
        predicted.to_csv(os.path.join(CUR_DIR, out_name))
Пример #5
0
def main(splits, current_split):
    """Run a CatBoost grid search on the training part of SC02v2.

    A stratified 90/10 split (``random_state=42``) is taken first; only the
    training part is searched. The best score and the best parameters are
    printed.

    Args:
        splits: forwarded unchanged to ``catboost_GridSearchCV`` as
            ``splits`` (presumably the number of shards the grid is divided
            into -- confirm against that helper).
        current_split: forwarded unchanged as ``current_split`` (presumably
            the shard handled by this invocation).
    """
    X, y = utils.load_10x(path('SC02'), 'SC02v2')
    # The held-out 10% is produced by the split but never used here.
    X_train, _X_holdout, y_train, _y_holdout = train_test_split(
        X, y, test_size=0.1, stratify=y, random_state=42,
    )

    params_space = dict(
        l2_leaf_reg=[1, 3, 5, 7, 9],
        learning_rate=np.linspace(0.001, 0.8, num=10),
        depth=[6, 8, 10],
    )
    score, best_params = catboost_GridSearchCV(
        X_train,
        y_train,
        params_space,
        cv=5,
        splits=splits,
        current_split=current_split,
    )
    for outcome in (score, best_params):
        print(outcome)
Пример #6
0
def main(splits, current_split):
    """Run a CatBoost grid search on SC03 and save its log as CSV.

    The ``record`` list handed to ``catboost_GridSearchCV`` is written to
    ``search-<current_split>-of-<splits>.csv`` in ``CUR_DIR`` with the
    columns ``params``, ``score`` and ``time`` (presumably the helper
    appends one entry per evaluated combination -- confirm against it).

    Args:
        splits: forwarded to ``catboost_GridSearchCV`` as ``splits`` and
            used as the second number in the output file name.
        current_split: forwarded as ``current_split`` and used as the first
            number in the output file name.
    """
    X, y = utils.load_10x(path('SC03'), 'SC03')
    params_space = dict(
        l2_leaf_reg=[1, 3, 5, 7, 9],
        learning_rate=np.linspace(0.001, 0.8, num=10),
        depth=[6, 8, 10],
    )

    record = []
    # The returned best score / parameters are not used here; only the
    # contents of ``record`` are persisted.
    _score, _best_params = catboost_GridSearchCV(
        X, y, params_space, record,
        cv=5, splits=splits, current_split=current_split,
    )

    log_name = 'search-{}-of-{}.csv'.format(current_split, splits)
    log_table = pd.DataFrame(record, columns=['params', 'score', 'time'])
    log_table.to_csv(os.path.join(CUR_DIR, log_name))
Пример #7
0
def get_query():
    """Return the SC03 dataset, loading it on first use.

    The value returned by ``utils.load_10x`` is kept in the module-level
    global ``_query``, so later calls return the same object without
    loading it again.
    """
    global _query
    if '_query' in globals():
        return _query
    _query = utils.load_10x(os.path.join(ROOT, 'SC03'), 'SC03')
    return _query
Пример #8
0
def get_reference():
    """Return the SC02v2 dataset, loading it on first use.

    The value returned by ``utils.load_10x`` is kept in the module-level
    global ``_reference``, so later calls return the same object without
    loading it again.
    """
    global _reference
    if '_reference' in globals():
        return _reference
    _reference = utils.load_10x(os.path.join(ROOT, 'SC02'), 'SC02v2')
    return _reference