示例#1
0
def test_generator_load_iris(pd_assert_equal):
    """The ``iris`` loader must return the same data as ``skl_datasets.load_iris``.

    :param pd_assert_equal: fixture used to compare pandas objects
    """
    dataset_config = {
        'loader_config': {'name': 'iris', 'kwargs': {}},
        'target_column': 'target',
    }
    dataset = get_dataset(dataset_config)
    X, y = dataset.get_predictor_target()

    # Load the reference data directly, with the very same loader kwargs.
    loader_kwargs = dataset_config['loader_config']['kwargs']
    bunch = skl_datasets.load_iris(**loader_kwargs)
    expected_X, expected_y = bunch.data, bunch.target

    pd_assert_equal(y, pd.Series(expected_y))
    # Column names are taken from the dataset; only the values are compared.
    pd_assert_equal(X, pd.DataFrame(expected_X, columns=X.columns))
示例#2
0
def test_generator_make_moons(pd_assert_equal):
    """The ``binary_classifier_sample_moon`` loader must match ``skl_datasets.make_moons``.

    :param pd_assert_equal: fixture used to compare pandas objects
    """
    dataset_config = {
        'loader_config': {
            'name': 'binary_classifier_sample_moon',
            'kwargs': {'random_state': 0},
        },
        'target_column': 'target',
    }
    dataset = get_dataset(dataset_config)
    X, y = dataset.get_predictor_target()

    # Generate the reference sample directly, with the very same kwargs
    # (fixed random_state, so both sides are reproducible).
    loader_kwargs = dataset_config['loader_config']['kwargs']
    expected_X, expected_y = skl_datasets.make_moons(**loader_kwargs)

    pd_assert_equal(y, pd.Series(expected_y))
    # Column names are taken from the dataset; only the values are compared.
    pd_assert_equal(X, pd.DataFrame(expected_X, columns=X.columns))
示例#3
0
def test_generator_make_regression(pd_assert_equal):
    """The ``regression_sample`` loader must match ``skl_datasets.make_regression``.

    :param pd_assert_equal: fixture used to compare pandas objects
    """
    dataset_config = {
        'loader_config': {
            'name': 'regression_sample',
            'kwargs': {'random_state': 0, 'coef': False},
        },
        'target_column': 'target',
    }
    dataset = get_dataset(dataset_config)
    X, y = dataset.get_predictor_target()

    # Generate the reference sample directly, with the very same kwargs.
    # The result is unpacked as an (X, y) pair.
    loader_kwargs = dataset_config['loader_config']['kwargs']
    expected_X, expected_y = skl_datasets.make_regression(**loader_kwargs)

    pd_assert_equal(y, pd.Series(expected_y))
    # Column names are taken from the dataset; only the values are compared.
    pd_assert_equal(X, pd.DataFrame(expected_X, columns=X.columns))
示例#4
0
def test_model_regressor(pd_assert_equal):
    """Compare the ``SklearnLinearRegression`` model against a bare ``LinearRegression``.

    For every combination of init / fit / evaluate settings the test checks
    that:

    * predictions of the model built by ``get_model`` equal the predictions
      of a ``LinearRegression`` fitted on the same data, and
    * when ``evaluate`` reports a non-CV result, every reported metric is
      within ``1e-5`` of the value recomputed with ``skl_metrics`` on a
      split produced by ``skl_train_test_split`` with the same kwargs.

    :param pd_assert_equal: fixture used to compare pandas objects
    """
    # NOTE(review): the '...@...' strings in this test ('[email protected]_selection'
    # and '*****@*****.**') look like they were mangled by an e-mail
    # obfuscation filter; they are kept byte-for-byte here. Restore the real
    # '<object>@<module>' names from the upstream repository before running.
    # NOTE(review): `normalize` is presumably no longer accepted by
    # LinearRegression in recent scikit-learn releases -- confirm against the
    # pinned scikit-learn version.
    init_kwargs_settings = [
        {},
        {
            'normalize': True
        },
    ]
    fit_kwargs_settings = [{}]
    evaluate_kwargs_settings = [{
        'train_test_split_func': '[email protected]_selection',
        'train_test_split_func_kwargs': {
            'random_state': 0
        },
        'metrics': 'all',
    }]
    dataset_config = {
        'loader_config': {
            'name': '*****@*****.**',
            'kwargs': {
                'random_state': 0,
            },
        },
        'target_column': 'target',
    }
    preprocessor = get_preprocessor({
        'name': 'identify',
        'kwargs': {},
    })
    ffunc_for_predictor = ffunc_for_target = load_object_by_str(
        '*****@*****.**')
    ds1 = get_dataset(dataset_config)
    X1, y1 = ds1.get_predictor_target()

    # Reported metric name -> function that recomputes the reference value.
    reference_metrics = (
        ('mean_absolute_error', skl_metrics.mean_absolute_error),
        ('mean_squared_error', skl_metrics.mean_squared_error),
        ('median_absolute_error', skl_metrics.median_absolute_error),
        ('r2_score', skl_metrics.r2_score),
        ('explained_variance', skl_metrics.explained_variance_score),
    )
    tolerance = 0.00001

    for init_kwargs in init_kwargs_settings:
        for fit_kwargs in fit_kwargs_settings:
            for evaluate_kwargs in evaluate_kwargs_settings:
                mconfig = {
                    'name': 'SklearnLinearRegression',
                    'init_kwargs': init_kwargs,
                    'fit_kwargs': fit_kwargs,
                    'evaluate_kwargs': evaluate_kwargs,
                    'is_rebuild': False,
                }
                m = get_model(mconfig)
                morigin = LinearRegression(**init_kwargs)
                m.fit(X1, y1)
                morigin.fit(X1, y1, **fit_kwargs)

                # Both models were fitted on the full data set, so their
                # predictions on that data must agree.
                pd_assert_equal(
                    pd.Series(m.predict(X1)).astype('float64'),
                    pd.Series(morigin.predict(X1)).astype('float64'))

                X_train, X_test, y_train, y_test = skl_train_test_split(
                    X1, y1, **evaluate_kwargs['train_test_split_func_kwargs'])
                rev1 = m.evaluate(X1, y1, preprocessor, ffunc_for_predictor,
                                  ffunc_for_target)
                if rev1['cv']:
                    # Only the single-split (non-CV) report is verified here.
                    continue

                met = rev1['metrics'][0]
                # First reported value for each metric we want to verify.
                reported = {
                    name: [o['value'] for o in met if o['name'] == name][0]
                    for name, _ in reference_metrics
                }

                # Refit the reference model on the training part only and
                # predict once; every metric is recomputed from this result.
                morigin.fit(ffunc_for_predictor(X_train),
                            ffunc_for_target(y_train))
                y_pred = morigin.predict(X_test)

                for name, reference_func in reference_metrics:
                    expected = reference_func(y_test, y_pred)
                    assert math.fabs(reported[name] - expected) < tolerance, (
                        '{} differs from the reference value'.format(name))
示例#5
0
def train(train_id,
          scenario_tag,
          dataset_config=None,
          model_config=None,
          preprocessor_config=None,
          formatter_config_for_predictor=None,
          formatter_config_for_target=None,
          evaluate_enabled=False,
          fit_model_enabled=False,
          dump_result_enabled=False):
    """
    Run model training.

    :param train_id: identifier of this training run within the scenario
    :type train_id: str
    :param scenario_tag: tag attached to the scenario
    :type scenario_tag: str
    :param dataset_config: Dataset settings; argument of :class:`akebono.dataset.get_dataset`.
    :type dataset_config: dict
    :param model_config: Model settings; argument of :class:`akebono.model.get_model`.
        The dict passed in is not modified; a copy with ``is_rebuild`` forced
        to ``False`` is used and returned in the result.
    :type model_config: dict
    :param preprocessor_config: Preprocessor settings; argument of
        :class:`akebono.preprocessor.get_preprocessor`. Defaults to the
        ``identify`` preprocessor with empty kwargs.
    :type preprocessor_config: dict
    :param formatter_config_for_predictor: settings of the formatter applied to the predictors.
    :type formatter_config_for_predictor: dict
    :param formatter_config_for_target: settings of the formatter applied to the target.
    :type formatter_config_for_target: dict
    :param evaluate_enabled: whether to run model evaluation
    :type evaluate_enabled: bool
    :param fit_model_enabled: whether to fit the model
    :type fit_model_enabled: bool
    :param dump_result_enabled: whether to persist the model and the evaluation result
    :type dump_result_enabled: bool
    :returns: dict holding the effective settings plus, depending on the
        flags, the keys ``evaluate``, ``model`` and ``preprocessor``
    :rtype: dict
    :raises ValueError: if ``model_config`` or ``dataset_config`` is ``None``
    :raises EmptyDatasetError: if the loaded dataset has no records
    """
    if model_config is None:
        raise ValueError('model_config must be set.')
    if dataset_config is None:
        raise ValueError('dataset_config must be set.')
    if preprocessor_config is None:
        preprocessor_config = {
            'name': 'identify',
            'kwargs': {},
        }
    # NOTE(review): the default formatter names below look like they were
    # mangled by an e-mail obfuscation filter; they are kept byte-for-byte.
    # Restore the real '<object>@<module>' names from the upstream source.
    if formatter_config_for_predictor is None:
        formatter_config_for_predictor = {
            'name': '*****@*****.**',
        }
    if formatter_config_for_target is None:
        formatter_config_for_target = {
            'name': '*****@*****.**',
        }

    # Work on a shallow copy so that forcing `is_rebuild` does not leak into
    # the dict owned by the caller. The copy is what gets reported in `ret`.
    model_config = dict(model_config)
    model_config['is_rebuild'] = False

    ret = {
        'type': 'train',
        'id': train_id,
        'dataset_config': dataset_config,
        'model_config': model_config,
        'preprocessor_config': preprocessor_config,
        'formatter_config_for_predictor': formatter_config_for_predictor,
        'formatter_config_for_target': formatter_config_for_target,
        'evaluate_enabled': evaluate_enabled,
        'fit_model_enabled': fit_model_enabled,
        'dump_result_enabled': dump_result_enabled
    }

    dataset = get_dataset(dataset_config)

    preprocessor = get_preprocessor(preprocessor_config)
    preprocessor.set_operation_mode('train')
    logger.debug('load dataset start.')
    X, y = dataset.get_predictor_target()

    if X.index.size == 0:
        raise EmptyDatasetError('empty record')

    logger.debug('load dataset done.')

    model = get_model(model_config)

    format_func_for_predictor = load_object_by_str(
        formatter_config_for_predictor['name'])
    format_func_for_target = load_object_by_str(
        formatter_config_for_target['name'])

    if evaluate_enabled:
        logger.debug('evaluate start.')
        rep = model.evaluate(X, y, preprocessor, format_func_for_predictor,
                             format_func_for_target)
        gc.collect()
        logger.debug('evaluate done.')
        ret['evaluate'] = rep
    if fit_model_enabled:
        logger.debug('fit start.')
        # Preprocess the predictors, then format predictors and target
        # before handing them to the model.
        fX_p, _ = preprocessor.process(X, None)
        fX = format_func_for_predictor(fX_p)
        model.fit(fX, format_func_for_target(y))
        gc.collect()
        logger.debug('fit done.')
        ret['model'] = model
    if dump_result_enabled:
        logger.debug('dump_train_result start.')
        # The preprocessor is stored alongside the result when dumping.
        ret['preprocessor'] = preprocessor
        dump_train_result(train_id, scenario_tag, ret)
        logger.debug('dump_train_result done.')

    return ret
示例#6
0
def predict(predict_id,
            scenario_tag,
            method_type='predict',
            dataset_config=None,
            train_id='0',
            dump_result_enabled=False,
            append_evacuated_columns_enabled=False,
            dumper_config=None,
            result_target_columns='all',
            result_predict_column='predicted'):
    """
    Run prediction.

    :param predict_id: identifier of this prediction run within the scenario
    :type predict_id: str
    :param scenario_tag: tag attached to the scenario
    :type scenario_tag: str
    :param method_type: kind of prediction; the name of the model method to
        call. Supported values are `predict` or `predict_proba`
    :type method_type: str
    :param dataset_config: Dataset settings; argument of :class:`akebono.dataset.get_dataset`.
        The dict passed in is not modified; a copy with ``target_column``
        set to ``None`` is used and returned in the result.
    :type dataset_config: dict
    :param train_id: train_id of the model used for the prediction
    :type train_id: str
    :param dump_result_enabled: whether to persist the prediction result
    :type dump_result_enabled: bool
    :param append_evacuated_columns_enabled: whether the columns evacuated by
        the Dataset are added to the prediction result
    :type append_evacuated_columns_enabled: bool
    :param dumper_config: settings used to dump the prediction result; must
        contain a ``name`` key when ``dump_result_enabled`` is true.
        ``None`` (the default) is treated as an empty dict.
    :type dumper_config: dict
    :param result_target_columns: list of predictor column names to include
        in the prediction result; use 'all' to include every column
    :type result_target_columns: str or list(str)
    :param result_predict_column: name of the column that stores the prediction
    :type result_predict_column: str
    :returns: dict holding the effective settings, the train result, the
        model config and the resulting DataFrame under ``predict_result``
    :rtype: dict
    :raises ValueError: if ``dataset_config`` is ``None``, or if dumping is
        enabled and ``dumper_config`` has no ``name`` key
    :raises TypeError: if ``result_target_columns`` is neither 'all' nor a list
    :raises EmptyDatasetError: if the loaded dataset has no records
    """
    # Validate every argument up front, before the model is loaded and the
    # prediction is run.
    if dataset_config is None:
        raise ValueError('dataset_config must be set.')

    if dumper_config is None:
        # A fresh dict per call: this value is returned inside `ret`, so a
        # shared default object must not be handed out.
        dumper_config = {}
    if dump_result_enabled and 'name' not in dumper_config:
        raise ValueError('`name` key must be contained in dumper_config.')

    use_all_columns = result_target_columns == 'all'
    if not use_all_columns and not isinstance(result_target_columns, list):
        raise TypeError('result_target_columns must be list.')

    train_id = str(train_id)
    tr = get_train_result(scenario_tag=scenario_tag, train_id=train_id)
    model, model_config = get_trained_model(scenario_tag,
                                            train_id,
                                            train_result=tr)

    # When target_column is None, a Dataset for prediction is returned.
    # Set it on a copy so the dict owned by the caller stays untouched.
    dataset_config = dict(dataset_config, target_column=None)

    ret = {
        'type': 'predict',
        'method_type': method_type,
        'dataset_config': dataset_config,
        'train_id': train_id,
        'dump_result_enabled': dump_result_enabled,
        'dumper_config': dumper_config,
        'result_target_columns': result_target_columns,
        'result_predict_column': result_predict_column,
        'train_result': tr,
        'model_config': model_config,
    }

    dataset = get_dataset(dataset_config)
    dirpath = pathjoin(settings.operation_results_dir, scenario_tag)
    preprocessor = get_preprocessor_for_prediction(scenario_tag,
                                                   train_id,
                                                   train_result=tr,
                                                   dirpath=dirpath)

    X = dataset.get_predictor()

    if X.index.size == 0:
        raise EmptyDatasetError('empty record')

    fX, _ = preprocessor.process(X, None)
    gc.collect()

    predict_func = getattr(model, method_type, None)
    if predict_func is None:
        raise Exception('{} is not defined.'.format(method_type))
    rawresult = predict_func(fX)
    gc.collect()

    # Take an explicit copy of the requested columns so that adding the
    # prediction column below never writes into `fX` or a derived frame.
    if use_all_columns:
        predict_result = fX.copy()
    else:
        predict_result = fX[result_target_columns].copy()

    # rawresult is converted to a list so that it can be stored in the
    # predict_result DataFrame even when len(rawresult.shape) > 1.
    # Whether predict_result should be a single DataFrame at all is worth
    # reconsidering.
    predict_result.loc[:, result_predict_column] = list(rawresult)

    if append_evacuated_columns_enabled:
        predict_result = pd.concat([dataset.get_evacuated(), predict_result],
                                   axis=1)

    if dump_result_enabled:
        logger.debug('dump_predicted_result start.')
        dump_predicted_result(predict_id, scenario_tag, dumper_config,
                              predict_result, ret)
        logger.debug('dump_predicted_result done.')

    ret['predict_result'] = predict_result
    return ret