Example #1
def dump_train_result(train_id, scenario_tag, result):
    model = result.pop('model', None)
    preprocessor = result.pop('preprocessor', None)
    model_name = 'train_result_model_{}'.format(train_id)
    result_name = 'train_result_meta_{}'.format(train_id)

    dirpath = pathjoin(settings.operation_results_dir, scenario_tag)
    to_pickle(pathjoin(dirpath, '{}.pkl'.format(result_name)),
              result)  # dump result
    if model is not None:
        model.dump(dirpath, model_name)
    if preprocessor is not None:
        preprocessor.dump_with_operation_rule(dirpath, train_id)
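
For orientation, a sketch of the files this call leaves behind, assuming train_id='0' and scenario_tag='default'; the model and preprocessor file names depend on their own dump methods, so extensions are placeholders:

# Layout under settings.operation_results_dir (illustrative, not exhaustive):
#   default/train_result_meta_0.pkl   # pickled `result`, minus the popped model/preprocessor
#   default/train_result_model_0.*    # whatever model.dump() writes for that name
#   default/...                       # files written by preprocessor.dump_with_operation_rule()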
Example #2
def load(load_func_kwargs, param):
    logger.debug('csv loader invoked.')
    if param['dataset_name'] is None:
        raise ValueError('dataset_name must be set for csv loader')
    fname = param['dataset_name'] + '.csv'
    df = pd_read_csv(pathjoin(settings.datasource_dir, fname),
                     **load_func_kwargs)
    return df
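
A minimal usage sketch, mirroring the call shape that get_dataset (Example #12) uses when invoking a loader; the dataset name and read_csv kwargs here are placeholders:

load_func_kwargs = {'encoding': 'utf-8'}  # forwarded to pandas.read_csv via **load_func_kwargs
param = {'dataset_name': 'my_dataset', 'target_column': 'target'}  # reads <datasource_dir>/my_dataset.csv
df = load(load_func_kwargs, param)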
Example #3
def get_preprocessor_for_prediction(scenario_tag, train_id, train_result=None, dirpath=None):
    if train_result is None:
        logger.debug('train_result is None .. load from scenario_tag: {}, train_id: {}'.format(scenario_tag, train_id))
        train_result = get_train_result(scenario_tag=scenario_tag, train_id=train_id)
    preprocessor = get_preprocessor(train_result['preprocessor_config'])
    preprocessor.set_operation_mode('predict')
    if dirpath is None:
        dirpath = pathjoin(settings.operation_results_dir, scenario_tag)
    preprocessor.load_with_operation_rule(dirpath, train_id)
    return preprocessor
Example #4
def dump_predicted_result(predict_id, scenario_tag, dumper_config, df, meta):
    dirpath = pathjoin(settings.operation_results_dir, scenario_tag)
    fname_meta = 'predict_result_meta_{}'.format(predict_id)
    fname = 'predict_result_{}'.format(predict_id)
    dump_result_format = dumper_config['name']
    if dump_result_format == 'csv':
        pd_to_csv(df, pathjoin(dirpath, fname + '.csv'), index=False)
    elif dump_result_format == 'pickle':
        to_pickle(pathjoin(dirpath, fname + '.pkl'), df)
    elif dump_result_format == 'bigquery':
        import pandas_gbq
        destination_table = dumper_config.get('destination_table')
        if destination_table is None:
            raise ValueError(
                'destination_table must be set for bigquery dumper.')
        if dumper_config.get('add_predict_id_enabled', True):
            destination_table += ('_' + predict_id)
        pandas_gbq.to_gbq(df, destination_table,
                          **dumper_config.get('kwargs', {}))
    else:
        raise Exception('invalid dump format: {}'.format(dump_result_format))
    to_pickle(pathjoin(dirpath, fname_meta + '.pkl'), meta)
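
The dumper_config shapes accepted by the branches above, sketched with placeholder table and project names:

csv_dumper_config = {'name': 'csv'}          # writes <dirpath>/predict_result_<predict_id>.csv
pickle_dumper_config = {'name': 'pickle'}    # writes <dirpath>/predict_result_<predict_id>.pkl
bigquery_dumper_config = {
    'name': 'bigquery',
    'destination_table': 'my_dataset.predictions',  # required; placeholder name
    'add_predict_id_enabled': True,                 # appends '_<predict_id>' to the table name
    'kwargs': {'project_id': 'my-project'},         # forwarded to pandas_gbq.to_gbq
}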
Example #5
def get_scenario_ids(scenario_tag):
    dirpath = pathjoin(settings.operation_results_dir, scenario_tag)
    filenames = list_directory(dirpath, mode='filename')
    train_regexps = [
        re.search(_train_result_meta_pattern, fn) for fn in filenames
    ]
    train_ids = [tr.group(1) for tr in train_regexps if tr is not None]
    predict_regexps = [
        re.search(_predict_result_meta_pattern, fn) for fn in filenames
    ]
    predict_ids = [tr.group(1) for tr in predict_regexps if tr is not None]
    return {
        'train': train_ids,
        'predict': predict_ids,
    }
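
The return value maps each kind of result to the ids found in the scenario directory, for example (ids are illustrative):

ids = get_scenario_ids('default')
# ids == {'train': ['0', '1'], 'predict': ['0']}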
Example #6
def load_train_results(scenario_tag='default', train_ids='all'):
    dirpath = pathjoin(settings.operation_results_dir, scenario_tag)
    logger.debug('dirpath: {}'.format(dirpath))
    file_paths = list_directory(dirpath)
    result_paths = [
        fp for fp in file_paths
        if re.search(r'.+{}train_result_meta_[0-9]+\.pkl$'.format(re.escape(os.sep)), fp)
        is not None
    ]
    results = [from_pickle(rp) for rp in result_paths]
    if not (isinstance(train_ids, list) or train_ids == 'all'):
        raise ValueError('train_ids must be a list or the string "all"')
    if train_ids != 'all':
        results = [r for r in results if r['id'] in train_ids]

    # convert evaluate result to pandas.DataFrame
    for idx, r in enumerate(results):
        if 'evaluate' in r:
            results[idx]['evaluate']['metrics'] = pd.DataFrame(
                [{o['name']: o['value']
                  for o in met} for met in r['evaluate']['metrics']])
    return results
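
The dict comprehension at the end collapses each list of {'name', 'value'} metric dicts into one DataFrame row; a standalone illustration with made-up numbers:

import pandas as pd

metrics = [
    [{'name': 'accuracy', 'value': 0.91}, {'name': 'f1', 'value': 0.88}],
    [{'name': 'accuracy', 'value': 0.93}, {'name': 'f1', 'value': 0.90}],
]
df = pd.DataFrame([{o['name']: o['value'] for o in met} for met in metrics])
# df has one row per metric set, with columns 'accuracy' and 'f1'.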
Example #7
def load_sklearn_model(model, dirpath, model_name):
    model._value = from_pickle(pathjoin(dirpath, '{}.pkl'.format(model_name)))
    if model._value is None:
        raise Exception('load {} failed.'.format(model_name))
    return model
Example #8
def dump_sklearn_model(obj, dirpath, model_name):
    return to_pickle(pathjoin(dirpath, '{}.pkl'.format(model_name)), obj.value)
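
Examples #7 and #8 form a pickle round trip around the model wrapper's .value / ._value attribute; a sketch using a hypothetical stand-in wrapper (the real model class is not shown in these snippets):

class _ModelStub:  # hypothetical stand-in, only for illustration
    def __init__(self, value=None):
        self._value = value

    @property
    def value(self):
        return self._value


from sklearn.linear_model import LinearRegression

dump_sklearn_model(_ModelStub(LinearRegression()), '/tmp', 'train_result_model_0')
restored = load_sklearn_model(_ModelStub(), '/tmp', 'train_result_model_0')  # restored._value is the estimator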
Example #9
def predict(predict_id,
            scenario_tag,
            method_type='predict',
            dataset_config=None,
            train_id='0',
            dump_result_enabled=False,
            append_evacuated_columns_enabled=False,
            dumper_config={},
            result_target_columns='all',
            result_predict_column='predicted'):
    """
        予測を実行する関数

        :param predict_id: シナリオ中における予測実行の識別子
        :type predict_id: str
        :param scenario_tag: シナリオに付与されるタグ
        :type scenario_tag: str
        :param method_type: 予測のタイプ。設定可能なタイプは `predict` or `predict_proba`
        :type method_type: str
        :param dataset_config: Datasetの設定。:class:`akebono.dataset.get_dataset` の引数。
        :type dataset_config: dict
        :param train_id: 予測で使うモデルのtrain_id
        :type train_id: str
        :param dump_result_enabled: 予測結果の永続化を実行するかのフラグ
        :type dump_result_enabled: bool
        :param append_evacuated_columns_enabled: Dataset中で退避したカラムをpredictの結果に加えるかを決めるフラグ
        :type append_evacuated_columns_enabled: bool
        :param dumper_config: 予測結果の設定。
        :type dumper_config: dict
        :param result_target_columns: 予測結果に含めるべき説明変数のカラム名のリスト。全ての場合は'all'とする
        :type result_target_columns: str or list(str)
        :param result_predict_column: 予測結果が格納されるカラム名
        :type result_predict_column: str
        """
    if dataset_config is None:
        raise ValueError('dataset_config must be set.')

    if dump_result_enabled and 'name' not in dumper_config:
        raise ValueError('`name` key must be contained in dumper_config.')

    train_id = str(train_id)
    tr = get_train_result(scenario_tag=scenario_tag, train_id=train_id)
    model, model_config = get_trained_model(scenario_tag,
                                            train_id,
                                            train_result=tr)

    ret = {
        'type': 'predict',
        'method_type': method_type,
        'dataset_config': dataset_config,
        'train_id': train_id,
        'dump_result_enabled': dump_result_enabled,
        'dumper_config': dumper_config,
        'result_target_columns': result_target_columns,
        'result_predict_column': result_predict_column,
        'train_result': tr,
        'model_config': model_config,
    }

    dataset_config['target_column'] = None  # with target_column set to None, get_dataset returns a Dataset for prediction
    dataset = get_dataset(dataset_config)
    dirpath = pathjoin(settings.operation_results_dir, scenario_tag)
    preprocessor = get_preprocessor_for_prediction(scenario_tag,
                                                   train_id,
                                                   train_result=tr,
                                                   dirpath=dirpath)

    X = dataset.get_predictor()

    if X.index.size == 0:
        raise EmptyDatasetError('empty record')

    fX, _ = preprocessor.process(X, None)
    gc.collect()

    predict_func = getattr(model, method_type, None)
    if predict_func is None:
        raise Exception('{} is not defined.'.format(method_type))
    rawresult = predict_func(fX)
    gc.collect()
    predict_result = fX.copy()
    if result_target_columns != 'all':
        if not isinstance(result_target_columns, list):
            raise TypeError('result_target_columns must be list.')
        predict_result = predict_result[result_target_columns]

    # rawresult is converted to a list so it can be stored in the predict_result DataFrame even when len(rawresult.shape) > 1
    # whether predict_result should be a single DataFrame at all may be worth reconsidering
    predict_result.loc[:, result_predict_column] = list(rawresult)

    if append_evacuated_columns_enabled:
        predict_result = pd.concat([dataset.get_evacuated(), predict_result],
                                   axis=1)

    if dump_result_enabled:
        logger.debug('dump_predicted_result start.')
        dump_predicted_result(predict_id, scenario_tag, dumper_config,
                              predict_result, ret)
        logger.debug('dump_predicted_result done.')

    ret['predict_result'] = predict_result
    return ret
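
A hedged call sketch; the loader name, column names and ids are placeholders (the real loader_config.name is a 'module@function'-style path, redacted in Example #12):

result = predict(
    '20190101',                 # predict_id
    'default',                  # scenario_tag
    method_type='predict',
    dataset_config={
        'loader_config': {'name': 'csv', 'kwargs': {}},  # placeholder loader
    },
    train_id='0',
    dump_result_enabled=True,
    dumper_config={'name': 'csv'},
    result_target_columns=['feature_a', 'feature_b'],
    result_predict_column='predicted',
)
predicted_df = result['predict_result']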
Example #10
def load(self, dirpath, name):
    self._value.load_model(pathjoin(dirpath, name + '.bin'))
    return self
Example #11
def dump(self, dirpath, name):
    self.value.save_model(pathjoin(dirpath, name + '.bin'))
    return self
Example #12
def get_dataset(dataset_config):
    """
    Create a Dataset.

    :param dataset_config: settings for the Dataset
    :type dataset_config: dict
    :return: :class:`Dataset` object

    Usage:
        >>> from akebono.dataset import get_dataset
        >>> dataset_config = {
                'loader_config': {
                    'name': '*****@*****.**',
                    'kwargs': {
                        'n_features': 1,
                        'noise': 30.0,
                        'random_state': 0,
                    },
                },
                'target_column': 'target',
                'cache_enabled': False,
            }
        >>> ds = get_dataset(dataset_config)
        >>> ds
        <akebono.dataset.model.Dataset object at 0x11291acc0>
    """

    dataset_name = dataset_config.get('name')
    target_column = dataset_config.get('target_column', 'target')
    cache_enabled = dataset_config.get('cache_enabled', False)
    evacuated_columns = dataset_config.get('evacuated_columns', [])
    if not isinstance(evacuated_columns, list):
        raise TypeError('evacuated_columns must be list.')
    loader_config = dataset_config.get('loader_config')
    if not isinstance(loader_config, dict):
        raise Exception(
            'loader_config must be specified and must be a dict.')
    load_func = loader_config.get('name')
    load_func = _loader_name_alias.get(load_func, load_func)  # use the alias if one exists
    if load_func is None:
        raise Exception('loader_config.name must be specified.')
    load_func = load_object_by_str(load_func)
    load_func_kwargs = Param(loader_config.get('kwargs', {}))
    loader_param = loader_config.get('param', {})
    _reserved_params = (
        'dataset_name',
        'target_column',
    )
    for rp in _reserved_params:
        if rp in loader_param:
            raise KeyError('{} is reserved param.'.format(rp))
    loader_param['dataset_name'] = dataset_name
    loader_param['target_column'] = target_column

    preprocess_func_str = dataset_config.get(
        'preprocess_func', '*****@*****.**')
    preprocess_func_hash = get_hash(preprocess_func_str)
    preprocess_func = load_object_by_str(preprocess_func_str)
    preprocess_func_kwargs = Param(
        dataset_config.get('preprocess_func_kwargs', {}))

    def _core_func():
        return preprocess_func(
            load_func(copy.deepcopy(load_func_kwargs.value), loader_param),
            **copy.copy(preprocess_func_kwargs.value))

    fname = '{}_{}_{}_{}'.format(
        dataset_name, load_func_kwargs.get_hashed_id(length=24),
        preprocess_func_hash[:24],
        preprocess_func_kwargs.get_hashed_id(length=24))
    dataset_loading_cache_enabled = dataset_config.get(
        'dataset_loading_cache_enabled', True)
    if dataset_loading_cache_enabled:
        ds = datasetholder.get(fname)
        if ds is not None:
            logger.debug(
                'dataset_loading_cache enabled .. {} get done.'.format(
                    ds.name))
            return ds

    pkl_fname = fname + '.pkl'
    if cache_enabled:
        if dataset_name is not None:
            logger.info('dataset cache enabled')
            _core_func = cache_located_at(
                pathjoin(settings.cache_dir, pkl_fname))(_core_func)
        else:
            raise Exception(
                'dataset_config.cache_enabled is True, but dataset_config.name is None'
            )

    ds = Dataset(fname, _core_func(), target_column, evacuated_columns)

    if dataset_loading_cache_enabled:
        datasetholder.set(ds)
        logger.debug('dataset_loading_cache enabled .. {} set done.'.format(
            ds.name))
    return ds