Example #1
import numpy
import pandas

# `readers`, `BaseDataTransformationColumn`, `DataFormats`, and `DataTypes`
# are imported from the project under test; their exact paths are omitted here.
def test_dask_universal_read_with_columns_info():
    pandas_df_float32 = pandas.read_csv('tests/_data/iris.csv', dtype=numpy.float32)
    dask_df = readers.dask_universal_read(
            'tests/_data/iris.csv',
            columns_info=[
                    BaseDataTransformationColumn(
                            id=index,
                            name=name,
                            data_format=DataFormats.numerical,
                            data_type=DataTypes.continuous
                        ) for index, name in enumerate(pandas_df_float32.columns)
                ]
        )
    dtypes = pandas.Series(
            data=[numpy.float32] * 5,
            index=pandas.Index(range(len(pandas_df_float32.columns)))
        )
    assert all(dask_df.dtypes == dtypes)
    assert all(dask_df.compute()[3] == pandas_df_float32['petal_width_cm'])
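The assertions above pin down the reader's contract for fully numerical columns_info: every column is read as float32, and columns are addressed by their integer ids rather than their CSV headers. A minimal pandas-only sketch of the expected result (an illustration of that contract, not the dask implementation; `expected_frame` is a hypothetical name):

import numpy
import pandas

def expected_frame(path):
    # What the test asserts dask_universal_read yields for purely numerical
    # columns_info: float32 values, columns renamed to their integer ids.
    df = pandas.read_csv(path, dtype=numpy.float32)
    df.columns = range(len(df.columns))
    return df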
Example #2
import numpy
import pandas

# `readers`, `BaseDataTransformationColumn`, `DataFormats`, and `DataTypes`
# are imported from the project under test; their exact paths are omitted here.
def test_dask_universal_read_with_categorical():
    columns_info = [
            BaseDataTransformationColumn(
                    id=0,
                    name='sepal_length_cm',
                    data_format=DataFormats.numerical
                ),
            BaseDataTransformationColumn(
                    id=1,
                    name='sepal_width_cm',
                    data_format=DataFormats.numerical
                ),
            BaseDataTransformationColumn(
                    id=2,
                    name='petal_length_cm',
                    data_format=DataFormats.numerical
                ),
            BaseDataTransformationColumn(
                    id=3,
                    name='petal_width_cm',
                    data_format=DataFormats.numerical,
                    data_type=DataTypes.categorical,
                    statistics={
                            'uniques_stats': [
                                    (0.1, 1),
                                    (0.2, 1),
                                    (0.3, 1),
                                    (0.4, 1),
                                    (0.5, 1),
                                    (0.6, 1),
                                    (1.0, 1),
                                    (1.1, 1),
                                    (1.2, 1),
                                    (1.3, 1),
                                    (1.4, 1),
                                    (1.5, 1),
                                    (1.6, 1),
                                    (1.7, 1),
                                    (1.8, 1),
                                    (1.9, 1),
                                    (2.0, 1),
                                    (2.1, 1),
                                    (2.2, 1),
                                    (2.3, 1),
                                    (2.4, 1),
                                    (2.5, 1),
                                ]
                        }
                ),
            BaseDataTransformationColumn(
                    id=4,
                    name='class',
                    data_format=DataFormats.character,
                    data_type=DataTypes.categorical,
                    statistics={
                            'uniques_stats': [
                                    ('0', 1),
                                    ('1', 1),
                                    ('2', 1),
                                ]
                        }
                ),
        ]
    dask_df = readers.dask_universal_read('tests/_data/iris.csv', columns_info=columns_info)
    df_with_replaced_categories = dask_df.compute()
    pandas_df_with_category_dtype = pandas.read_csv(
            'tests/_data/iris.csv',
            dtype={
                    column.name: (
                            numpy.float32
                            if column.data_format is DataFormats.numerical else
                            numpy.int16
                        ) for column in columns_info
                }
        )
    assert all(dask_df.dtypes.values == pandas_df_with_category_dtype.dtypes.values)
    assert all(
            df_with_replaced_categories[3].values ==
            pandas_df_with_category_dtype['petal_width_cm'].values
        )
    assert all(
            df_with_replaced_categories[4].values ==
            pandas_df_with_category_dtype['class'].astype(numpy.int16).values
        )
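Two behaviors are pinned down by the assertions above: a numerical categorical column (id 3) keeps its original float32 values, while a character categorical column (id 4) ends up as int16 values matching its uniques ('0' becomes 0, and so on). One plausible mechanism, consistent with these assertions, is positional encoding against uniques_stats; a hedged sketch (`replace_character_categories` is an illustrative name, not the library's API):

import numpy

def replace_character_categories(series, uniques_stats):
    # Assumption: each raw string maps to its position in uniques_stats,
    # so the '0'/'1'/'2' classes above become the int16 codes 0/1/2.
    code_by_value = {value: code for code, (value, _count) in enumerate(uniques_stats)}
    return series.map(code_by_value).astype(numpy.int16)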
Example #3
import numpy
import pandas

# `readers` is imported from the project under test; its exact path is omitted here.
def test_dask_universal_read_into_strings():
    pandas_df = pandas.read_csv('tests/_data/iris.csv', dtype=str)
    dask_df = readers.dask_universal_read('tests/_data/iris.csv', columns_info=None)
    assert all(dask_df.dtypes.values == numpy.dtype('O'))
    assert all(dask_df.compute()['petal_width_cm'] == pandas_df['petal_width_cm'])
Example #4
import json

# The remaining names (log, cloudsml, seaweedfs, dask_universal_read,
# transform_dask_dataframe, fetch_data_transformations_by_id_in_pfa,
# missing_values_encoding, one_hot_encoding, SplitSampling,
# PREDICTIVE_ANALYSIS_METHODS, OneHotPFADecoder, MissingValuesPFADecoder,
# ModelInfoShema) are imported from the worker's own modules.
def build_predictive_model(learn_dataset_url, dataset_transformations,
                           predictive_analysis_method_name,
                           predictive_analysis_options, predictive_model_id):
    """
    Builds a predictive model, saves it to SeaweedFS, and patches the
    predictive model record through the API by its id, storing the SeaweedFS
    ids of both the model and the model info.

    Args:
        learn_dataset_url (str): link to a learn dataset.
        dataset_transformations (list): a list of data transformation ids of
            the learn dataset in PFA format.
        predictive_analysis_method_name (str): name of the predictive method
            that is requested to be used.
        predictive_analysis_options (dict): kwargs to the predictive analysis
            method.
        predictive_model_id (int): id of the model in the API to patch.
    """
    log.info("New %s model #%d is going to be built...",
             predictive_analysis_method_name, predictive_model_id)

    initial_columns_info = cloudsml.data_api.get_data_transformation_columns(
        dataset_transformations[-1],
        initial=True,
        # XXX: We must read the list in batches when the number of columns
        # exceeds 1000 (see the batching sketch after this function).
        limit=1000)

    learn_dataset_df = dask_universal_read(learn_dataset_url,
                                           columns_info=initial_columns_info)
    learn_dataset_df = transform_dask_dataframe(
        learn_dataset_df,
        fetch_data_transformations_by_id_in_pfa(dataset_transformations))

    target_column_id = predictive_analysis_options['target_column_id']
    feature_column_ids = predictive_analysis_options['feature_column_ids']
    selected_columns_info = {
            column.id: column
                for column in cloudsml.data_api.get_data_transformation_columns(
                        dataset_transformations[-1],
                        id=([target_column_id] + feature_column_ids),
                        # XXX: The API server caps the maximum possible `limit` of columns per
                        # single request at 1000 to avoid too long response times. Thus, the
                        # columns info must be queried in batches; this could be hidden behind
                        # a convenient wrapper (see the batching sketch after this function).
                        limit=1000
                    )
        }
    learn_dataset_df = learn_dataset_df[sorted(selected_columns_info.keys())]

    missing_values_encoder = missing_values_encoding.missing_values_encoder
    learn_dataset_df, missing_values_substitution_map = missing_values_encoder(
        learn_dataset_df, selected_columns_info)

    learn_dataset_df, selected_columns_info = one_hot_encoding.OneHotEncoder(
        categorical_columns_ids=predictive_analysis_options[
            'categorical_column_ids'],
        columns_info=selected_columns_info).update(learn_dataset_df,
                                                   selected_columns_info)

    test_partition_ratio = predictive_analysis_options.get(
        'test_partition_ratio', 0.4)
    test_learn_splitter = SplitSampling(test_partition_ratio, random_state=0)
    test_dataset_df, learn_dataset_df = test_learn_splitter.split(
        learn_dataset_df)

    predictive_analysis_method = PREDICTIVE_ANALYSIS_METHODS[
        predictive_analysis_method_name]
    log.info('Model #%d is being fitted with data...', predictive_model_id)
    model = predictive_analysis_method(learn_dataset_df,
                                       columns_info=selected_columns_info,
                                       **predictive_analysis_options)

    log.info('Model #%d is being exported to PFA...', predictive_model_id)
    one_hot_pfa_decoder = OneHotPFADecoder({
            column.id: column.virtual_columns
                for column in selected_columns_info.values()
                    if hasattr(column, 'virtual_columns')
        })
    missing_values_pfa_decoder = MissingValuesPFADecoder(
        missing_values_substitution_map)

    translated_model = missing_values_pfa_decoder.transform(
        one_hot_pfa_decoder.transform(model.to_pfa()))

    model_file_id = seaweedfs.upload_file(stream=json.dumps(translated_model),
                                          name='model_%s.pfa' %
                                          predictive_model_id)

    log.info('Model #%d information is being collected...',
             predictive_model_id)
    model_info = {
        'learn': model.get_info(learn_dataset_df),
    }
    if test_partition_ratio > 0.0:
        model_info['test'] = model.get_info(test_dataset_df)

    model_info = ModelInfoShema().load({'performance_stats': model_info}).data

    model_info_id = seaweedfs.upload_file(stream=json.dumps(model_info),
                                          name='model_info_%s.json' %
                                          predictive_model_id)

    cloudsml.predictive_analytics_api.patch_predictive_model_by_id(
        predictive_model_id,
        [
            {
                "op": "replace",
                "path": "/model_seaweed_id",
                "value": model_file_id
            },
            {
                "op": "replace",
                "path": "/model_info_seaweed_id",
                "value": model_info_id
            },
            {
                "op": "replace",
                "path": "/status",
                "value": "fitted"
            },  # TODO use constant here
        ])
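Both XXX comments above point at the same gap: get_data_transformation_columns caps `limit` at 1000 columns per request, so wide datasets need paged queries. A hedged sketch of the convenient wrapper the second comment asks for, assuming the endpoint accepts an `offset` parameter (that parameter is not confirmed by the code above, and `get_all_transformation_columns` is a hypothetical name):

def get_all_transformation_columns(data_api, transformation_id, batch_size=1000, **filters):
    # Hypothetical helper: page through the columns endpoint until a short
    # batch signals the last page. Assumes `offset` is supported by the API.
    columns = []
    offset = 0
    while True:
        batch = data_api.get_data_transformation_columns(
                transformation_id, limit=batch_size, offset=offset, **filters)
        columns.extend(batch)
        if len(batch) < batch_size:
            return columns
        offset += batch_size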