def test_dask_universal_read_with_columns_info():
    pandas_df_float32 = pandas.read_csv('tests/_data/iris.csv', dtype=numpy.float32)
    dask_df = readers.dask_universal_read(
        'tests/_data/iris.csv',
        columns_info=[
            BaseDataTransformationColumn(
                id=index,
                name=name,
                data_format=DataFormats.numerical,
                data_type=DataTypes.continuous
            ) for index, name in enumerate(pandas_df_float32.columns)
        ]
    )
    dtypes = pandas.Series(
        data=[numpy.float32] * 5,
        index=pandas.Index(range(len(pandas_df_float32.columns)))
    )
    assert all(dask_df.dtypes == dtypes)
    assert all(dask_df.compute()[3] == pandas_df_float32['petal_width_cm'])
def test_dask_universal_read_with_categorical():
    columns_info = [
        BaseDataTransformationColumn(
            id=0, name='sepal_length_cm', data_format=DataFormats.numerical
        ),
        BaseDataTransformationColumn(
            id=1, name='sepal_width_cm', data_format=DataFormats.numerical
        ),
        BaseDataTransformationColumn(
            id=2, name='petal_length_cm', data_format=DataFormats.numerical
        ),
        BaseDataTransformationColumn(
            id=3,
            name='petal_width_cm',
            data_format=DataFormats.numerical,
            data_type=DataTypes.categorical,
            statistics={
                'uniques_stats': [
                    (0.1, 1), (0.2, 1), (0.3, 1), (0.4, 1), (0.5, 1), (0.6, 1),
                    (1.0, 1), (1.1, 1), (1.2, 1), (1.3, 1), (1.4, 1), (1.5, 1),
                    (1.6, 1), (1.7, 1), (1.8, 1), (1.9, 1), (2.0, 1), (2.1, 1),
                    (2.2, 1), (2.3, 1), (2.4, 1), (2.5, 1),
                ]
            }
        ),
        BaseDataTransformationColumn(
            id=4,
            name='class',
            data_format=DataFormats.character,
            data_type=DataTypes.categorical,
            statistics={
                'uniques_stats': [
                    ('0', 1),
                    ('1', 1),
                    ('2', 1),
                ]
            }
        ),
    ]
    dask_df = readers.dask_universal_read('tests/_data/iris.csv', columns_info=columns_info)
    df_with_replaced_categories = dask_df.compute()
    pandas_df_with_category_dtype = pandas.read_csv(
        'tests/_data/iris.csv',
        dtype={
            column.name: (
                numpy.float32
                if column.data_format is DataFormats.numerical
                else numpy.int16
            )
            for column in columns_info
        }
    )
    assert all(dask_df.dtypes.values == pandas_df_with_category_dtype.dtypes.values)
    assert all(
        df_with_replaced_categories[3].values
        == pandas_df_with_category_dtype['petal_width_cm'].values
    )
    assert all(
        df_with_replaced_categories[4].values
        == pandas_df_with_category_dtype['class'].astype(numpy.int16)
    )
def test_dask_universal_read_into_strings():
    pandas_df = pandas.read_csv('tests/_data/iris.csv', dtype=str)
    dask_df = readers.dask_universal_read('tests/_data/iris.csv', columns_info=None)
    assert all(dask_df.dtypes.values == numpy.dtype('O'))
    assert all(dask_df.compute()['petal_width_cm'] == pandas_df['petal_width_cm'])
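# -----------------------------------------------------------------------------
# For reference, the sketch below reproduces with plain dask the kind of read
# that the tests above assert for ``readers.dask_universal_read``: explicit
# numeric dtypes per column and a categorical character column replaced by
# int16 codes.  It is NOT the reader implementation and is not collected as a
# test; the hard-coded column names and the '0'/'1'/'2' -> 0/1/2 code mapping
# are taken from the iris fixture used above and serve as illustration only.
# -----------------------------------------------------------------------------
def _manual_iris_read_sketch():
    import dask.dataframe

    ddf = dask.dataframe.read_csv(
        'tests/_data/iris.csv',
        dtype={
            'sepal_length_cm': numpy.float32,
            'sepal_width_cm': numpy.float32,
            'petal_length_cm': numpy.float32,
            'petal_width_cm': numpy.float32,
            'class': str,
        },
    )
    # Replace the categorical character column with integer codes, mirroring
    # what test_dask_universal_read_with_categorical asserts for column #4.
    codes = {value: code for code, value in enumerate(['0', '1', '2'])}
    ddf['class'] = ddf['class'].map(codes, meta=('class', numpy.int16))
    return ddf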
def build_predictive_model(
        learn_dataset_url,
        dataset_transformations,
        predictive_analysis_method_name,
        predictive_analysis_options,
        predictive_model_id
    ):
    """
    Builds a predictive model, saves it to SeaweedFS, and patches the
    predictive model record through the API by its id, storing the SeaweedFS
    ids of the model and of the model info.

    Args:
        learn_dataset_url (str): URL of the learn dataset.
        dataset_transformations (list): a list of data transformation ids of
            the learn dataset in PFA format.
        predictive_analysis_method_name (str): name of the predictive analysis
            method to use.
        predictive_analysis_options (dict): kwargs passed to the predictive
            analysis method.
        predictive_model_id (int): id of the model in the API that gets patched.
    """
    log.info(
        "New %s model #%d is going to be built...",
        predictive_analysis_method_name,
        predictive_model_id
    )
    initial_columns_info = cloudsml.data_api.get_data_transformation_columns(
        dataset_transformations[-1],
        initial=True,
        # XXX: We must read the list in batches when the number of columns exceeds 1000
        limit=1000
    )
    learn_dataset_df = dask_universal_read(learn_dataset_url, columns_info=initial_columns_info)
    learn_dataset_df = transform_dask_dataframe(
        learn_dataset_df,
        fetch_data_transformations_by_id_in_pfa(dataset_transformations)
    )

    target_column_id = predictive_analysis_options['target_column_id']
    feature_column_ids = predictive_analysis_options['feature_column_ids']
    selected_columns_info = {
        column.id: column
        for column in cloudsml.data_api.get_data_transformation_columns(
            dataset_transformations[-1],
            id=([target_column_id] + feature_column_ids),
            # XXX: The API server limits the maximum number of columns per single
            # request to 1000 to avoid overly long response times. Thus, we must
            # implement querying the columns info in batches. This might be hidden
            # behind a convenient wrapper.
            limit=1000
        )
    }
    learn_dataset_df = learn_dataset_df[sorted(selected_columns_info.keys())]

    missing_values_encoder = missing_values_encoding.missing_values_encoder
    learn_dataset_df, missing_values_substitution_map = missing_values_encoder(
        learn_dataset_df, selected_columns_info)

    learn_dataset_df, selected_columns_info = one_hot_encoding.OneHotEncoder(
        categorical_columns_ids=predictive_analysis_options['categorical_column_ids'],
        columns_info=selected_columns_info
    ).update(learn_dataset_df, selected_columns_info)

    test_partition_ratio = predictive_analysis_options.get('test_partition_ratio', 0.4)
    test_learn_splitter = SplitSampling(test_partition_ratio, random_state=0)
    test_dataset_df, learn_dataset_df = test_learn_splitter.split(learn_dataset_df)

    predictive_analysis_method = PREDICTIVE_ANALYSIS_METHODS[predictive_analysis_method_name]

    log.info('Model #%d is being fitted with data...', predictive_model_id)
    model = predictive_analysis_method(
        learn_dataset_df,
        columns_info=selected_columns_info,
        **predictive_analysis_options
    )

    log.info('Model #%d is being exported to PFA...', predictive_model_id)
    one_hot_pfa_decoder = OneHotPFADecoder({
        column.id: column.virtual_columns
        for column in selected_columns_info.values()
        if hasattr(column, 'virtual_columns')
    })
    missing_values_pfa_decoder = MissingValuesPFADecoder(missing_values_substitution_map)
    translated_model = missing_values_pfa_decoder.transform(
        one_hot_pfa_decoder.transform(model.to_pfa())
    )
    model_file_id = seaweedfs.upload_file(
        stream=json.dumps(translated_model),
        name='model_%s.pfa' % predictive_model_id
    )

    log.info('Model #%d information is being collected...', predictive_model_id)
    model_info = {
        'learn': model.get_info(learn_dataset_df),
    }
    if test_partition_ratio > 0.0:
        model_info['test'] = model.get_info(test_dataset_df)
    model_info = ModelInfoShema().load({'performance_stats': model_info}).data
    model_info_id = seaweedfs.upload_file(
        stream=json.dumps(model_info),
        name='model_info_%s.json' % predictive_model_id
    )

    cloudsml.predictive_analytics_api.patch_predictive_model_by_id(
        predictive_model_id,
        [
            {
                "op": "replace",
                "path": "/model_seaweed_id",
                "value": model_file_id
            },
            {
                "op": "replace",
                "path": "/model_info_seaweed_id",
                "value": model_info_id
            },
            {
                "op": "replace",
                "path": "/status",
                "value": "fitted"
            },  # TODO: use a constant here
        ]
    )
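# -----------------------------------------------------------------------------
# Sketch of the "convenient wrapper" mentioned in the XXX comments above: the
# API caps a single ``get_data_transformation_columns`` call at 1000 columns,
# so when the requested column ids exceed that, they could be fetched in
# chunks and the results concatenated.  This only illustrates one possible
# approach using the parameters already shown above (``id`` and ``limit``);
# the helper name and the chunking scheme are assumptions, not existing code.
# -----------------------------------------------------------------------------
def _fetch_columns_info_in_batches(data_api, data_transformation_id, column_ids, batch_size=1000):
    """
    Fetch columns info for ``column_ids`` in batches of at most ``batch_size``.

    This is a hypothetical helper sketched around the XXX notes in
    ``build_predictive_model``; it is not part of the existing code base.
    """
    columns_info = []
    for start in range(0, len(column_ids), batch_size):
        batch_ids = column_ids[start:start + batch_size]
        columns_info.extend(
            data_api.get_data_transformation_columns(
                data_transformation_id,
                id=batch_ids,
                limit=batch_size
            )
        )
    return columns_info

# Usage sketch (hypothetical):
#
#     selected_columns_info = {
#         column.id: column
#         for column in _fetch_columns_info_in_batches(
#             cloudsml.data_api,
#             dataset_transformations[-1],
#             [target_column_id] + feature_column_ids,
#         )
#     }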