def _projection(projection_config, data_cleaned, config, train_mode, suffix, **kwargs):
    """Add a decomposition (projection) step followed by a to-pandas conversion.

    Returns the pandas-converter step; in train mode returns the
    (train, valid) pair, with the valid steps reusing the train transformers.
    """
    (DecompositionTransformer, transformer_config, transformer_name) = projection_config
    exp_dir = config.pipeline.experiment_directory
    if train_mode:
        data_cleaned, data_cleaned_valid = data_cleaned

    projector = Step(name='{}{}'.format(transformer_name, suffix),
                     transformer=DecompositionTransformer(**transformer_config),
                     input_steps=[data_cleaned],
                     adapter=Adapter({'features': E(data_cleaned.name, 'numerical_features')}),
                     experiment_directory=exp_dir,
                     **kwargs)
    projector_pandas = Step(name='{}_pandas{}'.format(transformer_name, suffix),
                            transformer=make_transformer(
                                partial(to_pandas, column_prefix=transformer_name),
                                output_name='numerical_features'),
                            input_steps=[projector],
                            adapter=Adapter({'x': E(projector.name, 'features')}),
                            experiment_directory=exp_dir,
                            **kwargs)
    if not train_mode:
        return projector_pandas

    # Validation twins share the fitted train steps as their transformers.
    projector_valid = Step(name='{}_valid{}'.format(transformer_name, suffix),
                           transformer=projector,
                           input_steps=[data_cleaned_valid],
                           adapter=Adapter({'features': E(data_cleaned_valid.name,
                                                          'numerical_features')}),
                           experiment_directory=exp_dir,
                           **kwargs)
    projector_pandas_valid = Step(name='{}_pandas_valid{}'.format(transformer_name, suffix),
                                  transformer=projector_pandas,
                                  input_steps=[projector_valid],
                                  adapter=Adapter({'x': E(projector_valid.name, 'features')}),
                                  experiment_directory=exp_dir,
                                  **kwargs)
    return projector_pandas, projector_pandas_valid
def _credit_card_balance_groupby_agg(config, train_mode, suffix, **kwargs):
    """Groupby-aggregate-merge features from the credit card balance table."""
    exp_dir = config.pipeline.experiment_directory
    agg = Step(name='credit_card_balance_groupby_agg{}'.format(suffix),
               transformer=fe.GroupbyAggregateMerge(**config.credit_card_balance),
               input_data=['application', 'credit_card_balance'],
               adapter=Adapter({'main_table': E('application', 'X'),
                                'side_table': E('credit_card_balance', 'X')}),
               experiment_directory=exp_dir,
               **kwargs)
    if not train_mode:
        return agg
    # Validation twin reuses the fitted train step as its transformer.
    agg_valid = Step(name='credit_card_balance_groupby_agg_valid{}'.format(suffix),
                     transformer=agg,
                     input_data=['application', 'credit_card_balance'],
                     adapter=Adapter({'main_table': E('application', 'X_valid'),
                                      'side_table': E('credit_card_balance', 'X')}),
                     experiment_directory=exp_dir,
                     **kwargs)
    return agg, agg_valid
def _previous_applications_groupby_agg(config, train_mode, suffix, **kwargs):
    """Groupby-aggregate-merge features from the previous applications table."""
    exp_dir = config.pipeline.experiment_directory
    agg = Step(name='previous_applications_groupby_agg{}'.format(suffix),
               transformer=fe.GroupbyAggregateMerge(**config.previous_applications),
               input_data=['application', 'previous_application'],
               adapter=Adapter({'main_table': E('application', 'X'),
                                'side_table': E('previous_application', 'X')}),
               experiment_directory=exp_dir,
               **kwargs)
    if not train_mode:
        return agg
    # Validation twin reuses the fitted train step as its transformer.
    agg_valid = Step(name='previous_applications_groupby_agg_valid{}'.format(suffix),
                     transformer=agg,
                     input_data=['application', 'previous_application'],
                     adapter=Adapter({'main_table': E('application', 'X_valid'),
                                      'side_table': E('previous_application', 'X')}),
                     experiment_directory=exp_dir,
                     **kwargs)
    return agg, agg_valid
def _tap4fun_groupby_agg(config, train_mode, suffix, **kwargs):
    """Groupby-aggregate features over the tap4fun table (train + optional valid)."""
    exp_dir = config.pipeline.experiment_directory
    agg = Step(name='tap4fun_groupby_agg{}'.format(suffix),
               transformer=fe.GroupbyAggregate(**config.tap4fun.aggregations),
               is_trainable=True,
               input_data=['tap4fun'],
               adapter=Adapter({'main_table': E('tap4fun', 'X')}),
               experiment_directory=exp_dir,
               **kwargs)
    if not train_mode:
        return agg
    # Share the already-fitted transformer object for the validation split.
    agg_valid = Step(name='tap4fun_groupby_agg_valid{}'.format(suffix),
                     transformer=agg.transformer,
                     input_data=['tap4fun'],
                     adapter=Adapter({'main_table': E('tap4fun', 'X_valid')}),
                     experiment_directory=exp_dir,
                     **kwargs)
    return agg, agg_valid
def _tap4fun(config, train_mode, suffix, **kwargs):
    """Hand-crafted tap4fun features built on top of the cleaning step(s)."""
    cleaned = _tap4fun_cleaning(config, train_mode, suffix, **kwargs)
    if train_mode:
        cleaned, cleaned_valid = cleaned
    exp_dir = config.pipeline.experiment_directory
    tap4fun = Step(name='tap4fun_hand_crafted{}'.format(suffix),
                   transformer=fe.Tap4funFeatures(**config.tap4fun.columns),
                   input_steps=[cleaned],
                   adapter=Adapter({'X': E(cleaned.name, 'X')}),
                   experiment_directory=exp_dir,
                   **kwargs)
    if not train_mode:
        return tap4fun
    # Validation step shares the fitted transformer object.
    tap4fun_valid = Step(name='tap4fun_hand_crafted_valid{}'.format(suffix),
                         transformer=tap4fun.transformer,
                         input_steps=[cleaned_valid],
                         adapter=Adapter({'X': E(cleaned_valid.name, 'X')}),
                         experiment_directory=exp_dir,
                         **kwargs)
    return tap4fun, tap4fun_valid
def postprocessing_pipeline_simplified(cache_dirpath, loader_mode):
    """Resize unet mask predictions back to original size, then binarize."""
    if loader_mode == 'resize_and_pad':
        adjust = partial(crop_image, target_size=ORIGINAL_SIZE)
    elif loader_mode == 'resize':
        adjust = partial(resize_image, target_size=ORIGINAL_SIZE)
    else:
        raise NotImplementedError

    mask_resize = Step(name='mask_resize',
                       transformer=make_apply_transformer(adjust,
                                                          output_name='resized_images',
                                                          apply_on=['images']),
                       input_data=['unet_output'],
                       adapter=Adapter({'images': E('unet_output', 'mask_prediction')}),
                       experiment_directory=cache_dirpath)
    binarizer = Step(name='binarizer',
                     transformer=make_apply_transformer(
                         partial(binarize, threshold=THRESHOLD),
                         output_name='binarized_images',
                         apply_on=['images']),
                     input_steps=[mask_resize],
                     adapter=Adapter({'images': E(mask_resize.name, 'resized_images')}),
                     experiment_directory=cache_dirpath)
    return Step(name='output',
                transformer=IdentityOperation(),
                input_steps=[binarizer],
                adapter=Adapter({'y_pred': E(binarizer.name, 'binarized_images')}),
                experiment_directory=cache_dirpath)
def select_features_from_model(features, features_valid, config, train_mode, suffix, **kwargs):
    """Select features with a model-based selector (median importance threshold).

    Fix: ``**kwargs`` were accepted by the signature but silently dropped;
    every other step-builder in this module forwards them to ``Step``.
    They are now forwarded to both the train and valid steps.
    """
    select_features_step = Step(
        name='select_features_from_model{}'.format(suffix),
        transformer=fe.SelectFeaturesFromModel(threshold='median'),
        input_data=['tap4fun'],
        input_steps=[features],
        is_trainable=True,
        adapter=Adapter({'X': E(features.name, 'features'),
                         'y': E('tap4fun', 'y')}),
        experiment_directory=config.pipeline.experiment_directory,
        **kwargs)
    if train_mode:
        select_features_valid_step = Step(
            name='select_features_from_model_valid{}'.format(suffix),
            # Reuse the fitted selector for the validation split.
            transformer=select_features_step.transformer,
            input_steps=[features_valid],
            adapter=Adapter({'X': E(features_valid.name, 'features')}),
            experiment_directory=config.pipeline.experiment_directory,
            **kwargs)
        return select_features_step, select_features_valid_step
    return select_features_step
def retinanet(config, train_mode, visualize=False):
    """RetinaNet pipeline: loader -> model, plus postprocessing at inference.

    In train mode returns the model step; with ``visualize`` returns the
    visualization pipeline; otherwise the submission-producing output step.
    """
    persist_output = False
    load_persisted_output = False
    loader = preprocessing_generator(config, is_train=train_mode)
    # Local renamed from `retinanet` to avoid shadowing this function's name.
    retina_step = Step(name='retinanet',
                       transformer=Retina(**config.retinanet, train_mode=train_mode),
                       input_steps=[loader],
                       experiment_directory=config.env.cache_dirpath,
                       persist_output=persist_output,
                       is_trainable=True,
                       load_persisted_output=load_persisted_output)
    if train_mode:
        return retina_step
    if visualize:
        return visualizer(retina_step, loader.get_step('label_encoder'), config)
    postprocessor = postprocessing(retina_step, loader.get_step('label_encoder'), config)
    return Step(name='output',
                transformer=IdentityOperation(),
                input_steps=[postprocessor],
                adapter=Adapter({'y_pred': E(postprocessor.name, 'submission')}),
                experiment_directory=config.env.cache_dirpath,
                persist_output=persist_output,
                load_persisted_output=load_persisted_output)
def visualizer(model, label_encoder, config):
    """Decode labels and model outputs, then render them with Visualizer."""
    cache_dir = config.env.cache_dirpath
    label_decoder = Step(name='label_decoder',
                         transformer=GoogleAiLabelDecoder(),
                         input_steps=[label_encoder],
                         experiment_directory=cache_dir)
    decoder = Step(name='decoder',
                   transformer=DataDecoder(**config.postprocessing.data_decoder),
                   input_data=['input'],
                   input_steps=[model],
                   experiment_directory=cache_dir)
    return Step(name='visualizer',
                transformer=Visualizer(),
                input_steps=[label_decoder, decoder],
                input_data=['input'],
                adapter=Adapter({'images_data': E('input', 'images_data'),
                                 'results': E(decoder.name, 'results'),
                                 'decoder_dict': E(label_decoder.name, 'inverse_mapping')}),
                experiment_directory=cache_dir)
def _numerical_transforms(dispatchers, config, train_mode, suffix, **kwargs):
    """Apply log(x + 1) to the numerical features (train + optional valid)."""
    if train_mode:
        split, split_valid = dispatchers
    else:
        split = dispatchers
    exp_dir = config.pipeline.experiment_directory
    log_num = Step(name='log_num{}'.format(suffix),
                   transformer=make_transformer(lambda x: np.log(x + 1),
                                                output_name='numerical_features'),
                   input_steps=[split],
                   adapter=Adapter({'x': E(split.name, 'numerical_features')}),
                   experiment_directory=exp_dir,
                   **kwargs)
    if not train_mode:
        return log_num
    # Validation twin reuses the train step as its transformer.
    log_num_valid = Step(name='log_num_valid{}'.format(suffix),
                         transformer=log_num,
                         input_steps=[split_valid],
                         adapter=Adapter({'x': E(split_valid.name, 'numerical_features')}),
                         experiment_directory=exp_dir,
                         **kwargs)
    return log_num, log_num_valid
def postprocessing(model, label_encoder, config):
    """Decode model outputs and format them into a submission."""
    cache_dir = config.env.cache_dirpath
    label_decoder = Step(name='label_decoder',
                         transformer=GoogleAiLabelDecoder(),
                         input_steps=[label_encoder],
                         experiment_directory=cache_dir)
    decoder = Step(name='decoder',
                   transformer=DataDecoder(**config.postprocessing.data_decoder),
                   input_steps=[model],
                   experiment_directory=cache_dir)
    return Step(name='submission_producer',
                transformer=PredictionFormatter(
                    **config.postprocessing.prediction_formatter),
                input_steps=[label_decoder, decoder],
                input_data=['input'],
                adapter=Adapter({'image_ids': E('input', 'img_ids'),
                                 'results': E(decoder.name, 'results'),
                                 'decoder_dict': E(label_decoder.name, 'inverse_mapping')}),
                experiment_directory=cache_dir)
def _feature_by_type_splits(config, train_mode, suffix):
    """Split input features by inferred type; add a valid twin in train mode.

    The train-mode and inference-mode branches built an identical train step,
    so it is constructed once and the valid step is appended conditionally.
    """
    split = Step(name='inferred_type_splitter{}'.format(suffix),
                 transformer=fe.InferredTypeSplitter(),
                 input_data=['input'],
                 adapter=Adapter({'X': E('input', 'X')}),
                 experiment_directory=config.pipeline.experiment_directory)
    if not train_mode:
        return split
    split_valid = Step(name='inferred_type_splitter_valid{}'.format(suffix),
                       transformer=split,
                       input_data=['input'],
                       adapter=Adapter({'X': E('input', 'X_valid')}),
                       experiment_directory=config.pipeline.experiment_directory)
    return split, split_valid
def preprocessing_inference(config, model_name='unet', suffix=''):
    """Build the inference loader; the reader depends on the image source."""
    if config.general.loader_mode == 'resize_and_pad':
        loader_config = config.loaders.resize_and_pad
    elif config.general.loader_mode == 'resize':
        loader_config = config.loaders.resize
    else:
        raise NotImplementedError

    exp_dir = config.execution.experiment_dir
    image_source = loader_config.dataset_params.image_source
    if image_source == 'memory':
        reader_inference = Step(name='reader_inference{}'.format(suffix),
                                transformer=loaders.ImageReader(
                                    train_mode=False, **config.reader[model_name]),
                                input_data=['input'],
                                adapter=Adapter({'meta': E('input', 'meta')}),
                                experiment_directory=exp_dir)
    elif image_source == 'disk':
        reader_inference = Step(name='xy_inference{}'.format(suffix),
                                transformer=loaders.XYSplit(
                                    train_mode=False, **config.xy_splitter[model_name]),
                                input_data=['input'],
                                adapter=Adapter({'meta': E('input', 'meta')}),
                                experiment_directory=exp_dir)
    else:
        raise NotImplementedError

    return Step(name='loader{}'.format(suffix),
                transformer=loaders.ImageSegmentationLoader(train_mode=False,
                                                            **loader_config),
                input_steps=[reader_inference],
                adapter=Adapter({'X': E(reader_inference.name, 'X'),
                                 'y': E(reader_inference.name, 'y')}),
                experiment_directory=exp_dir,
                cache_output=True)
def test_inputs_with_conflicting_names_require_adapter(data):
    """Two inputs sharing key names must raise StepsError without an adapter."""
    ambiguous_step = Step(name='test_inputs_with_conflicting_names_require_adapter',
                          transformer=IdentityOperation(),
                          input_data=['input_1', 'input_3'],
                          experiment_directory=EXP_DIR)
    with pytest.raises(StepsError):
        ambiguous_step.fit_transform(data)
def preprocessing_inference_tta(config, model_name='unet', suffix=''):
    """TTA inference preprocessing: meta split -> TTA generator -> image loader.

    Bug fix: the tta_generator adapter referenced the hardcoded step name
    'reader_inference', which breaks whenever a non-empty ``suffix`` is passed
    (the actual step is named 'reader_inference{suffix}'). It now uses
    ``reader_inference.name``, matching the style of the other adapters here.
    """
    reader_inference = Step(name='reader_inference{}'.format(suffix),
                            transformer=loaders.XYSplit(
                                train_mode=False, **config.xy_splitter[model_name]),
                            input_data=['input'],
                            adapter=Adapter({'meta': E('input', 'meta')}),
                            experiment_directory=config.env.experiment_dir)
    tta_generator = Step(name='tta_generator{}'.format(suffix),
                         transformer=loaders.MetaTestTimeAugmentationGenerator(
                             **config.tta_generator),
                         input_steps=[reader_inference],
                         adapter=Adapter({'X': E(reader_inference.name, 'X')}),
                         experiment_directory=config.env.experiment_dir)
    if config.execution.loader_mode == 'crop_and_pad':
        Loader = loaders.ImageSegmentationLoaderCropPadTTA
    elif config.execution.loader_mode == 'resize':
        Loader = loaders.ImageSegmentationLoaderResizeTTA
    else:
        raise NotImplementedError
    loader = Step(name='loader{}'.format(suffix),
                  transformer=Loader(**config.loader),
                  input_steps=[tta_generator],
                  adapter=Adapter({'X': E(tta_generator.name, 'X_tta'),
                                   'tta_params': E(tta_generator.name, 'tta_params')}),
                  experiment_directory=config.env.experiment_dir,
                  cache_output=True)
    return loader, tta_generator
def preprocessing_fillna(features, config, train_mode, suffix, **kwargs):
    """ impute missing value by condition """
    # Both branches build the same step; only inputs/adapter differ.
    if train_mode:
        features_train, features_valid = features
        input_steps = [features_train, features_valid]
        adapter = Adapter({'X': E(features_train.name, 'features'),
                           'X_valid': E(features_valid.name, 'features')})
    else:
        input_steps = [features]
        adapter = Adapter({'X': E(features.name, 'features')})
    return Step(name='fillna{}'.format(suffix),
                transformer=_fillna(config.preprocessing.impute_missing.fill_value),
                input_steps=input_steps,
                adapter=adapter,
                experiment_directory=config.pipeline.experiment_directory,
                **kwargs)
def _application_groupby_agg(config, train_mode, suffix, **kwargs):
    """Groupby-aggregate-diff features on the raw application table."""
    exp_dir = config.pipeline.experiment_directory
    agg = Step(name='application_groupby_agg{}'.format(suffix),
               transformer=fe.GroupbyAggregateDiffs(
                   **config.applications.aggregations),
               input_data=['application'],
               adapter=Adapter({'main_table': E('application', 'X')}),
               experiment_directory=exp_dir,
               **kwargs)
    if not train_mode:
        return agg
    # Validation twin reuses the fitted train step as its transformer.
    agg_valid = Step(name='application_groupby_agg_valid{}'.format(suffix),
                     transformer=agg,
                     input_data=['application'],
                     adapter=Adapter({'main_table': E('application', 'X_valid')}),
                     experiment_directory=exp_dir,
                     **kwargs)
    return agg, agg_valid
def preprocessing_inference(config, model_name='unet', suffix=''):
    """Inference-time preprocessing: xy split followed by the segmentation loader."""
    if config.execution.loader_mode == 'crop_and_pad':
        Loader = loaders.ImageSegmentationLoaderCropPad
    elif config.execution.loader_mode == 'resize':
        Loader = loaders.ImageSegmentationLoaderResize
    else:
        raise NotImplementedError

    reader_inference = Step(name='xy_inference{}'.format(suffix),
                            transformer=loaders.XYSplit(
                                train_mode=False, **config.xy_splitter[model_name]),
                            input_data=['input'],
                            adapter=Adapter({'meta': E('input', 'meta')}),
                            experiment_directory=config.env.experiment_dir)
    return Step(name='loader{}'.format(suffix),
                transformer=Loader(train_mode=False, **config.loader),
                input_steps=[reader_inference],
                adapter=Adapter({'X': E(reader_inference.name, 'X'),
                                 'y': E(reader_inference.name, 'y')}),
                experiment_directory=config.env.experiment_dir,
                cache_output=True)
def _bureau(config, train_mode, **kwargs):
    """Bureau-table aggregation features (train + optional valid twin).

    The two branches built an identical train step, so it is constructed
    once and the valid step is appended conditionally.
    """
    bureau = Step(name='bureau',
                  transformer=fe.GroupbyAggregationFromFile(**config.bureau),
                  input_data=['input'],
                  adapter=Adapter({'X': E('input', 'X')}),
                  experiment_directory=config.pipeline.experiment_directory,
                  **kwargs)
    if not train_mode:
        return bureau
    bureau_valid = Step(name='bureau_valid',
                        transformer=bureau,
                        input_data=['input'],
                        adapter=Adapter({'X': E('input', 'X_valid')}),
                        experiment_directory=config.pipeline.experiment_directory,
                        **kwargs)
    return bureau, bureau_valid
def data_cleaning_v2(config, train_mode, suffix, **kwargs):
    """v1 cleaning followed by missing-value dummy indicators."""
    cleaned = data_cleaning_v1(config, train_mode, suffix, **kwargs)
    if train_mode:
        cleaned, cleaned_valid = cleaned
    exp_dir = config.pipeline.experiment_directory
    impute_missing = Step(name='dummies_missing{}'.format(suffix),
                          transformer=dc.DummiesMissing(**config.dummies_missing),
                          input_steps=[cleaned],
                          adapter=Adapter({'X': E(cleaned.name, 'numerical_features')}),
                          experiment_directory=exp_dir,
                          **kwargs)
    if not train_mode:
        return impute_missing
    # Validation twin reuses the train step as its transformer.
    impute_missing_valid = Step(name='dummies_missing_valid{}'.format(suffix),
                                transformer=impute_missing,
                                input_steps=[cleaned_valid],
                                adapter=Adapter({'X': E(cleaned_valid.name,
                                                        'numerical_features')}),
                                experiment_directory=exp_dir,
                                **kwargs)
    return impute_missing, impute_missing_valid
def mask_postprocessing(config, suffix=''):
    """Resize predicted masks back to the original size, then binarize them."""
    if config.general.loader_mode == 'crop_and_pad':
        adjust = partial(crop_image, target_size=config.general.original_size)
    elif config.general.loader_mode == 'resize':
        adjust = partial(resize_image, target_size=config.general.original_size)
    else:
        raise NotImplementedError

    mask_resize = Step(name='mask_resize{}'.format(suffix),
                       transformer=make_apply_transformer(adjust,
                                                          output_name='resized_images',
                                                          apply_on=['images']),
                       input_data=['input_masks'],
                       adapter=Adapter({'images': E('input_masks', 'mask_prediction')}),
                       experiment_directory=config.execution.experiment_dir)
    return Step(name='binarizer{}'.format(suffix),
                transformer=make_apply_transformer(
                    partial(binarize, threshold=config.thresholder.threshold_masks),
                    output_name='binarized_images',
                    apply_on=['images']),
                input_steps=[mask_resize],
                adapter=Adapter({'images': E(mask_resize.name, 'resized_images')}),
                experiment_directory=config.execution.experiment_dir)
def _feature_by_type_splits(config, train_mode):
    """Split the input dataframe into per-type feature groups.

    The two branches built an identical train step, so it is constructed
    once and the valid step is appended conditionally.
    """
    split = Step(name='feature_by_type_split',
                 transformer=fe.DataFrameByTypeSplitter(
                     **config.dataframe_by_type_splitter),
                 input_data=['input'],
                 adapter=Adapter({'X': E('input', 'X')}),
                 cache_dirpath=config.env.cache_dirpath)
    if not train_mode:
        return split
    split_valid = Step(name='feature_by_type_split_valid',
                       transformer=split,
                       input_data=['input'],
                       adapter=Adapter({'X': E('input', 'X_valid')}),
                       cache_dirpath=config.env.cache_dirpath)
    return split, split_valid
def preprocessing_binary_train(config, model_name, suffix='_binary_model'):
    """Train-time loader for the one-class (binary) image classifier."""
    reader_train = Step(name='xy_train{}'.format(suffix),
                        transformer=loaders.MetaReader(
                            train_mode=True, **config.meta_reader[model_name]),
                        input_data=['input'],
                        adapter=Adapter({'meta': E('input', 'meta')}))
    reader_inference = Step(name='xy_inference{}'.format(suffix),
                            transformer=loaders.MetaReader(
                                train_mode=True, **config.meta_reader[model_name]),
                            input_data=['callback_input'],
                            adapter=Adapter({'meta': E('callback_input', 'meta_valid')}))
    loader_transformer = OneClassImageClassificatioLoader(
        train_mode=True,
        loader_params=config.loaders.resize.loader_params,
        dataset_params=config.loaders.resize.dataset_params,
        augmentation_params=config.loaders.resize.augmentation_params)
    return Step(name='loader{}'.format(suffix),
                transformer=loader_transformer,
                input_steps=[reader_train, reader_inference],
                adapter=Adapter({'X': E(reader_train.name, 'X'),
                                 'y': E(reader_train.name, 'y'),
                                 'X_valid': E(reader_inference.name, 'X'),
                                 'y_valid': E(reader_inference.name, 'y')}))
def preprocessing_train(config, model_name='network'):
    """Train-time preprocessing: meta readers for train/valid feeding one loader."""
    if config.general.loader_mode == 'resize':
        loader_config = config.loaders.resize
        LOADER = loaders.ImageSegmentationLoaderResize
    else:
        raise NotImplementedError

    reader_train = Step(name='xy_train',
                        transformer=loaders.MetaReader(
                            train_mode=True, **config.meta_reader[model_name]),
                        input_data=['input'],
                        adapter=Adapter({'meta': E('input', 'meta')}))
    reader_inference = Step(name='xy_inference',
                            transformer=loaders.MetaReader(
                                train_mode=True, **config.meta_reader[model_name]),
                            input_data=['callback_input'],
                            adapter=Adapter({'meta': E('callback_input', 'meta_valid')}))
    return Step(name='loader',
                transformer=LOADER(train_mode=True, **loader_config),
                input_steps=[reader_train, reader_inference],
                adapter=Adapter({'X': E(reader_train.name, 'X'),
                                 'y': E(reader_train.name, 'y'),
                                 'X_valid': E(reader_inference.name, 'X'),
                                 'y_valid': E(reader_inference.name, 'y')}))
def emptiness_preprocessing_train(config, model_name='network', suffix=''):
    """Train-time loader for the emptiness classifier."""
    exp_dir = config.execution.experiment_dir
    reader_train = Step(name='xy_train{}'.format(suffix),
                        transformer=loaders.XYSplit(
                            train_mode=True, **config.xy_splitter[model_name]),
                        input_data=['input'],
                        adapter=Adapter({'meta': E('input', 'meta')}),
                        experiment_directory=exp_dir)
    reader_inference = Step(name='xy_inference{}'.format(suffix),
                            transformer=loaders.XYSplit(
                                train_mode=True, **config.xy_splitter[model_name]),
                            input_data=['callback_input'],
                            adapter=Adapter({'meta': E('callback_input', 'meta_valid')}),
                            experiment_directory=exp_dir)
    return Step(name='loader{}'.format(suffix),
                transformer=loaders.EmptinessLoader(train_mode=True,
                                                    **config.loaders.resize),
                input_steps=[reader_train, reader_inference],
                adapter=Adapter({'X': E(reader_train.name, 'X'),
                                 'y': E(reader_train.name, 'y'),
                                 'X_valid': E(reader_inference.name, 'X'),
                                 'y_valid': E(reader_inference.name, 'y')}),
                experiment_directory=exp_dir)
def preprocessing_inference_tta(config, model_name='network'):
    """TTA inference preprocessing: meta reader -> TTA generator -> loader."""
    if config.general.loader_mode == 'resize':
        loader_config = config.loaders.resize_tta
        LOADER = loaders.ImageSegmentationLoaderResizeTTA
    else:
        raise NotImplementedError

    reader_inference = Step(name='reader_inference',
                            transformer=loaders.MetaReader(
                                train_mode=False, **config.meta_reader[model_name]),
                            input_data=['input'],
                            adapter=Adapter({'meta': E('input', 'meta')}))
    tta_generator = Step(name='tta_generator',
                         transformer=loaders.MetaTestTimeAugmentationGenerator(
                             **config.tta_generator),
                         input_steps=[reader_inference],
                         adapter=Adapter({'X': E('reader_inference', 'X')}))
    loader = Step(name='loader',
                  transformer=LOADER(**loader_config),
                  input_steps=[tta_generator],
                  adapter=Adapter({'X': E(tta_generator.name, 'X_tta'),
                                   'tta_params': E(tta_generator.name, 'tta_params')}))
    return loader, tta_generator
def _application(config, train_mode, suffix, **kwargs):
    """Hand-crafted application features built on the cleaning step(s)."""
    cleaned = _application_cleaning(config, train_mode, suffix, **kwargs)
    if train_mode:
        cleaned, cleaned_valid = cleaned
    exp_dir = config.pipeline.experiment_directory
    application = Step(name='application_hand_crafted{}'.format(suffix),
                       transformer=fe.ApplicationFeatures(**config.applications.columns),
                       input_steps=[cleaned],
                       adapter=Adapter({'X': E(cleaned.name, 'X')}),
                       experiment_directory=exp_dir,
                       **kwargs)
    if not train_mode:
        return application
    # Validation twin reuses the train step as its transformer.
    application_valid = Step(name='application_hand_crafted_valid{}'.format(suffix),
                             transformer=application,
                             input_steps=[cleaned_valid],
                             adapter=Adapter({'X': E(cleaned_valid.name, 'X')}),
                             experiment_directory=exp_dir,
                             **kwargs)
    return application, application_valid
def _application_groupby_agg(config, train_mode, suffix, **kwargs):
    """Groupby-aggregate-diff features computed on the cleaned application data."""
    cleaned = _application_cleaning(config, train_mode, suffix, **kwargs)
    if train_mode:
        cleaned, cleaned_valid = cleaned
    exp_dir = config.pipeline.experiment_directory
    agg = Step(name='application_groupby_agg{}'.format(suffix),
               transformer=fe.GroupbyAggregateDiffs(
                   **config.applications.aggregations),
               input_steps=[cleaned],
               adapter=Adapter({'main_table': E(cleaned.name, 'X')}),
               experiment_directory=exp_dir,
               **kwargs)
    if not train_mode:
        return agg
    # Validation twin reuses the fitted train step as its transformer.
    agg_valid = Step(name='application_groupby_agg_valid{}'.format(suffix),
                     transformer=agg,
                     input_steps=[cleaned_valid],
                     adapter=Adapter({'main_table': E(cleaned_valid.name, 'X')}),
                     experiment_directory=exp_dir,
                     **kwargs)
    return agg, agg_valid
def _categorical_encoders(config, train_mode, suffix, **kwargs):
    """Fit/apply the categorical encoder on the application data."""
    exp_dir = config.pipeline.experiment_directory
    encoder = Step(name='categorical_encoder{}'.format(suffix),
                   transformer=fe.CategoricalEncoder(
                       **config.preprocessing.categorical_encoder),
                   input_data=['application'],
                   adapter=Adapter({'X': E('application', 'X'),
                                    'y': E('application', 'y')}),
                   experiment_directory=exp_dir,
                   **kwargs)
    if not train_mode:
        return encoder
    # Validation twin reuses the fitted train step as its transformer.
    encoder_valid = Step(name='categorical_encoder_valid{}'.format(suffix),
                         transformer=encoder,
                         input_data=['application'],
                         adapter=Adapter({'X': E('application', 'X_valid'),
                                          'y': E('application', 'y_valid')}),
                         experiment_directory=exp_dir,
                         **kwargs)
    return encoder, encoder_valid
def test_inputs_with_conflicting_names_require_adapter(data):
    """Conflicting input key names without an adapter must raise StepsError."""
    ambiguous_step = Step(name='test_inputs_with_conflicting_names_require_adapter',
                          transformer=IdentityOperation(),
                          input_data=['input_1', 'input_3'],
                          cache_dirpath=CACHE_DIRPATH)
    with pytest.raises(StepsError):
        ambiguous_step.fit_transform(data)
def main():
    """Demo pipeline: normalize -> PCA -> KNN on numeric application features.

    Bug fixes:
    - The numeric-column filter previously kept 'TARGET' (an integer column),
      leaking the label into the feature matrix; it is now excluded.
    - Removed an unused second read of the *train* CSV into ``data_test``.
    """
    data_train = pd.read_csv('../../new_input/app_train.csv', nrows=5000)

    # Keep numeric columns only, and never the label itself (it is numeric too).
    feature_cols = [col for col in data_train.columns
                    if data_train[col].dtypes != 'object' and col != 'TARGET']
    print(len(feature_cols))

    y_train = data_train['TARGET']
    X_train = data_train[feature_cols]
    print(X_train.shape, y_train.shape)

    # Build the train / validation data structures expected by the steps.
    trn_X, val_X, trn_y, val_y = train_test_split(X_train, y_train)
    train_data = {'input': {'X': trn_X, 'y': trn_y}}
    val_data = {'input': {'X': val_X, 'y': val_y}}

    norm_step = Step(name='Normalizer',
                     transformer=NormalizationTransformer(),
                     input_data=['input'],
                     adapter=Adapter({'X': E('input', 'X')}),
                     experiment_directory=EXPERIMENT_DIR)
    pca_step = Step(name='PCA',
                    transformer=PCATransformer(),
                    input_steps=[norm_step],
                    experiment_directory=EXPERIMENT_DIR)
    knn_step = Step(name='KNN',
                    transformer=KNNTransformer(),
                    input_data=['input'],
                    input_steps=[pca_step],
                    adapter=Adapter({'X': E('PCA', 'X'),
                                     'y': E('input', 'y')}),
                    experiment_directory=EXPERIMENT_DIR)
    print(knn_step)

    knn_step.fit_transform(train_data)
    val_pred = knn_step.transform(val_data)
    print(roc_auc_score(val_y, val_pred['y_pred']))