def start_project_with_settings(fake_jobs_df):
    """
    Create a DataRobot project from fake_jobs_df and run quick autopilot.

    Configuration is read from the module-level ``ts_setting`` mapping
    (keys used: "downsampling", "holdout_pct", "validation_pct",
    "project_name", "target").

    :param fake_jobs_df: already enriched dataset; its "job_id" column is
        dropped before upload
    :return: project, after autopilot has finished
    """
    # Build the local configuration objects first so that a missing setting
    # fails before anything is uploaded.
    downsampling_options = dr.AdvancedOptions(
        response_cap=0.7,
        blueprint_threshold=2,
        smart_downsampled=True,
        majority_downsampling_rate=ts_setting["downsampling"],
    )
    tvh_partition = dr.StratifiedTVH(
        ts_setting["holdout_pct"],
        ts_setting["validation_pct"],
        seed=0,
    )

    # Upload the modelling frame (without the identifier column).
    modeling_frame = fake_jobs_df.drop(columns=["job_id"])
    uploaded = dr.Dataset.create_from_in_memory_data(data_frame=modeling_frame)
    project = uploaded.create_project(project_name=ts_setting["project_name"])

    project.set_target(
        target=ts_setting["target"],
        mode=dr.enums.AUTOPILOT_MODE.QUICK,
        partitioning_method=tvh_partition,
        advanced_options=downsampling_options,
        worker_count=-1,
    )
    # Holdout is unlocked right after the target is set, then we block
    # silently until autopilot completes.
    project.unlock_holdout()
    project.wait_for_autopilot(verbosity=dr.VERBOSITY_LEVEL.SILENT)
    return project
def run_cp_project(df, proj_name, target, unique_index, window_col, workers):
    """
    Run a full-autopilot project constrained to monotonic blueprints.

    :param df: data to upload as the project source
    :param proj_name: name given to the new project
    :param target: target column; the positive class is fixed to 1
    :param unique_index: column used as the group-partitioning key
    :param window_col: feature forced to be monotonically increasing
    :param workers: worker count applied after the target is set
    :return: project, after autopilot has finished
    """
    project = dr.Project.create(df, proj_name, max_wait=9999)

    # Group partitioning on the index column: no holdout, 20% validation.
    partitioning = dr.GroupTVH(
        holdout_pct=0,
        validation_pct=20,
        partition_key_cols=[unique_index],
    )

    # Single-feature list that carries the monotonic-increasing constraint.
    increasing_list = project.create_featurelist(
        name='mono_up',
        features=[window_col],
    )
    monotonic_options = dr.AdvancedOptions(
        monotonic_increasing_featurelist_id=increasing_list.id,
        only_include_monotonic_blueprints=True,
    )

    project.set_target(
        target=target,
        positive_class=1,
        partitioning_method=partitioning,
        mode=dr.AUTOPILOT_MODE.FULL_AUTO,
        max_wait=9999,
        advanced_options=monotonic_options,
    )
    project.set_worker_count(workers)
    project.wait_for_autopilot()
    return project
def run_dr_project(mlbench_project):
    """
    Run a DataRobot modeling project for one MLbench dataset.

    ``mlbench_project`` is the dataset metadata; the keys read here are
    'train_dataset', 'name', 'target_name' and 'metric'. Returns the
    project once autopilot has finished.
    """
    logger.info('DataRobot: Creating project...')
    project = dr.Project.create(
        mlbench_project['train_dataset'],
        project_name=mlbench_project['name'],
    )

    logger.info('DataRobot: Aim...')
    # Collected in the same order the arguments were originally evaluated.
    aim_kwargs = {
        'target': mlbench_project['target_name'],
        'metric': mlbench_project['metric'],
        'partitioning_method': dr.StratifiedCV(holdout_pct=20, reps=5),
        'advanced_options': dr.AdvancedOptions(accuracy_optimized_mb=True),
        'worker_count': MAX_DATAROBOT_WORKERS,
    }
    project.set_target(**aim_kwargs)

    logger.info('DataRobot: Waiting for autopilot...')
    project.wait_for_autopilot()
    return project
# Look up which metrics are available for the target (optional; the value
# is not stored, so this is only informative when run interactively).
project.get_metrics('was_delayed')['available_metrics']

# Register a custom feature list (optional) - here, every column of df.
featurelist = project.create_featurelist(
    'myfeatures',
    list(df.columns.values),
)

# Other advanced options are described in the client docs, e.g.
# https://datarobot-public-api-client.readthedocs-hosted.com/en/v2.17.0/autodoc/api_reference.html#advanced-options-api

# Start full autopilot with the more accurate (accuracy-optimized) models
# and wait for it to finish.
project.set_target(
    target='was_delayed',
    featurelist_id=featurelist.id,
    metric='AUC',
    advanced_options=dr.AdvancedOptions(accuracy_optimized_mb=True),
    mode=dr.AUTOPILOT_MODE.FULL_AUTO,
    worker_count=-1,
)
project.wait_for_autopilot()

# Train custom models by hand - e.g. every blueprint that has a process
# whose name mentions Fasttext (word embeddings).
blueprints = project.get_blueprints()
fasttext = list(
    filter(
        lambda bp: any('Fasttext' in p for p in bp.processes),
        blueprints,
    )
)
for f in fasttext:
    job = project.train(
        f,
        sample_pct=64,
        source_project_id=project.id,
        scoring_type=dr.enums.SCORING_TYPE.cross_validation,
    )
    # Block until this model job has produced its model.
    model = dr.models.modeljob.wait_for_async_model_creation(project.id, job)
# Maps a project time unit to the (duration keyword, multiplier) used to
# express it in minutes/hours/days. Weeks become 7 days; months are
# approximated as 31 days.
_TS_UNIT_TO_DURATION_PART = {
    'minute': ('minutes', 1),
    'hour': ('hours', 1),
    'day': ('days', 1),
    'week': ('days', 7),
    'month': ('days', 31),
}


def _ts_duration_parts(time_unit, amount):
    """Express ``amount`` steps of ``time_unit`` as minutes/hours/days kwargs."""
    part_name, multiplier = _TS_UNIT_TO_DURATION_PART[time_unit]
    parts = {'minutes': 0, 'hours': 0, 'days': 0}
    parts[part_name] = amount * multiplier
    return parts


def _optional_int(value):
    """Return ``int(value)``, passing ``None`` through unchanged."""
    return None if value is None else int(value)


def create_dr_project(df, project_name, ts_settings, **advanced_options):
    """
    Kickoff single DataRobot project

    df: pandas df. NOTE: the date column is converted to datetime in place,
        so the caller's frame is modified.
    project_name: name of project
    ts_settings: dictionary of parameters for time series project. Keys
        that are missing fall back to the defaults listed in ``settings``
        below; 'date_col' must be provided.
    advanced_options: optional keyword overrides forwarded to
        dr.AdvancedOptions (see ``forwarded_options`` below). Unknown keys
        and 'scaleout_modeling_mode' are accepted but not forwarded.

    Returns:
    --------
    DataRobot project object (autopilot has been started, not awaited)

    Raises:
    -------
    ValueError if the timestep reported by get_timestep is not one of
    minute, hour, day, week or month.
    """
    print('Building Next Project \n...\n')

    #######################
    # Get Advanced Options
    #######################
    opts = {
        'weights': None,
        'response_cap': None,
        'blueprint_threshold': None,
        'seed': None,
        'smart_downsampled': False,
        'majority_downsampling_rate': None,
        'offset': None,
        'exposure': None,
        'accuracy_optimized_mb': None,
        'scaleout_modeling_mode': None,
        'events_count': None,
        'monotonic_increasing_featurelist_id': None,
        'monotonic_decreasing_featurelist_id': None,
        'only_include_monotonic_blueprints': None,
    }
    opts.update(advanced_options)

    # Every recognised option the caller supplied is forwarded; previously
    # several of them (response_cap, blueprint_threshold,
    # majority_downsampling_rate, offset, exposure, events_count) were
    # accepted and then silently dropped. Only non-None values are passed,
    # so the defaults are unchanged.
    # NOTE(review): 'scaleout_modeling_mode' is still not forwarded, as
    # before - confirm whether the installed client supports it.
    forwarded_options = (
        'weights',
        'response_cap',
        'blueprint_threshold',
        'seed',
        'smart_downsampled',
        'majority_downsampling_rate',
        'offset',
        'exposure',
        'accuracy_optimized_mb',
        'events_count',
        'monotonic_increasing_featurelist_id',
        'monotonic_decreasing_featurelist_id',
        'only_include_monotonic_blueprints',
    )
    dr_advanced_options = dr.AdvancedOptions(**{
        name: opts[name]
        for name in forwarded_options
        if opts[name] is not None
    })

    ############################
    # Get Datetime Specification
    ############################
    settings = {
        'max_date': None,
        'known_in_advance': None,
        'num_backtests': None,
        'validation_duration': None,
        'holdout_duration': None,
        'holdout_start_date': None,
        'disable_holdout': False,
        'number_of_backtests': None,
        'backtests': None,
        'use_cross_series_features': None,
        'aggregation_type': None,
        'cross_series_group_by_columns': None,
        'calendar_id': None,
        'use_time_series': False,
        'series_id': None,
        'metric': None,
        'target': None,
        'mode': dr.AUTOPILOT_MODE.FULL_AUTO,  # MANUAL #QUICK
        'date_col': None,
        'fd_start': None,
        'fd_end': None,
        'fdw_start': None,
        'fdw_end': None,
    }
    settings.update(ts_settings)

    date_col = settings['date_col']
    df[date_col] = pd.to_datetime(df[date_col])

    if settings['max_date'] is None:
        settings['max_date'] = df[date_col].max()
    else:
        settings['max_date'] = pd.to_datetime(settings['max_date'])

    # Read from the merged settings (not the raw ts_settings) so that a
    # caller who omits 'known_in_advance' does not hit a KeyError.
    if settings['known_in_advance']:
        settings['known_in_advance'] = [
            dr.FeatureSettings(feat_name, known_in_advance=True)
            for feat_name in settings['known_in_advance']
        ]

    # Update validation and holdout duration, start, and end date
    project_time_unit, _project_time_step = get_timestep(df, settings)
    if project_time_unit not in _TS_UNIT_TO_DURATION_PART:
        raise ValueError(f'{project_time_unit} is not a supported timestep')

    # A missing validation duration is passed on as None instead of being
    # formatted into an invalid duration string.
    validation_duration = None
    if settings['validation_duration'] is not None:
        validation_duration = (
            dr.partitioning_methods.construct_duration_string(
                **_ts_duration_parts(project_time_unit,
                                     settings['validation_duration'])))

    if settings['disable_holdout']:
        # Holdout arithmetic is skipped entirely, so a None
        # holdout_duration no longer raises when holdout is disabled.
        settings['holdout_duration'] = None
        settings['holdout_start_date'] = None
    elif settings['holdout_duration'] is not None:
        holdout_parts = _ts_duration_parts(project_time_unit,
                                           settings['holdout_duration'])
        settings['holdout_start_date'] = (
            settings['max_date'] - dt.timedelta(**holdout_parts))
        settings['holdout_duration'] = (
            dr.partitioning_methods.construct_duration_string(
                **holdout_parts))
    # else: holdout enabled but no duration given - the holdout settings
    # are passed through exactly as supplied.

    # Accept either spelling of the backtest count; 'num_backtests' wins
    # when both are given.
    num_backtests = settings['num_backtests']
    if num_backtests is None:
        num_backtests = settings['number_of_backtests']

    # Single-series projects have no series id; pass None, not [None].
    series_id = settings['series_id']
    multiseries_id_columns = None if series_id is None else [series_id]

    ###############################
    # Create Datetime Specification
    ###############################
    time_partition = dr.DatetimePartitioningSpecification(
        feature_settings=settings['known_in_advance'],
        validation_duration=validation_duration,
        datetime_partition_column=date_col,
        use_time_series=settings['use_time_series'],
        disable_holdout=settings['disable_holdout'],
        # The two holdout values are None when holdout is disabled.
        holdout_start_date=settings['holdout_start_date'],
        holdout_duration=settings['holdout_duration'],
        multiseries_id_columns=multiseries_id_columns,
        # Window bounds left unset stay None instead of crashing int().
        forecast_window_start=_optional_int(settings['fd_start']),
        forecast_window_end=_optional_int(settings['fd_end']),
        feature_derivation_window_start=_optional_int(settings['fdw_start']),
        feature_derivation_window_end=_optional_int(settings['fdw_end']),
        number_of_backtests=num_backtests,
        calendar_id=settings['calendar_id'],
        use_cross_series_features=settings['use_cross_series_features'],
        aggregation_type=settings['aggregation_type'],
        cross_series_group_by_columns=settings[
            'cross_series_group_by_columns'],
    )

    ################
    # Create Project
    ################
    # Announce before the (potentially very long) blocking upload.
    print(f'Creating project {project_name} ...')
    project = dr.Project.create(project_name=project_name,
                                sourcedata=df,
                                max_wait=14400,
                                read_timeout=14400)

    #################
    # Start Autopilot
    #################
    project.set_target(
        target=settings['target'],
        metric=settings['metric'],
        mode=settings['mode'],
        advanced_options=dr_advanced_options,
        worker_count=-1,
        partitioning_method=time_partition,
        max_wait=14400,
    )
    return project