# Kick off the DataRobot project from the uploaded file and report its ID.
proj = dr.Project.create(sourcedata=filename, project_name=project_name, max_wait=3600)
print('Project ID: {}'.format(proj.id))

# ## Identify Known-In-Advance Features
#
# Five columns of this dataset are always known ahead of time, so they will
# also be available at prediction time.
known_in_advance = [
    'Marketing',
    'Near_Xmas',
    'Near_BlackFriday',
    'Holiday',
    'DestinationEvent',
]
feature_settings = [
    dr.FeatureSettings(col, known_in_advance=True) for col in known_in_advance
]

# ## Create a Partition Specification
#
# This is a time-aware problem: training on the present to predict the past
# would leak, so let DataRobot derive the train/validation/holdout partitions
# automatically from the 'Date' column. The file contains one series per
# store, which is why `multiseries_id_columns` points at the 'Store' column.
time_partition = dr.DatetimePartitioningSpecification(
    datetime_partition_column='Date',
    use_time_series=True,
    multiseries_id_columns=['Store'],
    feature_settings=feature_settings,
)
def _merged_advanced_options(overrides):
    """Merge caller overrides into the advanced-option defaults and build a
    dr.AdvancedOptions object (only the subset of options DataRobot's
    AdvancedOptions constructor is given below is forwarded)."""
    defaults = {
        'weights': None,
        'response_cap': None,
        'blueprint_threshold': None,
        'seed': None,
        'smart_downsampled': False,
        'majority_downsampling_rate': None,
        'offset': None,
        'exposure': None,
        'accuracy_optimized_mb': None,
        'scaleout_modeling_mode': None,
        'events_count': None,
        'monotonic_increasing_featurelist_id': None,
        'monotonic_decreasing_featurelist_id': None,
        'only_include_monotonic_blueprints': None,
    }
    defaults.update(overrides)
    return dr.AdvancedOptions(
        weights=defaults['weights'],
        seed=defaults['seed'],
        monotonic_increasing_featurelist_id=defaults['monotonic_increasing_featurelist_id'],
        monotonic_decreasing_featurelist_id=defaults['monotonic_decreasing_featurelist_id'],
        only_include_monotonic_blueprints=defaults['only_include_monotonic_blueprints'],
        accuracy_optimized_mb=defaults['accuracy_optimized_mb'],
        smart_downsampled=defaults['smart_downsampled'],
    )


def _duration_components(project_time_unit, duration):
    """Convert a duration expressed in project time units into the
    minute/hour/day components accepted by construct_duration_string /
    datetime.timedelta.

    Raises ValueError for an unsupported time unit.
    """
    components = {'minute': 0, 'hour': 0, 'day': 0}
    if project_time_unit == 'minute':
        components['minute'] = duration
    elif project_time_unit == 'hour':
        components['hour'] = duration
    elif project_time_unit == 'day':
        components['day'] = duration
    elif project_time_unit == 'week':
        components['day'] = duration * 7
    elif project_time_unit == 'month':
        # Approximation: a "month" is treated as 31 days, matching the
        # original project configuration behavior.
        components['day'] = duration * 31
    else:
        raise ValueError(f'{project_time_unit} is not a supported timestep')
    return components


def create_dr_project(df, project_name, ts_settings, **advanced_options):
    """
    Kickoff single DataRobot project.

    Parameters
    ----------
    df : pandas.DataFrame
        Modeling dataset; must contain the date column named by
        ts_settings['date_col'] (coerced to datetime in place).
    project_name : str
        Name of the DataRobot project to create.
    ts_settings : dict
        Time series parameters (target, metric, date_col, series_id,
        forecast/feature-derivation window bounds, backtest and holdout
        settings, ...). Unspecified keys fall back to the defaults below.
    **advanced_options
        Overrides forwarded to dr.AdvancedOptions (weights, seed, ...).

    Returns
    -------
    dr.Project
        The created project, with autopilot already started.
    """
    print('Building Next Project \n...\n')

    #######################
    # Get Advanced Options
    #######################
    opts = _merged_advanced_options(advanced_options)

    ############################
    # Get Datetime Specification
    ############################
    # Defaults for every supported time series setting; caller values win.
    settings = {
        'max_date': None,
        'known_in_advance': None,
        'num_backtests': None,
        'validation_duration': None,
        'holdout_duration': None,
        'holdout_start_date': None,
        'disable_holdout': False,
        'number_of_backtests': None,
        'backtests': None,
        'use_cross_series_features': None,
        'aggregation_type': None,
        'cross_series_group_by_columns': None,
        'calendar_id': None,
        'use_time_series': False,
        'series_id': None,
        'metric': None,
        'target': None,
        'mode': dr.AUTOPILOT_MODE.FULL_AUTO,  # or MANUAL / QUICK
        'date_col': None,
        'fd_start': None,
        'fd_end': None,
        'fdw_start': None,
        'fdw_end': None,
    }
    settings.update(ts_settings)

    # Normalize the date column and resolve the dataset's maximum date.
    df[settings['date_col']] = pd.to_datetime(df[settings['date_col']])
    if settings['max_date'] is None:
        settings['max_date'] = df[settings['date_col']].max()
    else:
        settings['max_date'] = pd.to_datetime(settings['max_date'])

    # Use the merged settings here (the raw ts_settings dict may omit the
    # 'known_in_advance' key entirely, which previously raised KeyError).
    if settings['known_in_advance']:
        settings['known_in_advance'] = [
            dr.FeatureSettings(feat_name, known_in_advance=True)
            for feat_name in settings['known_in_advance']
        ]

    # Translate validation and holdout durations from project time units
    # into minute/hour/day components.
    project_time_unit, _ = get_timestep(df, settings)
    validation_durations = _duration_components(
        project_time_unit, settings['validation_duration'])
    holdout_durations = _duration_components(
        project_time_unit, settings['holdout_duration'])

    if settings['disable_holdout']:
        settings['holdout_duration'] = None
        settings['holdout_start_date'] = None
    else:
        # Holdout spans the tail of the dataset, ending at max_date.
        settings['holdout_start_date'] = settings['max_date'] - dt.timedelta(
            minutes=holdout_durations['minute'],
            hours=holdout_durations['hour'],
            days=holdout_durations['day'],
        )
        settings['holdout_duration'] = dr.partitioning_methods.construct_duration_string(
            minutes=holdout_durations['minute'],
            hours=holdout_durations['hour'],
            days=holdout_durations['day'],
        )

    ###############################
    # Create Datetime Specification
    ###############################
    time_partition = dr.DatetimePartitioningSpecification(
        feature_settings=settings['known_in_advance'],
        validation_duration=dr.partitioning_methods.construct_duration_string(
            minutes=validation_durations['minute'],
            hours=validation_durations['hour'],
            days=validation_durations['day'],
        ),
        datetime_partition_column=settings['date_col'],
        use_time_series=settings['use_time_series'],
        disable_holdout=settings['disable_holdout'],
        # holdout_start_date/holdout_duration are only set when
        # disable_holdout is False (None otherwise, see above).
        holdout_start_date=settings['holdout_start_date'],
        holdout_duration=settings['holdout_duration'],
        multiseries_id_columns=[settings['series_id']],
        forecast_window_start=int(settings['fd_start']),
        forecast_window_end=int(settings['fd_end']),
        feature_derivation_window_start=int(settings['fdw_start']),
        feature_derivation_window_end=int(settings['fdw_end']),
        number_of_backtests=settings['num_backtests'],
        calendar_id=settings['calendar_id'],
        use_cross_series_features=settings['use_cross_series_features'],
        aggregation_type=settings['aggregation_type'],
        cross_series_group_by_columns=settings['cross_series_group_by_columns'],
    )

    ################
    # Create Project
    ################
    project = dr.Project.create(project_name=project_name,
                                sourcedata=df,
                                max_wait=14400,
                                read_timeout=14400)
    print(f'Creating project {project_name} ...')

    #################
    # Start Autopilot
    #################
    project.set_target(
        target=settings['target'],
        metric=settings['metric'],
        mode=settings['mode'],
        advanced_options=opts,
        worker_count=-1,
        partitioning_method=time_partition,
        max_wait=14400,
    )

    return project