from copy import deepcopy

import datarobot as dr


def from_dict(_params):
    """Build a DatetimePartitioningSpecification from a serialized dict.

    Datetimes are stored as strings; string_to_datetime (defined elsewhere
    in this module) parses them back before the DataRobot objects are built.
    """
    params = deepcopy(_params)
    if len(params['backtests']) > 0:
        backtests = []
        for backtest in params['backtests']:
            backtest['validation_start_date'] = string_to_datetime(
                backtest['validation_start_date'])
            backtests.append(dr.BacktestSpecification(**backtest))
        params['backtests'] = backtests
    params['holdout_start_date'] = string_to_datetime(
        params['holdout_start_date'])
    return dr.DatetimePartitioningSpecification(**params)
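# For reference, a minimal sketch of the dictionary shape from_dict expects.
# The keys mirror the DatetimePartitioningSpecification and BacktestSpecification
# constructor arguments; the values below are hypothetical.
example_params = {
    'datetime_partition_column': 'Date',
    'use_time_series': True,
    'holdout_start_date': '2020-01-01T00:00:00',
    'backtests': [{
        'index': 0,
        'validation_start_date': '2019-10-01T00:00:00',
        'validation_duration': 'P0Y1M0D',
    }],
}
spec = from_dict(example_params)  # -> dr.DatetimePartitioningSpecification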
def setup_basic_time_spec(cf):
    """
    Basic spec for time series, using a config.
    Assumes daily data and no gap to the prediction window.
    """
    spec = dr.DatetimePartitioningSpecification(
        cf['timecol'],
        use_time_series=True,
        default_to_known_in_advance=False)
    # disable holdout
    spec.disable_holdout = True
    # backtest options
    spec.number_of_backtests = int(cf['backtests'])
    spec.validation_duration = dr.partitioning_methods.construct_duration_string(
        days=int(cf['backtest_length']))
    # windows
    spec.feature_derivation_window_start = int(cf['fdw'])
    spec.feature_derivation_window_end = 0
    spec.forecast_window_start = 1
    spec.forecast_window_end = int(cf['horizon'])
    return spec
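# A sketch of the kind of config this helper reads; the keys are inferred from
# the lookups above and the values are illustrative only (note fdw is negative:
# the feature derivation window starts before the forecast point).
cf = {
    'timecol': 'Date',        # datetime partition column
    'backtests': '3',         # number of backtests
    'backtest_length': '28',  # validation duration, in days
    'fdw': '-90',             # feature derivation window start
    'horizon': '28',          # forecast window end
}
spec = setup_basic_time_spec(cf)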
    max_wait=3600)
print('Project ID: {}'.format(proj.id))

# What projects are there?
# my_projects = dr.Project.list()
# proj = my_projects[0]

print("Configuring Time Series settings.")

# Set up a time series project; window bounds are integers counted in the
# project's time unit (days here)
time_partition = dr.DatetimePartitioningSpecification(
    use_time_series=True,
    datetime_partition_column='Date',
    autopilot_data_selection_method='duration',
    feature_derivation_window_start=-90,
    feature_derivation_window_end=0,
    forecast_window_start=1,
    forecast_window_end=28,
    multiseries_id_columns=['Store'],  # in this demo dataset, series are retail stores
)

# manually confirm the time step and time unit are as expected
datetime_feature = dr.Feature.get(proj.id, 'Date')
multiseries_props = datetime_feature.get_multiseries_properties(['Store'])
print(multiseries_props)

# manually check the partitioning settings, like the feature derivation window
# and backtests, to make sure they make sense before moving on
full_part = dr.DatetimePartitioning.generate(proj.id, time_partition)
print(full_part.feature_derivation_window_start,
      full_part.feature_derivation_window_end)
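# Sketch of one more sanity check before starting Autopilot, assuming the
# DatetimePartitioning object returned by generate() above: to_dataframe()
# renders the start/end dates of each backtest partition for review.
print(full_part.to_dataframe())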
    'Holiday', 'DestinationEvent']
feature_settings = [dr.FeatureSettings(feat_name, known_in_advance=True)
                    for feat_name in known_in_advance]

# ## Create a Partition Specification
# This problem has a time component to it, and it would be bad practice to train
# on data from the present and predict on the past. We could manually add a
# column to the dataset to indicate which rows should be used for training,
# test, and validation, but it is straightforward to let DataRobot do it
# automatically. This dataset contains sales data from multiple individual
# stores, so we use `multiseries_id_columns` to tell DataRobot there are
# actually multiple time series in this file and to indicate the column that
# identifies the series each row belongs to.

# In[6]:

time_partition = dr.DatetimePartitioningSpecification(
    datetime_partition_column='Date',
    multiseries_id_columns=['Store'],
    use_time_series=True,
    feature_settings=feature_settings,
)

# ## Run the Automated Modeling Process
# Now we can start the modeling process. The target for this problem is called
# `Sales`, and we let DataRobot automatically select the metric for scoring and
# comparing models.
#
# The `partitioning_method` argument specifies that DataRobot should use the
# partitioning scheme we defined above.
#
# Finally, the `worker_count` parameter specifies how many workers should be
# used for this project. Passing a value of `-1` tells DataRobot to set the
# worker count to the maximum available to you. You can also specify the exact
# number of workers to use, but the command will fail if you request more
# workers than your account allows. If you need more resources than what has
# been allocated to you, consider upgrading your license.
#
# The second command provides a URL that can be used to watch the project
# execute in the DataRobot UI.
#
# The last command in this cell is a blocking loop that periodically checks
# whether the project is done, printing the number of jobs in progress and in
# the queue along the way so you can see progress. The automated model
# exploration process occasionally adds more jobs to the queue, so don't be
# alarmed if the number of jobs does not strictly decrease over time.
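# The cell the text above describes is not included here; a minimal sketch of
# what it likely contains (the method names are real DataRobot client calls,
# the exact arguments are assumptions):
project.set_target(
    'Sales',
    partitioning_method=time_partition,
    worker_count=-1,
)
print(project.get_leaderboard_ui_permalink())  # URL to follow along in the UI
project.wait_for_autopilot()  # blocks; prints jobs in progress / in queue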
def create_dr_project(df, project_name, ts_settings, **advanced_options):
    """
    Kick off a single DataRobot project.

    df: pandas df
    project_name: name of project
    ts_settings: dictionary of parameters for time series project

    Returns:
    --------
    DataRobot project object
    """
    print('Building next project\n...\n')

    #######################
    # Get Advanced Options
    #######################
    opts = {
        'weights': None,
        'response_cap': None,
        'blueprint_threshold': None,
        'seed': None,
        'smart_downsampled': False,
        'majority_downsampling_rate': None,
        'offset': None,
        'exposure': None,
        'accuracy_optimized_mb': None,
        'scaleout_modeling_mode': None,
        'events_count': None,
        'monotonic_increasing_featurelist_id': None,
        'monotonic_decreasing_featurelist_id': None,
        'only_include_monotonic_blueprints': None,
    }
    opts.update(advanced_options)

    opts = dr.AdvancedOptions(
        weights=opts['weights'],
        seed=opts['seed'],
        monotonic_increasing_featurelist_id=opts['monotonic_increasing_featurelist_id'],
        monotonic_decreasing_featurelist_id=opts['monotonic_decreasing_featurelist_id'],
        only_include_monotonic_blueprints=opts['only_include_monotonic_blueprints'],
        accuracy_optimized_mb=opts['accuracy_optimized_mb'],
        smart_downsampled=opts['smart_downsampled'],
    )

    ############################
    # Get Datetime Specification
    ############################
    settings = {
        'max_date': None,
        'known_in_advance': None,
        'num_backtests': None,
        'validation_duration': None,
        'holdout_duration': None,
        'holdout_start_date': None,
        'disable_holdout': False,
        'number_of_backtests': None,
        'backtests': None,
        'use_cross_series_features': None,
        'aggregation_type': None,
        'cross_series_group_by_columns': None,
        'calendar_id': None,
        'use_time_series': False,
        'series_id': None,
        'metric': None,
        'target': None,
        'mode': dr.AUTOPILOT_MODE.FULL_AUTO,  # alternatives: MANUAL, QUICK
        'date_col': None,
        'fd_start': None,
        'fd_end': None,
        'fdw_start': None,
        'fdw_end': None,
    }
    settings.update(ts_settings)

    df[settings['date_col']] = pd.to_datetime(df[settings['date_col']])

    if settings['max_date'] is None:
        settings['max_date'] = df[settings['date_col']].max()
    else:
        settings['max_date'] = pd.to_datetime(settings['max_date'])

    if ts_settings['known_in_advance']:
        settings['known_in_advance'] = [
            dr.FeatureSettings(feat_name, known_in_advance=True)
            for feat_name in settings['known_in_advance']
        ]

    # Update validation and holdout duration, start, and end date
    project_time_unit, project_time_step = get_timestep(df, settings)

    validation_durations = {'minute': 0, 'hour': 0, 'day': 0, 'month': 0}
    holdout_durations = {'minute': 0, 'hour': 0, 'day': 0, 'month': 0}

    if project_time_unit == 'minute':
        validation_durations['minute'] = settings['validation_duration']
        holdout_durations['minute'] = settings['holdout_duration']
    elif project_time_unit == 'hour':
        validation_durations['hour'] = settings['validation_duration']
        holdout_durations['hour'] = settings['holdout_duration']
    elif project_time_unit == 'day':
        validation_durations['day'] = settings['validation_duration']
        holdout_durations['day'] = settings['holdout_duration']
    elif project_time_unit == 'week':
        validation_durations['day'] = settings['validation_duration'] * 7
        holdout_durations['day'] = settings['holdout_duration'] * 7
    elif project_time_unit == 'month':
        validation_durations['day'] = settings['validation_duration'] * 31
        holdout_durations['day'] = settings['holdout_duration'] * 31
    else:
        raise ValueError(f'{project_time_unit} is not a supported timestep')

    if settings['disable_holdout']:
        settings['holdout_duration'] = None
        settings['holdout_start_date'] = None
    else:
        settings['holdout_start_date'] = settings['max_date'] - dt.timedelta(
            minutes=holdout_durations['minute'],
            hours=holdout_durations['hour'],
            days=holdout_durations['day'],
        )
        settings['holdout_duration'] = dr.partitioning_methods.construct_duration_string(
            minutes=holdout_durations['minute'],
            hours=holdout_durations['hour'],
            days=holdout_durations['day'],
        )

    ###############################
    # Create Datetime Specification
    ###############################
    time_partition = dr.DatetimePartitioningSpecification(
        feature_settings=settings['known_in_advance'],
        # gap_duration=dr.partitioning_methods.construct_duration_string(years=0, months=0, days=0),
        validation_duration=dr.partitioning_methods.construct_duration_string(
            minutes=validation_durations['minute'],
            hours=validation_durations['hour'],
            days=validation_durations['day'],
        ),
        datetime_partition_column=settings['date_col'],
        use_time_series=settings['use_time_series'],
        disable_holdout=settings['disable_holdout'],
        holdout_start_date=settings['holdout_start_date'],  # only used when disable_holdout is False
        holdout_duration=settings['holdout_duration'],      # only used when disable_holdout is False
        multiseries_id_columns=[settings['series_id']],
        forecast_window_start=int(settings['fd_start']),
        forecast_window_end=int(settings['fd_end']),
        feature_derivation_window_start=int(settings['fdw_start']),
        feature_derivation_window_end=int(settings['fdw_end']),
        number_of_backtests=settings['num_backtests'],
        calendar_id=settings['calendar_id'],
        use_cross_series_features=settings['use_cross_series_features'],
        aggregation_type=settings['aggregation_type'],
        cross_series_group_by_columns=settings['cross_series_group_by_columns'],
    )

    ################
    # Create Project
    ################
    project = dr.Project.create(project_name=project_name,
                                sourcedata=df,
                                max_wait=14400,
                                read_timeout=14400)
    print(f'Creating project {project_name} ...')

    #################
    # Start Autopilot
    #################
    project.set_target(
        target=settings['target'],
        metric=settings['metric'],
        mode=settings['mode'],
        advanced_options=opts,
        worker_count=-1,
        partitioning_method=time_partition,
        max_wait=14400,
    )

    return project
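# Hypothetical invocation; the column names, durations, and metric below are
# illustrative only.
project = create_dr_project(
    df,
    'store_sales_ts',
    ts_settings={
        'target': 'Sales',
        'date_col': 'Date',
        'series_id': 'Store',
        'use_time_series': True,
        'known_in_advance': ['Holiday'],
        'num_backtests': 3,
        'validation_duration': 28,       # interpreted in the project time unit
        'holdout_duration': 28,
        'fd_start': 1, 'fd_end': 28,     # forecast window
        'fdw_start': -90, 'fdw_end': 0,  # feature derivation window
        'metric': 'RMSE',
    },
    smart_downsampled=False,
)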
# set up time series partition settings
if s.fields_known_in_advance:
    feature_settings = [
        FeatureSettings(feat_name, known_in_advance=True)
        for feat_name in s.fields_known_in_advance
    ]
else:
    feature_settings = None

time_partition = dr.DatetimePartitioningSpecification(
    datetime_partition_column=s.field_date,
    use_time_series=True,
    feature_derivation_window_start=s.feature_derivation_window_start,
    feature_derivation_window_end=s.feature_derivation_window_end,
    validation_duration=s.validation_length,
    gap_duration=s.gap_length,
    forecast_window_start=s.forecast_window_start,
    forecast_window_end=s.forecast_window_end,
    number_of_backtests=s.number_of_backtests,
    feature_settings=feature_settings)

# Create a feature list from Informative Features, excluding the fields to remove
fl_informative_features = [
    fl for fl in project.get_featurelists()
    if fl.name == 'Informative Features'
][0]
if s.fields_exclude_from_modeling:
    subset_of_features = list(
        set(fl_informative_features.features) -
        set(s.fields_exclude_from_modeling))
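    # The snippet ends before the subset is used; one plausible continuation
    # (an assumption, not from the source) registers the reduced set as a new
    # feature list to model with.
    flist = project.create_featurelist('Modeling Features', subset_of_features)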
def run_multi_target_model_factory(project_name, path_to_data, target_cols,
                                   dr_mode, config_file):
    # SET UP LOCATION FOR STORING RESULTS
    ensure_results_dir(project_name)
    results_file = start_results_file(project_name)

    # TURN THE TARGETS INTO A LIST FOR ITERATION
    target_list = target_cols.split(',')

    # EXTRACT THE COLUMN HEADERS AS A SET OF FEATURES
    df = pd.read_csv(path_to_data)
    features = df.columns.tolist()
    for t in target_list:
        features.remove(t)

    mode_config = dr_mode.split(':')

    # #######################################################################
    # TODO: Need to force DataRobot Login with supplied credentials
    # #########################################################################

    for targ in target_list:
        temp_name = project_name + "_" + targ
        project = dr.Project.create(sourcedata=path_to_data,
                                    project_name=temp_name)
        flist = project.create_featurelist('features', features)

        if mode_config[0] == 'OTV':
            if len(mode_config) > 2:
                partition = dr.DatetimePartitioningSpecification(
                    datetime_partition_column=mode_config[1],
                    gap_duration=mode_config[2])
            else:
                partition = dr.DatetimePartitioningSpecification(
                    datetime_partition_column=mode_config[1])
            project.set_target(target=targ,
                               partitioning_method=partition,
                               featurelist_id=flist.id,
                               worker_count=-1)
            project.wait_for_autopilot()
        elif mode_config[0] == 'GRPD':
            partition = dr.GroupCV(holdout_pct=10,
                                   reps=5,
                                   partition_key_cols=[mode_config[1]])
            project.set_target(target=targ,
                               partitioning_method=partition,
                               featurelist_id=flist.id,
                               worker_count=-1)
            project.wait_for_autopilot()
        elif mode_config[0] == 'QUICK':
            project.set_target(target=targ,
                               mode=dr.AUTOPILOT_MODE.QUICK,
                               featurelist_id=flist.id,
                               worker_count=-1)
            project.wait_for_autopilot()
        else:
            project.set_target(target=targ,
                               featurelist_id=flist.id,
                               worker_count=-1)
            project.wait_for_autopilot()

        # ONCE THE PROJECT COMPLETES, CHOOSE THE MODEL, DEPLOY IT, AND STORE THE RESULTS
        project.unlock_holdout()
        metric = project.metric
        model = dr.models.ModelRecommendation.get(project.id).get_model()
        # THE OLD WAY JUST TOOK THE MODEL AT THE TOP OF THE LEADERBOARD LIST
        # model = project.get_models()[0]
        holdout_score = model.metrics[project.metric]['holdout']

        prediction_server = dr.PredictionServer.list()[0]
        deployment_title = project_name + " - " + targ
        deployment = dr.Deployment.create_from_learning_model(
            model.id,
            label=deployment_title,
            description='Dashboard Factory Model for ' + targ,
            default_prediction_server_id=prediction_server.id)

        write_model_results(results_file, targ, project.id, model.id,
                            deployment.id, metric, holdout_score)

    close_results(results_file)
    # print("BASE MODE: ", mode_config[0])
    # print("BASE TARGET: ", target_list[0])
    print("\nrun_multi_target_model_factory Completed")
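# Hypothetical driver call; the file paths, targets, and OTV date column are
# illustrative only. dr_mode is parsed above as 'MODE[:column[:gap_duration]]'.
run_multi_target_model_factory(
    project_name='dashboard_factory',
    path_to_data='data/sales.csv',
    target_cols='Sales,Returns',
    dr_mode='OTV:Date',
    config_file='config.yaml',
)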