Example #1
# Imports needed to run this snippet (not shown in the source):
from copy import deepcopy

import datarobot as dr


def from_dict(_params):
    """Rebuild a DatetimePartitioningSpecification from a serialized dict."""
    params = deepcopy(_params)
    if len(params['backtests']) > 0:
        backtests = []
        for backtest in params['backtests']:
            # string_to_datetime is a helper assumed to parse date strings
            backtest['validation_start_date'] = string_to_datetime(
                backtest['validation_start_date'])
            backtests.append(dr.BacktestSpecification(**backtest))
        params['backtests'] = backtests
    params['holdout_start_date'] = string_to_datetime(
        params['holdout_start_date'])
    return dr.DatetimePartitioningSpecification(**params)
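
# Usage sketch. string_to_datetime is a helper assumed by the snippet above;
# here is a minimal stand-in plus a sample call (hypothetical values, not
# from the source):
from datetime import datetime

def string_to_datetime(s):
    return datetime.fromisoformat(s)

spec = from_dict({
    'datetime_partition_column': 'Date',  # required by the specification
    'backtests': [],
    'holdout_start_date': '2019-06-01T00:00:00',
})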
示例#2
0
def setup_basic_time_spec(cf):
    """
    Basic spec for timeseries, using a config.
    Assumes daily data, and no gap to prediction window.
    """
    spec = dr.DatetimePartitioningSpecification(
        cf['timecol'], use_time_series=True, default_to_known_in_advance=False)
    # disable holdout
    spec.disable_holdout = True
    # backtest options
    spec.number_of_backtests = int(cf['backtests'])
    spec.validation_duration = dr.partitioning_methods.construct_duration_string(
        days=int(cf['backtest_length']))
    # windows: the feature derivation window start is negative (how far back
    # to derive features); the forecast window is how far ahead to predict
    spec.feature_derivation_window_start = int(cf['fdw'])
    spec.feature_derivation_window_end = 0
    spec.forecast_window_start = 1
    spec.forecast_window_end = int(cf['horizon'])
    return spec
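
# Usage sketch (hypothetical config values, not from the source); note that
# 'fdw' is expected to be negative:
cf = {'timecol': 'Date', 'backtests': '3', 'backtest_length': '28',
      'fdw': '-90', 'horizon': '7'}
spec = setup_basic_time_spec(cf)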
Example #3
# (truncated at the start in the source; `proj` is assumed to come from a
# project-creation call that ends like this)
proj = dr.Project.create(sourcedata='data.csv',    # hypothetical path
                         project_name='TS demo',   # hypothetical name
                         max_wait=3600)

print('Project ID: {}'.format(proj.id))

# What projects are there?
# my_projects = dr.Project.list()
# proj = my_projects[0]

print("Configuring Time Series settings.")
# Set up a time series project
time_partition = dr.DatetimePartitioningSpecification(
    use_time_series=True,
    datetime_partition_column='Date',
    autopilot_data_selection_method='duration',
    feature_derivation_window_start=-90,  # windows are integers, not strings
    feature_derivation_window_end=0,
    forecast_window_start=1,
    forecast_window_end=28,
    multiseries_id_columns=['Store'],  # in this demo dataset, series are retail stores
)

# manually confirm time step and time unit are as expected
datetime_feature = dr.Feature.get(proj.id, 'Date')
multiseries_props = datetime_feature.get_multiseries_properties(['Store'])
print(multiseries_props)

# manually check out the partitioning settings like feature derivation window and backtests
# to make sure they make sense before moving on
full_part = dr.DatetimePartitioning.generate(proj.id, time_partition)
print(full_part.feature_derivation_window_start,
      full_part.feature_derivation_window_end)  # print truncated in the source
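
# Optionally render the full backtest layout as a table; to_dataframe() is
# part of the client's DatetimePartitioning API.
print(full_part.to_dataframe())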
Example #4
# (truncated at the start in the source; earlier entries of this list are lost)
known_in_advance = ['Holiday', 'DestinationEvent']

feature_settings = [dr.FeatureSettings(feat_name,
                                       known_in_advance=True)
                    for feat_name in known_in_advance]


# ## Create a Partition Specification
# This problem has a time component, and it would be bad practice to train on data from the present and predict on the past. We could manually add a column to the dataset indicating which rows to use for training, validation, and testing, but it is easier to let DataRobot partition the data automatically. Since this dataset contains sales data from multiple individual stores, we use `multiseries_id_columns` to tell DataRobot that the file actually holds multiple time series and which column identifies the series each row belongs to.

# In[6]:


time_partition = dr.DatetimePartitioningSpecification(
    datetime_partition_column='Date',
    multiseries_id_columns=['Store'],
    use_time_series=True,
    feature_settings=feature_settings,
)


# ## Run the Automated Modeling Process
# Now we can start the modeling process. The target for this problem is called `Sales` and we let DataRobot automatically select the metric for scoring and comparing models.
# 
# The `partitioning_method` argument tells DataRobot to use the partitioning scheme we specified previously.
# 
# Finally, the `worker_count` parameter specifies how many workers to use for this project. Passing a value of `-1` tells DataRobot to use the maximum number of workers available to your account. You can also specify an exact number, but the command will fail if you request more workers than your account allows; if you need more resources than you have been allocated, consider upgrading your license.
# 
# The second command provides a URL that can be used to see the project execute on the DataRobot UI.
# 
# The last command in this cell is a blocking loop that periodically checks whether the project is done, printing the number of jobs in progress and in the queue along the way. The automated model exploration process occasionally adds more jobs to the queue, so don't be alarmed if the number of jobs does not strictly decrease over time.
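
# In[ ]:


# The cell this narration describes was truncated from this excerpt; a minimal
# sketch under the same description (`project` and `time_partition` are assumed
# to exist from earlier cells; 'Sales' is the target named above):
project.set_target(
    target='Sales',
    partitioning_method=time_partition,
    worker_count=-1,  # -1 = maximum workers available to the account
)
print(project.get_leaderboard_ui_permalink())  # URL to watch the run in the UI
project.wait_for_autopilot()  # blocks, printing in-progress and queued job counts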
Example #5
# Imports needed to run this snippet (not shown in the source):
import datetime as dt

import datarobot as dr
import pandas as pd


def create_dr_project(df, project_name, ts_settings, **advanced_options):
    """
    Kickoff single DataRobot project

    df: pandas df
    project_name: name of project
    ts_settings: dictionary of parameters for time series project

    Returns:
    --------
    DataRobot project object

    """

    print('Building next project\n...\n')

    #######################
    # Get Advanced Options
    #######################
    opts = {
        'weights': None,
        'response_cap': None,
        'blueprint_threshold': None,
        'seed': None,
        'smart_downsampled': False,
        'majority_downsampling_rate': None,
        'offset': None,
        'exposure': None,
        'accuracy_optimized_mb': None,
        'scaleout_modeling_mode': None,
        'events_count': None,
        'monotonic_increasing_featurelist_id': None,
        'monotonic_decreasing_featurelist_id': None,
        'only_include_monotonic_blueprints': None,
    }

    opts.update(advanced_options)

    opts = dr.AdvancedOptions(
        weights=opts['weights'],
        seed=opts['seed'],
        monotonic_increasing_featurelist_id=opts[
            'monotonic_increasing_featurelist_id'],
        monotonic_decreasing_featurelist_id=opts[
            'monotonic_decreasing_featurelist_id'],
        only_include_monotonic_blueprints=opts[
            'only_include_monotonic_blueprints'],
        accuracy_optimized_mb=opts['accuracy_optimized_mb'],
        smart_downsampled=opts['smart_downsampled'],
    )

    ############################
    # Get Datetime Specification
    ############################
    settings = {
        'max_date': None,
        'known_in_advance': None,
        'num_backtests': None,
        'validation_duration': None,
        'holdout_duration': None,
        'holdout_start_date': None,
        'disable_holdout': False,
        'number_of_backtests': None,
        'backtests': None,
        'use_cross_series_features': None,
        'aggregation_type': None,
        'cross_series_group_by_columns': None,
        'calendar_id': None,
        'use_time_series': False,
        'series_id': None,
        'metric': None,
        'target': None,
        'mode': dr.AUTOPILOT_MODE.FULL_AUTO,  # MANUAL #QUICK
        'date_col': None,
        'fd_start': None,
        'fd_end': None,
        'fdw_start': None,
        'fdw_end': None,
    }

    settings.update(ts_settings)

    df[settings['date_col']] = pd.to_datetime(df[settings['date_col']])

    if settings['max_date'] is None:
        settings['max_date'] = df[settings['date_col']].max()
    else:
        settings['max_date'] = pd.to_datetime(settings['max_date'])

    if settings['known_in_advance']:
        settings['known_in_advance'] = [
            dr.FeatureSettings(feat_name, known_in_advance=True)
            for feat_name in settings['known_in_advance']
        ]

    # Update validation and holdout duration, start, and end date
    project_time_unit, project_time_step = get_timestep(df, settings)

    validation_durations = {'minute': 0, 'hour': 0, 'day': 0}
    holdout_durations = {'minute': 0, 'hour': 0, 'day': 0}

    if project_time_unit == 'minute':
        validation_durations['minute'] = settings['validation_duration']
        holdout_durations['minute'] = settings['holdout_duration']

    elif project_time_unit == 'hour':
        validation_durations['hour'] = settings['validation_duration']
        holdout_durations['hour'] = settings['holdout_duration']

    elif project_time_unit == 'day':
        validation_durations['day'] = settings['validation_duration']
        holdout_durations['day'] = settings['holdout_duration']

    elif project_time_unit == 'week':
        validation_durations['day'] = settings['validation_duration'] * 7
        holdout_durations['day'] = settings['holdout_duration'] * 7

    elif project_time_unit == 'month':
        # timedelta has no month unit, so approximate a month as 31 days
        validation_durations['day'] = settings['validation_duration'] * 31
        holdout_durations['day'] = settings['holdout_duration'] * 31

    else:
        raise ValueError(f'{project_time_unit} is not a supported timestep')

    if settings['disable_holdout']:
        settings['holdout_duration'] = None
        settings['holdout_start_date'] = None
    else:
        settings['holdout_start_date'] = settings['max_date'] - dt.timedelta(
            minutes=holdout_durations['minute'],
            hours=holdout_durations['hour'],
            days=holdout_durations['day'],
        )

        settings['holdout_duration'] = (
            dr.partitioning_methods.construct_duration_string(
                minutes=holdout_durations['minute'],
                hours=holdout_durations['hour'],
                days=holdout_durations['day'],
            ))

    ###############################
    # Create Datetime Specification
    ###############################
    time_partition = dr.DatetimePartitioningSpecification(
        feature_settings=settings['known_in_advance'],
        # gap_duration = dr.partitioning_methods.construct_duration_string(years=0, months=0, days=0),
        validation_duration=dr.partitioning_methods.construct_duration_string(
            minutes=validation_durations['minute'],
            hours=validation_durations['hour'],
            days=validation_durations['day'],
        ),
        datetime_partition_column=settings['date_col'],
        use_time_series=settings['use_time_series'],
        disable_holdout=settings['disable_holdout'],
        # holdout_start_date and holdout_duration are only set when
        # disable_holdout is False (both are None otherwise)
        holdout_start_date=settings['holdout_start_date'],
        holdout_duration=settings['holdout_duration'],
        multiseries_id_columns=[settings['series_id']],
        forecast_window_start=int(settings['fd_start']),
        forecast_window_end=int(settings['fd_end']),
        feature_derivation_window_start=int(settings['fdw_start']),
        feature_derivation_window_end=int(settings['fdw_end']),
        number_of_backtests=settings['num_backtests'],
        calendar_id=settings['calendar_id'],
        use_cross_series_features=settings['use_cross_series_features'],
        aggregation_type=settings['aggregation_type'],
        cross_series_group_by_columns=settings[
            'cross_series_group_by_columns'],
    )

    ################
    # Create Project
    ################
    project = dr.Project.create(project_name=project_name,
                                sourcedata=df,
                                max_wait=14400,
                                read_timeout=14400)

    print(f'Creating project {project_name} ...')

    #################
    # Start Autopilot
    #################
    project.set_target(
        target=settings['target'],
        metric=settings['metric'],
        mode=settings['mode'],
        advanced_options=opts,
        worker_count=-1,
        partitioning_method=time_partition,
        max_wait=14400,
    )

    return project
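
# Usage sketch (hypothetical settings, not from the source). Assumes `df` is a
# daily multiseries frame and that the get_timestep() helper referenced above
# is available:
ts_settings = {
    'date_col': 'Date',
    'series_id': 'Store',
    'target': 'Sales',
    'metric': 'RMSE',
    'use_time_series': True,
    'num_backtests': 3,
    'validation_duration': 28,       # in project time units (days here)
    'holdout_duration': 28,
    'known_in_advance': ['Holiday'],
    'fdw_start': -28, 'fdw_end': 0,  # feature derivation window
    'fd_start': 1, 'fd_end': 7,      # forecast window
}
project = create_dr_project(df, 'store_sales_demo', ts_settings)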
Example #6
# set up time series partition settings
if s.fields_known_in_advance:
    feature_settings = [
        dr.FeatureSettings(feat_name, known_in_advance=True)
        for feat_name in s.fields_known_in_advance
    ]
else:
    feature_settings = None

time_partition = dr.DatetimePartitioningSpecification(
    datetime_partition_column=s.field_date,
    use_time_series=True,
    feature_derivation_window_start=s.feature_derivation_window_start,
    feature_derivation_window_end=s.feature_derivation_window_end,
    validation_duration=s.validation_length,
    gap_duration=s.gap_length,
    forecast_window_start=s.forecast_window_start,
    forecast_window_end=s.forecast_window_end,
    number_of_backtests=s.number_of_backtests,
    feature_settings=feature_settings)

# Create a feature list from Informative Features, excluding the fields
# flagged for removal from modeling
fl_informative_features = next(
    fl for fl in project.get_featurelists()
    if fl.name == 'Informative Features')
if s.fields_exclude_from_modeling:
    subset_of_features = list(
        set(fl_informative_features.features) -
        set(s.fields_exclude_from_modeling))
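    # (truncated in the source; presumably followed by creating the reduced
    # feature list from subset_of_features, e.g.
    # project.create_featurelist('Reduced Features', subset_of_features))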
Example #7
def run_multi_target_model_factory(project_name, path_to_data, target_cols,
                                   dr_mode, config_file):
    # NOTE: config_file is accepted but not used in this snippet

    # SET UP LOCATION FOR STORING RESULTS
    ensure_results_dir(project_name)
    results_file = start_results_file(project_name)

    # TURN THE TARGETS INTO A LIST FOR ITERATION
    target_list = target_cols.split(',')

    # EXTRACT THE COLUMN HEADERS AS A SET OF FEATURES
    df = pd.read_csv(path_to_data)
    features = df.columns.tolist()
    for t in target_list:
        features.remove(t)

    mode_config = dr_mode.split(':')
    # #########################################################################
    # TODO: Need to force DataRobot login with supplied credentials
    # #########################################################################

    for targ in target_list:
        temp_name = project_name + "_" + targ
        project = dr.Project.create(sourcedata=path_to_data,
                                    project_name=temp_name)
        flist = project.create_featurelist('features', features)

        if mode_config[0] == 'OTV':

            if len(mode_config) > 2:
                partition = dr.DatetimePartitioningSpecification(
                    datetime_partition_column=mode_config[1],
                    gap_duration=mode_config[2])
            else:
                partition = dr.DatetimePartitioningSpecification(
                    datetime_partition_column=mode_config[1])
            project.set_target(target=targ,
                               partitioning_method=partition,
                               featurelist_id=flist.id,
                               worker_count=-1)
            project.wait_for_autopilot()

        elif mode_config[0] == 'GRPD':

            partition = dr.GroupCV(holdout_pct=10,
                                   reps=5,
                                   partition_key_cols=[mode_config[1]])
            project.set_target(target=targ,
                               partitioning_method=partition,
                               featurelist_id=flist.id,
                               worker_count=-1)
            project.wait_for_autopilot()

        elif mode_config[0] == 'QUICK':
            # use the AUTOPILOT_MODE enum rather than a raw 'Quick' string
            project.set_target(target=targ,
                               mode=dr.AUTOPILOT_MODE.QUICK,
                               featurelist_id=flist.id,
                               worker_count=-1)
            project.wait_for_autopilot()
        else:
            project.set_target(target=targ,
                               featurelist_id=flist.id,
                               worker_count=-1)
            project.wait_for_autopilot()

        # ONCE THE PROJECT COMPLETES WE NEED TO CHOOSE THE MODEL, DEPLOY IT AND STORE THE RESULTS
        project.unlock_holdout()
        metric = project.metric
        model = dr.models.ModelRecommendation.get(project.id).get_model()
        # THE OLD WAY JUST TOOK THE MODEL AT THE TOP OF THE LEADERBOARD LIST
        #model = project.get_models()[0]
        holdout_score = model.metrics[project.metric]['holdout']
        prediction_server = dr.PredictionServer.list()[0]
        deployment_title = project_name + " - " + targ
        deployment = dr.Deployment.create_from_learning_model(
            model.id,
            label=deployment_title,
            description='Dashboard Factory Model for ' + targ,
            default_prediction_server_id=prediction_server.id)

        write_model_results(results_file, targ, project.id, model.id,
                            deployment.id, metric, holdout_score)

    close_results(results_file)

    #print("BASE MODE: ",   mode_config[0])
    #print("BASE TARGET: ", target_list[0])
    print("\nrun_multi_target_model_factory Completed")