def data_to_sqa(self, data: Data, trial_index: Optional[int], timestamp: int) -> SQAData:
    """Convert an Ax ``Data`` object into its SQLAlchemy representation.

    Args:
        data: The Ax data object to encode.
        trial_index: Index of the trial this data belongs to, if any.
        timestamp: Storage timestamp (millis) recorded as ``time_created``.

    Returns:
        A ``SQAData`` instance ready to be persisted.
    """
    import json

    # Look up the SQA class registered for Data in the encoder config.
    # pyre-fixme: Expected `Base` for 1st...ot `typing.Type[Data]`.
    data_class: SQAData = self.config.class_to_sqa_class[Data]

    # Serialize the init args so the Data object can be reconstructed later.
    structure_json = json.dumps(object_to_json(data.serialize_init_args(data)))

    # pyre-fixme[29]: `SQAData` is not a function.
    return data_class(
        id=data.db_id,
        data_json=data.true_df.to_json(),
        description=data.description,
        time_created=timestamp,
        trial_index=trial_index,
        structure_metadata_json=structure_json,
    )
def attach_data(
    self,
    data: Data,
    combine_with_last_data: bool = False,
    overwrite_existing_data: bool = False,
) -> int:
    """Attach data to experiment. Stores data in `experiment._data_by_trial`,
    to be looked up via `experiment.lookup_data_for_trial`.

    Args:
        data: Data object to store.
        combine_with_last_data: By default, when attaching data, it's identified
            by its timestamp, and `experiment.lookup_data_for_trial` returns
            data by most recent timestamp. Sometimes, however, we want to combine
            the data from multiple calls to `attach_data` into one dataframe.
            This might be because:
                - We attached data for some metrics at one point and data for
                  the rest of the metrics later on.
                - We attached data for some fidelity at one point and data for
                  another fidelity later one.
            To achieve that goal, set `combine_with_last_data` to `True`.
            In this case, we will take the most recent previously attached
            data, append the newly attached data to it, attach a new
            Data object with the merged result, and delete the old one.
            Afterwards, calls to `lookup_data_for_trial` will return this
            new combined data object. This operation will also validate that the
            newly added data does not contain observations for metrics that
            already have observations at the same fidelity in the most recent data.
        overwrite_existing_data: By default, we keep around all data that has
            ever been attached to the experiment. However, if we know that
            the incoming data contains all the information we need for a given
            trial, we can replace the existing data for that trial, thereby
            reducing the amount we need to store in the database.

    Returns:
        Timestamp of storage in millis.

    Raises:
        UnsupportedError: If both `combine_with_last_data` and
            `overwrite_existing_data` are set.
        ValueError: If the data is empty, if combined data would duplicate
            observations, or if overwriting would drop previously stored metrics.
    """
    if combine_with_last_data and overwrite_existing_data:
        raise UnsupportedError(
            "Cannot set both combine_with_last_data=True and "
            "overwrite_existing_data=True. Data can either be "
            "combined, or overwritten, or neither."
        )
    data_type = type(data)
    data_init_args = data.serialize_init_args(data)
    if data.df.empty:
        raise ValueError("Data to attach is empty.")
    metrics_not_on_exp = set(data.true_df["metric_name"].values) - set(
        self.metrics.keys()
    )
    if metrics_not_on_exp:
        # Informational only: unknown metrics are stored but will not be
        # auto-fetched unless added to the experiment.
        logger.info(
            f"Attached data has some metrics ({metrics_not_on_exp}) that are "
            "not among the metrics on this experiment. Note that attaching data "
            "will not automatically add those metrics to the experiment. "
            "For these metrics to be automatically fetched by `experiment."
            "fetch_data`, add them via `experiment.add_tracking_metric` or update "
            "the experiment's optimization config."
        )
    # One timestamp for the whole attach operation, so every trial's entry
    # from this call shares the same key.
    cur_time_millis = current_timestamp_in_millis()
    for trial_index, trial_df in data.true_df.groupby("trial_index"):
        current_trial_data = (
            self._data_by_trial[trial_index]
            if trial_index in self._data_by_trial
            else OrderedDict()
        )
        if combine_with_last_data and len(current_trial_data) > 0:
            last_ts, last_data = list(current_trial_data.items())[-1]
            last_data_type = type(last_data)
            # For MapData, rows are additionally keyed by the map keys
            # (e.g. fidelity), so only identical (metric, arm, map-key)
            # combinations count as duplicates.
            merge_keys = ["trial_index", "metric_name", "arm_name"] + (
                last_data.map_keys if issubclass(last_data_type, MapData) else []
            )
            # Inner merge: any surviving row is an observation present in
            # both the existing and the new data, i.e. a duplicate.
            merged = pd.merge(
                last_data.true_df,
                trial_df,
                on=merge_keys,
                how="inner",
            )
            if not merged.empty:
                # Report the actual metric names, not a DataFrame repr.
                raise ValueError(
                    f"Last data for trial {trial_index} already contained "
                    f"observations for metrics "
                    f"{sorted(set(merged['metric_name'].values))}."
                )
            # Replace the previous entry with the merged data under the
            # new timestamp.
            del current_trial_data[last_ts]
            current_trial_data[cur_time_millis] = last_data_type.from_multiple_data(
                [
                    last_data,
                    last_data_type(trial_df, **data_init_args),
                ]
            )
        elif overwrite_existing_data:
            if len(current_trial_data) > 0:
                _, last_data = list(current_trial_data.items())[-1]
                last_data_metrics = set(last_data.df["metric_name"])
                new_data_metrics = set(trial_df["metric_name"])
                # Refuse to overwrite if that would silently drop metrics
                # that were present in the previous data.
                if last_data_metrics.difference(new_data_metrics):
                    raise ValueError(
                        "overwrite_existing_data is True, but the new data "
                        "contains only a subset of the metrics that are "
                        "present in the previous data."
                    )
            current_trial_data = OrderedDict(
                {cur_time_millis: data_type(trial_df, **data_init_args)}
            )
        else:
            current_trial_data[cur_time_millis] = data_type(
                trial_df, **data_init_args
            )
        self._data_by_trial[trial_index] = current_trial_data
    return cur_time_millis