def get_mutual_information(self, dataset_identifier: int,
                           series_identifier: int):
    """
    Calculates the mutual information of a single series with all other columns in a dataset.

    :param dataset_identifier: Unique identifier of a dataset.
    :param series_identifier: Unique identifier of a column.
    :return: DataFrame of mutual information scores against every other series,
             named after the selected series.
    """
    dataset_summary = self.get_dataset(dataset_identifier)
    names = [
        col.name for col in dataset_summary.analysis
        if col.id_ == series_identifier
    ]
    if len(names) == 0:
        raise ValueError("Invalid series identifier specified")
    series_name = names[0]

    mutual_information_data = self.client.get(
        Endpoints.SINGLE_SERIES_MUTUAL_INFORMATION_WITH_OTHER_SERIES(
            dataset_identifier, series_identifier))
    mutual_information_data = pd.DataFrame.from_dict(
        mutual_information_data["data"])
    mutual_information_data.columns = ["Series", "Mutual Information"]
    mutual_information_data.name = series_name
    return mutual_information_data

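# Illustrative usage sketch (not part of the library): assumes `data_interface` is a
# constructed DataInterface, and the dataset/series identifiers below are hypothetical
# values taken from a prior get_dataset call.
#
#   mi = data_interface.get_mutual_information(dataset_identifier=12, series_identifier=3)
#   mi.sort_values("Mutual Information", ascending=False).head()
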
def get_correlations(self, dataset_identifier: int, series_identifier: int):
    """
    Calculates the Pearson correlation of a single series with every other series in a dataset.

    :param dataset_identifier: Unique identifier of a dataset.
    :param series_identifier: Unique identifier of a column.
    :return: DataFrame of Pearson correlations against every other series,
             named after the selected series.
    """
    dataset_summary = self.get_dataset(dataset_identifier)
    names = [
        col.name for col in dataset_summary.analysis
        if col.id_ == series_identifier
    ]
    if len(names) == 0:
        raise ValueError("Invalid series identifier specified")
    series_name = names[0]

    correlation_data = self.client.get(
        Endpoints.SINGLE_SERIES_CORRELATIONS_WITH_OTHER_SERIES(
            dataset_identifier, series_identifier))
    correlations = pd.DataFrame.from_dict(correlation_data["data"])
    correlations.columns = ["Series", "Pearson Correlation"]
    correlations.name = series_name
    return correlations

def download_feature_info_for_stage(
        self, pipeline_id: int, stage_id: int) -> Dict[str, pd.DataFrame]:
    """
    Downloads the feature data as a data frame.

    WARNING: Total size of data is limited to 100 MB * number of horizons
    (i.e. 2 GB if you don't override the class checks for the maximum number
    of horizons that can be selected!).

    :param pipeline_id: ID of a pipeline
    :param stage_id: ID of a stage
    :return: Dictionary of DataFrames of feature data, keyed by horizon, with
             the column names being the transformed features.
    """
    pipeline = self.get_single_pipeline(pipeline_id)
    problem_specification_stage = pipeline.find_stage_by_type(
        StageType.problem_specification)[0]
    horizons = cast(ProblemSpecificationConfig,
                    problem_specification_stage.config).horizons

    feature_df_dict = {}
    for horizon in tqdm(horizons, desc="Fetching Data"):
        data = self.client.get(
            Endpoints.FEATURE_DATA_FOR_STAGE(pipeline_id=pipeline_id,
                                             stage_id=stage_id,
                                             horizon=horizon),
            download=True,
        )
        feature_df_dict[str(horizon)] = pd.read_csv(StringIO(data),
                                                    index_col="time")
    terminal_messages.print_success(
        f"Retrieved Feature Data for Pipeline {pipeline_id} and Stage {stage_id}"
    )
    return feature_df_dict

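# Illustrative usage sketch (hypothetical IDs; assumes `pipelines` is the interface
# instance exposing this method):
#
#   features_by_horizon = pipelines.download_feature_info_for_stage(pipeline_id=7, stage_id=42)
#   for horizon, df in features_by_horizon.items():
#       print(horizon, df.shape)
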
def add_stage_to_pipeline(
    self,
    pipeline_id: int,
    parent_stage_id: int,
    stage_type: StageType,
) -> Pipeline:
    """
    EXPERIMENTAL. Adds a stage after the specified parent_stage_id.

    :param pipeline_id: Unique pipeline identifier
    :param parent_stage_id: ID of the stage preceding the desired location of the added stage
    :param stage_type: Type of stage to add
    :return: Pipeline with the new stage added
    """
    body = {"parentStage": parent_stage_id, "stageType": stage_type.name}
    pipeline = construct_pipeline_class(
        self.client.put(
            Endpoints.STAGES(pipeline_id),
            json=body,
        ))
    return self.get_single_pipeline(pipeline.summary.id_)

def get_dataset(self, identifier: int) -> IndividualDataset:
    """
    Gets a single data set's metadata.

    :param identifier: dataset id as returned from upload_data or list_datasets.
    :return: Individual data set sans data
    """
    response = self.client.get(Endpoints.SINGLE_DATASET(identifier))
    individual_dataset_dictionary = response

    column_data = [
        ColumnPassport(**convert_dict_from_camel_to_snake(col))
        for col in individual_dataset_dictionary["analysis"]
    ]

    dataset = IndividualDataset(
        analysis=column_data,
        summary=DatasetSummary(
            **convert_dict_from_camel_to_snake(
                individual_dataset_dictionary["summary"]), ),
    )

    dataset.summary.columns = [
        RawColumn(name=col.name,
                  id_=col.id_,
                  is_text=col.is_text,
                  is_binary=col.is_binary) for col in column_data
    ]
    return dataset

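# Illustrative usage sketch (hypothetical ID; `data_interface` is a constructed DataInterface).
# The summary's columns are RawColumn objects with name and id_, as populated above:
#
#   dataset = data_interface.get_dataset(identifier=12)
#   for col in dataset.summary.columns:
#       print(col.id_, col.name)
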
def update_config(self, pipeline_id: int, stage_id: int, config: StageConfig):
    """
    Updates the configuration of a stage. All dependent insights will be reset.

    :param pipeline_id: ID of a pipeline
    :param stage_id: ID of a stage
    :param config: stage config
    :return:
    """
    pipeline = self.get_single_pipeline(pipeline_id)
    stages_matching_id = [
        stage for stage in pipeline.stages if stage.id_ == stage_id
    ]
    assert len(
        stages_matching_id) == 1, "No stage found with given identifier"
    assert config.valid_configuration_values, "Invalid numeric configuration specified"

    config_dict = dict(config=convert_dict_from_snake_to_camel(
        json.loads(config.as_json())),
                       preview=False)

    self.client.put(
        Endpoints.UPDATE_STAGE_CONFIGURATION(pipeline_id, stage_id),
        json=config_dict,
    )

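# Illustrative usage sketch (hypothetical IDs). StageType.backtest and the n_backtests
# config field are assumed names here, used only to show the fetch-modify-update flow:
#
#   pipeline = pipelines.get_single_pipeline(pipeline_id=7)
#   stage = pipeline.find_stage_by_type(StageType.backtest)[0]
#   stage.config.n_backtests = 5
#   pipelines.update_config(pipeline_id=7, stage_id=stage.id_, config=stage.config)
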
def run_pipeline(
    self,
    pipeline_id: int,
    synchronous: bool = False,
    verbose: bool = True,
) -> Pipeline:
    """
    Runs a single pipeline with the given ID.

    WARNING: If synchronous=False then please make sure not to overload the
    number of fire-and-forget workers.

    :param pipeline_id: Unique pipeline identifier
    :param synchronous: If synchronous, waits for the pipeline to complete before returning.
    :param verbose: If false, suppress output
    :return: Pipeline object (completed if synchronous)
    """
    pipeline = self.get_single_pipeline(pipeline_id)
    if pipeline.is_complete:
        terminal_messages.print_failure(
            f"Pipeline {pipeline_id} not run - already complete")
        return pipeline

    self.client.post(Endpoints.RUN_PIPELINE(pipeline_id))
    if synchronous:
        self.wait_for_pipeline_completion(pipeline_ids=[pipeline_id],
                                          verbose=verbose)
        pipeline = self.get_single_pipeline(pipeline_id)
    return pipeline

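# Illustrative usage sketch (hypothetical ID; assumes `pipelines` is the interface
# instance exposing this method):
#
#   completed = pipelines.run_pipeline(pipeline_id=7, synchronous=True)
#   print(completed.is_complete)
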
def download_backtest_info_for_stage(
        self, pipeline_id: int, stage_id: int,
        verbose=True) -> Dict[str, pd.DataFrame]:
    """
    Downloads the backtest data of a backtest stage as a data frame. Only
    validation data is shown.

    df columns:
        - truth: the true value at the given time stamp
        - mean: mean prediction at the given time stamp
        - bound_low: lower bound prediction at the given time stamp (3std)
        - bound_high: higher bound prediction at the given time stamp (3std)
        - backtest: The backtest number. This is set by the n_backtests
          configuration in the backtest stage.

    WARNING: This is not the same as the expert backtests; the backtests here
    are finite and discrete. For every-point rolling-retrain backtests please
    run the expert backtest function, which can backtest with retrains between
    any two arbitrary rows.

    :param pipeline_id: ID of a pipeline
    :param stage_id: ID of a stage - MUST BE A BACKTEST STAGE
    :param verbose: Log output to the terminal?
    :return: Dictionary of DataFrames of backtest data, keyed by horizon.
    """
    pipeline = self.get_single_pipeline(pipeline_id)
    problem_specification_stage = pipeline.find_stage_by_type(
        StageType.problem_specification)[0]
    horizons = cast(ProblemSpecificationConfig,
                    problem_specification_stage.config).horizons

    backtest_df_dict = {}
    if verbose:
        pbar = tqdm(desc="Fetching Data")
        pbar.total = len(horizons)
    else:
        pbar = None

    for horizon in horizons:
        if pbar:
            pbar.update()
        data = self.client.get(
            Endpoints.BACKTEST_DATA_FOR_STAGE(pipeline_id=pipeline_id,
                                              stage_id=stage_id,
                                              horizon=horizon),
            download=True,
        )
        backtest_df_dict[str(horizon)] = pd.read_csv(StringIO(data),
                                                     index_col="time")
    if verbose:
        terminal_messages.print_success(
            f"Retrieved Backtest Data for Pipeline {pipeline_id} and Stage {stage_id}"
        )
    return backtest_df_dict

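# Illustrative usage sketch (hypothetical IDs; the stage must be a backtest stage).
# Computes a simple mean absolute error per horizon from the returned frames:
#
#   backtests = pipelines.download_backtest_info_for_stage(pipeline_id=7, stage_id=42)
#   for horizon, df in backtests.items():
#       print(horizon, (df["truth"] - df["mean"]).abs().mean())
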
def get_single_pipeline(self, pipeline_id: int) -> Pipeline:
    """
    Gets a single pipeline by its identifier, including its dataset summary.

    :param pipeline_id: Unique pipeline identifier
    :return: The requested Pipeline
    """
    data_interface = DataInterface(self.client)
    pipeline = self.client.get(
        Endpoints.SINGLE_PIPELINE(pipeline_id=pipeline_id))
    pipeline = construct_pipeline_class(pipeline)
    dataset = data_interface.get_dataset(pipeline.dataset.id_)
    pipeline.dataset = dataset.summary
    return pipeline

def rename_dataset(self, identifier: int, name: str):
    """
    Renames an already existing dataset.

    :param identifier: id of a dataset
    :param name: The new name for the dataset
    :return:
    """
    assert len(
        name) < 100, "Name too long. Please keep to under 100 chars."
    self.client.put(Endpoints.RENAME_DATASET(identifier),
                    body={"newName": name})

def delete_datasets(self, identifiers: List[int] = None):
    """
    Deletes data sets as identified by their identifiers. These may be
    retrieved by calling DataInterface.list_datasets.

    :param identifiers: list of numeric identifiers
    :return:
    """
    pbar = tqdm(identifiers)
    for identifier in pbar:
        pbar.set_description(f"Deleting Data Set with ID: {identifier}")
        self.client.delete(Endpoints.SINGLE_DATASET(identifier))

def delete_pipelines(self, pipeline_ids: List[int]):
    """
    Deletes pipelines as identified by their identifiers. These may be
    retrieved by calling DataInterface.list_pipelines.

    :param pipeline_ids: list of numeric pipeline identifiers
    :return:
    """
    pbar = tqdm(pipeline_ids)
    for identifier in pbar:
        pbar.set_description(f"Deleting Pipeline with ID: {identifier}")
        self.client.delete(Endpoints.SINGLE_PIPELINE(identifier))

def get_series_data_sampled(self, dataset_identifier: int,
                            series_identifier: int):
    """
    Retrieves sampled data of a particular series in a data set. Suitable for
    plotting. In the case of intra-day data this data is aggregated into a
    daily plot.

    :param dataset_identifier: Unique identifier of a dataset.
    :param series_identifier: Unique identifier of a column
    :return:
    """
    response = self.client.get(
        Endpoints.SINGLE_SERIES(dataset_identifier, series_identifier))
    return convert_dict_from_camel_to_snake(response)

def get_feature_info_for_stage(self, pipeline_id: int,
                               stage_id: int) -> pd.DataFrame:
    """
    Returns a list of the features that have passed a given stage, and their
    associated transforms and metadata.

    :param pipeline_id: ID of a pipeline
    :param stage_id: ID of a stage
    :return: Dataframe containing the feature metadata (with each row being a feature)
    """
    response = self.client.get(
        Endpoints.FEATURES_FOR_STAGE(
            pipeline_id=pipeline_id,
            stage_id=stage_id,
        ))

    df = pd.DataFrame.from_records(response["nodesAndLinks"])
    df["active"] = df["active"].astype(bool)
    return df

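# Illustrative usage sketch (hypothetical IDs): keep only features still active after the stage.
#
#   features = pipelines.get_feature_info_for_stage(pipeline_id=7, stage_id=42)
#   active_features = features[features["active"]]
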
def get_insight_for_stage(self, pipeline_id: int,
                          stage_id: int) -> Dict[str, Any]:
    """
    Fetches the high-level output results for a stage. Feature-set information
    is retrieved using get_feature_info_for_stage - the insights here are
    concerned more with the bigger picture.

    :param pipeline_id: ID of a pipeline
    :param stage_id: ID of a stage
    :return: Stage insights in dictionary form
    """
    response = self.client.get(
        Endpoints.INSIGHTS_FOR_STAGE(
            pipeline_id=pipeline_id,
            stage_id=stage_id,
        ))
    return convert_dict_from_camel_to_snake(response)

def get_autocorrelation(self, dataset_identifier: int,
                        series_identifier: int):
    """
    Calculates the autocorrelation function of a single series.

    :param dataset_identifier: Unique identifier of a dataset.
    :param series_identifier: Unique identifier of a column.
    :return: DataFrame of autocorrelation values by lag.
    """
    dataset_summary = self.get_dataset(dataset_identifier)
    names = [
        col.name for col in dataset_summary.analysis
        if col.id_ == series_identifier
    ]
    if len(names) == 0:
        raise ValueError("Invalid series identifier specified")
    series_name = names[0]

    acf = self.client.get(
        Endpoints.SINGLE_SERIES_AUTOCORRELATION(dataset_identifier,
                                                series_identifier))
    acf_df = pd.DataFrame(acf["data"])
    acf_df.columns = ["Lag", f"Correlation: {series_name}"]
    return acf_df

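# Illustrative usage sketch (hypothetical IDs): plot the ACF with pandas' built-in plotting.
#
#   acf_df = data_interface.get_autocorrelation(dataset_identifier=12, series_identifier=3)
#   acf_df.plot(x="Lag", y=acf_df.columns[1])
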
def fit_predict(self, pipeline_id: int, data: pd.DataFrame, horizon=-1):
    """
    Predicts with new data. Accepts a dataframe with column headers named
    identically to the original data set used to train the pipeline. The date
    column must be present in the dataframe.

    :param horizon: Horizon ahead to predict at. Must be a horizon that is
                    selected in the forecast specification of the pipeline.
    :param pipeline_id: unique pipeline identifier
    :param data: dataframe for prediction
    :return: Returns the predictions at the specified horizon
    """
    str_buffer = io.StringIO(data.to_csv(encoding="utf-8", index=False))
    str_buffer.seek(0)
    str_buffer.name = "test_data"

    pipeline = self.get_single_pipeline(pipeline_id)
    regressor_type = cast(
        PredictionStageConfig,
        pipeline.find_stage_by_type(
            StageType.prediction)[0].config).regressor

    if horizon == -1:
        horizon = cast(ProblemSpecificationConfig,
                       pipeline.stages[0].config).horizons[0]

    options = {
        "alignTo": "",
        "missingDataStrategy": {
            "ffill": {
                "enabled": False
            },
            "replaceMissing": {
                "enabled": False,
                "replaceWith": 1
            },
        },
    }

    files = dict(file=str_buffer, follow_redirects=True)
    body = dict(options=json.dumps(options),
                horizon=horizon,
                regressor=regressor_type.name)

    response = self.client.post(
        Endpoints.PREDICT_FOR_SINGLE_PIPELINE_AND_HORIZON(pipeline_id),
        files=files,
        body=body,
    )

    predictions = []
    for target_data in response:
        predictions.append(
            Predictions(
                mean=pd.Series(target_data["predictions"]["mean"]["data"]),
                cb_low=pd.Series(
                    target_data["predictions"]["cbLow"]["data"]),
                cb_high=pd.Series(
                    target_data["predictions"]["cbHigh"]["data"]),
                confidence=pd.Series(
                    target_data["predictions"]["confidence"]),
                regressor_importances=target_data["predictions"]
                ["regressorImportances"],
                name=target_data["targetOriginalColumnName"],
            ).data)

    return pd.concat(predictions).T

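# Illustrative usage sketch (hypothetical ID and file name): `new_data` must carry the same
# column headers, including the date column, as the data set the pipeline was trained on.
#
#   new_data = pd.read_csv("recent_observations.csv")  # hypothetical input file
#   forecasts = pipelines.fit_predict(pipeline_id=7, data=new_data, horizon=10)
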
def run_expert_backtest_between_two_rows(self,
                                         horizon: int,
                                         start_row: int,
                                         end_row: int,
                                         n_training_rows_for_backtest: int,
                                         pipeline_id: int,
                                         stage_id: int,
                                         verbose=True):
    """
    EXPERT FUNCTIONALITY - Not exposed in the Horizon User Interface!

    WARNING: This function contains no guards to ensure that the rows are not
    in the feature training data. The method
    run_expert_backtest_for_validation_data ensures that the backtests are run
    over valid rows.

    Runs a rolling retrain between two rows. This is a synchronous request
    that might take a very long time to compute; a separate model is trained
    for each point in the backtest range.

    df columns:
        - truth: the true value at the given time stamp
        - mean: mean prediction at the given time stamp
        - bound_low: lower bound prediction at the given time stamp (3std)
        - bound_high: higher bound prediction at the given time stamp (3std)
        - backtest: The backtest number. This is set by the n_backtests
          configuration in the backtest stage.
        - timestamps: Timestamp

    :param horizon: Forecast horizon to run backtests over
    :param start_row: Row to start backtest
    :param end_row: Row to backtest to
    :param n_training_rows_for_backtest: Number of rows to train on for each
                                         rolling train / backtest
    :param pipeline_id: ID of a pipeline
    :param stage_id: ID of a stage
    :param verbose: print to console
    :return: Dataframe of backtest results
    """
    if verbose:
        terminal_messages.print_expert_message(
            f"Initialising Backtest from row {start_row} to row {end_row} (Pipeline {pipeline_id})"
        )
    response = self.client.get(
        Endpoints.EXPERT_BACKTEST_FOR_STAGE_AND_HORIZON(
            pipeline_id=pipeline_id,
            horizon=horizon,
            first_row=start_row,
            last_row=end_row,
            n_training_rows=n_training_rows_for_backtest,
            stage_id=stage_id,
        ))
    if verbose:
        terminal_messages.print_success("Expert Backtest Complete")

    df = pd.DataFrame.from_dict(
        convert_dict_from_camel_to_snake(response), )
    df.drop("neg_rmse", axis=1, inplace=True)
    df.set_index("timestamps", inplace=True)
    df.index = pd.to_datetime(df.index)
    return df

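# Illustrative usage sketch (hypothetical values): rolls a retrain over rows 500-600,
# training each model on the preceding 400 rows.
#
#   backtest_df = pipelines.run_expert_backtest_between_two_rows(
#       horizon=10, start_row=500, end_row=600,
#       n_training_rows_for_backtest=400, pipeline_id=7, stage_id=42)
#   print((backtest_df["truth"] - backtest_df["mean"]).abs().mean())
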
def upload_data(
    self,
    data: pd.DataFrame,
    name: str,
    forward_fill_missing_values: bool = False,
    replace_missing_values: bool = False,
    align_to_column: str = "",
) -> IndividualDataset:
    """
    Uploads the given data set to the Horizon API.

    :param align_to_column: Aligns data to a column if the data is misaligned.
                            This should be selected as the target if data is
                            misaligned or has missing values. Selecting this
                            will also cause missing data in the specified
                            column to be dropped.
    :param data: DataFrame to be uploaded
    :param name: Name of the data set to be uploaded
    :param forward_fill_missing_values: Forward-fill missing values
    :param replace_missing_values: Replace missing values
    :return: A summary of the uploaded data set.
    """
    str_buffer = io.StringIO(data.to_csv(encoding="utf-8", index=False))
    str_buffer.seek(0)
    str_buffer.name = name

    if forward_fill_missing_values and not align_to_column:
        print_warning(
            "Forward-fill selected without alignment to a column. Please be aware that "
            "if you choose a target column that has been forward-filled this will yield "
            "scientifically inaccurate results")

    options = {
        "alignTo": align_to_column,
        "missingDataStrategy": {
            "ffill": {
                "enabled": forward_fill_missing_values
            },
            "replaceMissing": {
                "enabled": replace_missing_values,
                "replaceWith": 1
            },
        },
    }

    request_data = dict(file=str_buffer, follow_redirects=True)
    data = dict(options=json.dumps(options))

    response = self.client.post(
        endpoint=Endpoints.UPLOAD_DATA,
        body=data,
        files=request_data,
        on_success_message=f"Data set '{name}' uploaded. Analyzing...",
    )

    ingestion_process = IngestionProcess(
        **convert_dict_from_camel_to_snake(response))

    while ingestion_process.status not in ["completed", "error"]:
        sleep(0.5)
        response = self.client.get(
            endpoint=Endpoints.SINGLE_INGESTION_PROCESS(
                ingestion_process.id_), )
        ingestion_process = IngestionProcess(
            **convert_dict_from_camel_to_snake(response))

    if ingestion_process.status == "error":
        raise ValueError(
            f"Error analyzing data\n{ingestion_process.error}")

    return self.get_dataset(ingestion_process.dataset_id)

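# Illustrative usage sketch: uploads a small DataFrame and waits for analysis to complete.
# Assumes `data_interface` is a DataInterface built from an authenticated client; the column
# names and values below are hypothetical.
#
#   df = pd.DataFrame({"date": pd.date_range("2020-01-01", periods=100, freq="D"),
#                      "target": range(100)})
#   dataset = data_interface.upload_data(data=df, name="example_data")
#   print(dataset.summary)
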
import requests

from mf_horizon_client.client.datasets.data_interface import DataInterface
from mf_horizon_client.client.pipelines.pipeline_interface import PipelineInterface
from mf_horizon_client.client.session import HorizonSession
from mf_horizon_client.client.warnings import Warnings
from mf_horizon_client.endpoints import Endpoints

DEFAULT_MAX_RETRIES = 3
DEFAULT_CONCURRENT_TASKS = 1
ENDPOINTS = Endpoints()


class HorizonClient(HorizonSession):
    """Sets up a connection to Horizon.

    Args:
        server_url (str): URL of your Horizon server
        api_key (str): Your personal API key
        max_retries (int, default 3): How many times to retry a request if a
            connection error occurs.
        max_concurrent_pipelines (int, default 1): The maximum number of
            pipelines that may be run at any one time. This must be set up from
            the deployment configuration.
    """

    def __init__(
        self,
        server_url: str,
        api_key: str,
        max_retries: int = DEFAULT_MAX_RETRIES,
        max_concurrent_pipelines: int = DEFAULT_CONCURRENT_TASKS,
    ) -> None: