def __init__(
        self,
        df_path: str,
        forecast_total: int,
        use_real_precip=True,
        use_real_temp=True,
        target_supplied=True,
        interpolate=False,
        sort_column_clone=None,
        **kwargs):
    """
    Read the CSV behind ``df_path`` into ``self.original_df`` and store the forecast settings.

    :param str df_path: Location of the CSV; it is resolved through ``get_data`` before being read.
    :param int forecast_total: Stored unchanged on ``self.forecast_total``.
    :param use_real_precip: Stored unchanged on ``self.use_real_precip``.
    :param use_real_temp: Stored unchanged on ``self.use_real_temp``.
    :param target_supplied: Stored unchanged on ``self.target_supplied``.
    :param interpolate: Falsy to skip interpolation. Otherwise an object indexed by ``"method"``
        (the key looked up in ``interpolate_dict``) and ``"params"`` (keyword arguments for it).
    :param sort_column_clone: When truthy, passed as ``by=`` to ``DataFrame.sort_values``.
    :param kwargs: Forwarded untouched to the parent initializer.
    """
    super().__init__(**kwargs)
    resolved_path = get_data(df_path)
    frame = pd.read_csv(resolved_path)
    if interpolate:
        fill_fn = interpolate_dict[interpolate["method"]]
        frame = fill_fn(frame, **interpolate["params"])
    if sort_column_clone:
        frame = frame.sort_values(by=sort_column_clone)
    self.original_df = frame
    for message in ("CSV Path below", resolved_path):
        print(message)
    self.forecast_total = forecast_total
    self.use_real_temp = use_real_temp
    self.use_real_precip = use_real_precip
    self.target_supplied = target_supplied
    # Re-cast the datetime column (read_csv leaves it as text) and remember
    # each row's index label in a regular column.
    frame["datetime"] = frame["datetime"].astype("datetime64[ns]")
    frame["original_index"] = frame.index
    # self.relevant_cols3 and self.df are not assigned in this method; they are
    # presumably populated by the parent initializer -- confirm against the base class.
    extra_cols = self.relevant_cols3
    if len(extra_cols) > 0:
        frame[extra_cols] = self.df[extra_cols]
def __init__(
        self,
        model_base: str,
        training_data: str,
        validation_data: str,
        test_data: str,
        params: Dict):
    """
    Build the model, the three data loaders, the optional storage client and the wandb handle.

    :param model_base: Passed as the first argument to ``self.load_model``.
    :param training_data: Passed to ``self.make_data_load`` with the ``"train"`` tag.
    :param validation_data: Passed to ``self.make_data_load`` with the ``"valid"`` tag.
    :param test_data: Passed to ``self.make_data_load`` with the ``"test"`` tag.
    :param params: Configuration dictionary, kept on ``self.params``. It is read for
        ``"model_params"``, ``"dataset_params"`` and ``"inference_params"``, and optionally
        ``"weight_path"`` and ``"GCS"``. NOTE: it is mutated in place -- ``"weight_path"`` is
        replaced by the result of ``get_data`` and ``dataset_params["forecast_test_len"]`` is
        set from ``inference_params["hours_to_forecast"]``.
    """
    self.params = params

    has_weights = "weight_path" in params
    if has_weights:
        # Replace the configured location with whatever get_data hands back.
        params["weight_path"] = get_data(params["weight_path"])
    weight_args = (params["weight_path"],) if has_weights else ()
    self.model = self.load_model(model_base, params["model_params"], *weight_args)

    # The dataset configuration receives the inference horizon as its test length.
    horizon = params["inference_params"]["hours_to_forecast"]
    params["dataset_params"]["forecast_test_len"] = horizon

    splits = (
        ("training", training_data, "train"),
        ("validation", validation_data, "valid"),
        ("test_data", test_data, "test"),
    )
    for attr_name, data_path, split_tag in splits:
        loader = self.make_data_load(data_path, params["dataset_params"], split_tag)
        setattr(self, attr_name, loader)

    use_gcs = "GCS" in self.params and self.params["GCS"]
    self.gcs_client = get_storage_client() if use_gcs else None
    self.wandb = self.wandb_init()
def __init__(
        self,
        file_path: str,
        forecast_history: int,
        forecast_length: int,
        target_col: List,
        relevant_cols: List,
        scaling=None,
        start_stamp: int = 0,
        end_stamp: int = None,
        gcp_service_key: Optional[str] = None,
        interpolate_param: bool = True,
        sort_column=None,
        feature_params=None):
    """
    A data loader that takes a CSV file and properly batches for use in training/eval a PyTorch model

    :param file_path: The path to the CSV file you wish to use.
    :param forecast_history: This is the length of the historical time series data you wish to
        utilize for forecasting
    :param forecast_length: The number of time steps to forecast ahead (for transformer this must
        equal history_length)
    :param target_col: The target column or columns you want to predict. If you only have one
        still use a list ['cfs']
    :param relevant_cols: Supply column names you wish to predict in the forecast (others will not
        be used)
    :param scaling: (highly recommended) If provided should be a subclass of
        sklearn.base.BaseEstimator and sklearn.base.TransformerMixin (i.e. StandardScaler,
        MaxAbsScaler, MinMaxScaler, etc). Note without a scaler the loss is likely to explode and
        cause infinite loss which will corrupt weights
    :param start_stamp int: Optional if you want to only use part of a CSV for training,
        validation or testing supply these
    :param end_stamp int: Optional if you want to only use part of a CSV for training,
        validation, or testing supply these
    :param gcp_service_key: Forwarded as the second argument to ``get_data`` when resolving
        ``file_path``.
    :param interpolate_param: Interpolation configuration. Despite the ``bool`` annotation,
        interpolation only runs when this is an object indexed by ``"method"`` (a key of
        ``interpolate_dict``) and ``"params"`` (keyword arguments for that method). A falsy value
        or a bare ``True`` (the default) carries no method, so the data is used as read.
    :param sort_column str: The column to sort the time series on prior to forecast. It is parsed
        with ``pd.to_datetime`` first.
    :param feature_params: When truthy (and ``sort_column`` is set), passed to ``feature_fix``,
        which returns the updated frame plus extra column names appended to ``relevant_cols``.

    Side effect: the final working frame is written to ``temp_df.csv`` in the current directory.
    """
    super().__init__()
    self.forecast_history = forecast_history
    self.forecast_length = forecast_length
    print("interpolate should be below")
    self.local_file_path = get_data(file_path, gcp_service_key)
    df = pd.read_csv(self.local_file_path)
    relevant_cols3 = []
    if sort_column:
        df[sort_column] = pd.to_datetime(df[sort_column])
        df = df.sort_values(by=sort_column)
        if feature_params:
            df, relevant_cols3 = feature_fix(feature_params, sort_column, df)
            print("Relevant cols are")
    print(relevant_cols3)
    self.relevant_cols3 = relevant_cols3
    selected_cols = relevant_cols + relevant_cols3
    # A bare bool (including the default True) cannot be subscripted for a
    # "method", so only a real configuration object triggers interpolation.
    # Previously the default raised TypeError here.
    if interpolate_param and not isinstance(interpolate_param, bool):
        fill_fn = interpolate_dict[interpolate_param["method"]]
        interpolated_df = fill_fn(df, **interpolate_param["params"])
        self.df = interpolated_df[selected_cols]
    else:
        self.df = df[selected_cols]
    print("Now loading" + file_path)
    # original_df keeps the sorted / feature-augmented frame *before* interpolation.
    self.original_df = df
    self.scale = None
    # One slice covers every start/end combination: a start of 0 and an end
    # of None both mean "unbounded on that side".
    if start_stamp != 0 or end_stamp is not None:
        self.df = self.df[start_stamp:end_stamp]
    if scaling is not None:
        print("scaling now")
        self.scale = scaling
        temp_df = self.scale.fit_transform(self.df[relevant_cols])
        # We define a second scaler to scale the end output
        # back to normal as models might not necessarily predict
        # other present time series values.
        targ_scale_class = self.scale.__class__
        self.targ_scaler = targ_scale_class()
        print(len(target_col))
        if len(target_col) == 1:
            # A single target is reshaped into one 2-D column before fitting.
            self.targ_scaler.fit_transform(
                self.df[target_col[0]].values.reshape(-1, 1))
        else:
            self.targ_scaler.fit_transform(self.df[target_col])
        self.df[relevant_cols] = temp_df
    # count() skips NaNs, so any column shorter than the frame has missing values.
    if (len(self.df) - self.df.count()).max() != 0:
        print(
            "Error nan values detected in data. Please run interpolate ffill or bfill on data"
        )
    self.targ_col = target_col
    self.df.to_csv("temp_df.csv")