def segment(dir_path):
    """Create segments of time series.

    Args:
        dir_path (str): Path to directory containing files.

    """

    target = yaml.safe_load(open("params.yaml"))["clean"]["target"]

    filepaths = find_files(dir_path, file_extension=".csv")

    output_columns = np.array(
        pd.read_csv(DATA_PATH / OUTPUT_FEATURES_PATH, index_col=0)
    ).reshape(-1)

    dfs = []

    for filepath in filepaths:
        df = pd.read_csv(filepath, index_col=0)
        # df = df.iloc[10000:90000, :]
        # df = df.iloc[:, :-1]
        dfs.append(df)

    combined_df = pd.concat(dfs, ignore_index=True)

    # Downsample by keeping every 10th row.
    combined_df = combined_df[::10]
    print(combined_df)

    n_rows = len(combined_df)
    segment_size = 100
    n_segments = int(n_rows / segment_size)

    # Build an id array that assigns each block of `segment_size` rows to one
    # segment (1, 2, ..., n_segments).
    ids = np.arange(1, n_segments + 1, 1)
    idlist = np.ones(segment_size)

    for i in ids[1:]:
        idlist = np.concatenate((idlist, np.ones(segment_size) * i))

    idlist = np.array(idlist, dtype=np.int32)

    # Per-segment ids are currently disabled; all rows share a single id, so
    # roll_time_series treats the data as one continuous series.
    # combined_df = combined_df.iloc[:len(idlist), :]
    # combined_df["id"] = idlist
    combined_df["id"] = np.ones(n_rows)

    # y = []
    # for i in ids:
    #     target_value = combined_df[combined_df["id"] == i][target].iloc[-1]
    #     y.append(target_value)
    # y = pd.Series(y)
    # y.index = y.index + 1
    # combined_df.index.name = "index"
    # print(y)

    print(combined_df)
    # print(np.unique(y))

    df_rolled = roll_time_series(combined_df, column_id="id", column_sort=None)
    print(df_rolled)

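# Illustrative only, not part of the pipeline: the per-segment id array that
# segment() builds with a loop above can be produced in a single step with
# np.repeat. The helper name below is hypothetical.
def _example_segment_ids(n_rows, segment_size=100):
    """Assign each consecutive block of `segment_size` rows one integer id."""
    n_segments = n_rows // segment_size
    return np.repeat(np.arange(1, n_segments + 1), segment_size).astype(np.int32)
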
def clean(dir_path):
    """Clean up inputs.

    Args:
        dir_path (str): Path to directory containing files.

    """

    # Load parameters
    dataset = yaml.safe_load(open("params.yaml"))["profile"]["dataset"]
    """Name of data set, which must be the name of a subfolder of
    'assets/data/raw', in which to look for data."""

    combine_files = yaml.safe_load(
        open("params.yaml"))["clean"]["combine_files"]

    # If no name of data set is given, all files present in 'assets/data/raw'
    # will be used.
    if dataset is not None:
        dir_path += "/" + dataset

    filepaths = find_files(dir_path, file_extension=".csv")

    DATA_CLEANED_PATH.mkdir(parents=True, exist_ok=True)

    # Find removable variables from profiling report
    removable_variables = parse_profile_warnings()

    dfs = []

    for filepath in filepaths:
        # Read csv
        df = pd.read_csv(filepath, index_col=0)

        for c in removable_variables:
            del df[c]

        df.dropna(inplace=True)

        dfs.append(df)

    combined_df = pd.concat(dfs, ignore_index=True)

    if combine_files:
        combined_df.to_csv(DATA_CLEANED_PATH /
                           (os.path.basename(dataset + "-cleaned.csv")))
    else:
        for filepath, df in zip(filepaths, dfs):
            df.to_csv(DATA_CLEANED_PATH /
                      (os.path.basename(filepath).replace(".", "-cleaned.")))

def combine(dir_path):
    """Combine data from multiple input files into one dataset.

    Args:
        dir_path (str): Path to directory containing files.

    """

    filepaths = find_files(dir_path, file_extension=".npz")

    DATA_COMBINED_PATH.mkdir(parents=True, exist_ok=True)

    train_inputs = []
    train_outputs = []
    test_inputs = []
    test_outputs = []
    calibrate_inputs = []
    calibrate_outputs = []

    for filepath in filepaths:
        infile = np.load(filepath)

        if "train" in filepath:
            train_inputs.append(infile["X"])
            train_outputs.append(infile["y"])
        elif "test" in filepath:
            test_inputs.append(infile["X"])
            test_outputs.append(infile["y"])
        elif "calibrate" in filepath:
            calibrate_inputs.append(infile["X"])
            calibrate_outputs.append(infile["y"])

    X_train = np.concatenate(train_inputs)
    y_train = np.concatenate(train_outputs)
    X_test = np.concatenate(test_inputs)
    y_test = np.concatenate(test_outputs)

    if len(calibrate_inputs) > 0:
        X_calibrate = np.concatenate(calibrate_inputs)
        y_calibrate = np.concatenate(calibrate_outputs)

    np.savez(DATA_COMBINED_PATH / "train.npz", X=X_train, y=y_train)
    np.savez(DATA_COMBINED_PATH / "test.npz", X=X_test, y=y_test)

    if len(calibrate_inputs) > 0:
        np.savez(DATA_COMBINED_PATH / "calibrate.npz",
                 X=X_calibrate, y=y_calibrate)

def profile(dir_path):
    """Create a profile report of a data set.

    Reads data from a set of input files, and creates a report containing
    profiling of the data. This profiling consists of various statistical
    properties. The report is stored in two formats:

    - HTML: For visual inspection
    - JSON: For subsequent automatic processing of results

    Args:
        dir_path (str): Path to directory containing files.

    """

    dataset = yaml.safe_load(open("params.yaml"))["profile"]["dataset"]
    """Name of data set, which must be the name of a subfolder of
    'assets/data/raw', in which to look for data."""

    # If no name of data set is given, all files present in 'assets/data/raw'
    # will be used.
    if dataset is not None:
        dir_path += "/" + dataset

    filepaths = find_files(dir_path, file_extension=".csv")

    dfs = []

    for filepath in filepaths:
        dfs.append(pd.read_csv(filepath))

    combined_df = pd.concat(dfs, ignore_index=True)

    # Generate report.
    profile = ProfileReport(
        combined_df,
        title="Profiling Analysis",
        config_file="src/profile.yaml",
        lazy=False,
        sort=None,
    )

    # Create folder for profiling report
    PROFILE_PATH.mkdir(parents=True, exist_ok=True)

    # Save report to files.
    profile.to_file(PROFILE_PATH / "profile.html")
    profile.to_file(PROFILE_PATH / "profile.json")

def sequentialize(dir_path):
    """Make sequences out of the scaled data.

    Args:
        dir_path (str): Path to directory containing files.

    """

    filepaths = find_files(dir_path, file_extension=".npz")

    DATA_SEQUENTIALIZED_PATH.mkdir(parents=True, exist_ok=True)

    params = yaml.safe_load(open("params.yaml"))["sequentialize"]
    net = yaml.safe_load(open("params.yaml"))["train"]["net"]

    hist_size = params["hist_size"]
    target_size = params["target_size"]

    if target_size > hist_size:
        raise ValueError("target_size cannot be larger than hist_size.")

    for filepath in filepaths:
        infile = np.load(filepath)

        X = infile["X"]
        y = infile["y"]

        # Combine y and X to get correct format for sequentializing
        data = np.hstack((y, X))

        # Split into sequences
        X, y = split_sequences(data, hist_size, target_size=target_size)

        if net == "dnn":
            X = flatten_sequentialized(X)

        # Save X and y into a binary file
        np.savez(
            DATA_SEQUENTIALIZED_PATH /
            (os.path.basename(filepath).replace(
                "scaled.npz", "sequentialized.npz")),
            X=X,
            y=y,
        )

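# Illustrative only: split_sequences() is defined elsewhere in this project.
# The sketch below shows one plausible sliding-window interpretation, assuming
# the first column of `data` is the target and each window of `hist_size` rows
# predicts the following `target_size` target values. The name and behaviour
# here are assumptions, not the project's actual implementation.
def _example_split_sequences(data, hist_size, target_size=1):
    """Sliding-window split of a 2D array into (X, y) sequence pairs."""
    X, y = [], []
    for i in range(len(data) - hist_size - target_size + 1):
        X.append(data[i:i + hist_size, :])
        y.append(data[i + hist_size:i + hist_size + target_size, 0])
    return np.array(X), np.array(y)
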
def split(dir_path):
    """Split data into train and test set.

    Training files and test files are saved to different folders.

    Args:
        dir_path (str): Path to directory containing files.

    """

    params = yaml.safe_load(open("params.yaml"))["split"]

    DATA_SPLIT_PATH.mkdir(parents=True, exist_ok=True)

    filepaths = find_files(dir_path, file_extension=".csv")

    # Handle special case where there is only one workout file.
    if isinstance(filepaths, str) or len(filepaths) == 1:
        filepath = filepaths if isinstance(filepaths, str) else filepaths[0]

        df = pd.read_csv(filepath, index_col=0)

        train_size = int(len(df) * params["train_split"])

        # This is used when using conformal predictors. It specifies the
        # calibration set size. Set 'calibrate_split' to 0 in params.yaml if
        # no calibration is to be done.
        calibrate_size = int(len(df) * params["calibrate_split"])

        df_train = None
        df_test = None
        df_calibrate = None

        if params["calibrate_split"] == 0:
            df_train = df.iloc[:train_size]
            df_test = df.iloc[train_size:]
        else:
            df_train = df.iloc[:train_size]
            df_calibrate = df.iloc[train_size:train_size + calibrate_size]
            df_test = df.iloc[train_size + calibrate_size:]

        df_train.to_csv(
            DATA_SPLIT_PATH /
            (os.path.basename(filepath).replace("featurized", "train")))

        df_test.to_csv(
            DATA_SPLIT_PATH /
            (os.path.basename(filepath).replace("featurized", "test")))

        if params["calibrate_split"] != 0:
            df_calibrate.to_csv(
                DATA_SPLIT_PATH /
                (os.path.basename(filepath).replace("featurized",
                                                    "calibrate")))

    else:
        # Parameter 'train_split' is used to find the number of files in the
        # training set.
        file_split = int(len(filepaths) * params["train_split"])
        file_split_calibrate = int(len(filepaths) * params["calibrate_split"])

        training_files = []
        test_files = []
        calibrate_files = []

        if file_split_calibrate == 0:
            training_files = filepaths[:file_split]
            test_files = filepaths[file_split:]
        else:
            training_files = filepaths[:file_split]
            calibrate_files = filepaths[file_split:file_split +
                                        file_split_calibrate]
            test_files = filepaths[file_split + file_split_calibrate:]

        for filepath in filepaths:
            df = pd.read_csv(filepath, index_col=0)

            if filepath in training_files:
                df.to_csv(
                    DATA_SPLIT_PATH /
                    (os.path.basename(filepath).replace("featurized",
                                                        "train")))
            elif filepath in test_files:
                df.to_csv(
                    DATA_SPLIT_PATH /
                    (os.path.basename(filepath).replace("featurized",
                                                        "test")))
            elif filepath in calibrate_files:
                df.to_csv(
                    DATA_SPLIT_PATH /
                    (os.path.basename(filepath).replace("featurized",
                                                        "calibrate")))

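# For reference, split() expects a params.yaml section of roughly this shape
# (the values are examples only):
#
#   split:
#     train_split: 0.6       # fraction of data/files used for training
#     calibrate_split: 0.0   # fraction used for conformal calibration; 0 disables it
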
def clean(dir_path, save_results_to_file=True):
    """Clean up inputs.

    Args:
        dir_path (str): Path to directory containing files.
        save_results_to_file (bool): When creating a virtual sensor, the
            results should be saved to file for more efficient reruns of the
            pipeline. When running the virtual sensor, there is no need to
            save these intermediate results to file.

    """

    # Load parameters
    dataset = yaml.safe_load(open("params.yaml"))["profile"]["dataset"]
    params = yaml.safe_load(open("params.yaml"))
    combine_files = params["clean"]["combine_files"]
    target = params["clean"]["target"]
    classification = params["clean"]["classification"]
    onehot_encode_target = params["clean"]["onehot_encode_target"]

    # If no name of data set is given, all files present in 'assets/data/raw'
    # will be used.
    if dataset is not None:
        dir_path += "/" + dataset

    filepaths = find_files(dir_path, file_extension=".csv")

    DATA_CLEANED_PATH.mkdir(parents=True, exist_ok=True)

    # Find removable variables from profiling report
    removable_variables = parse_profile_warnings()

    dfs = []

    for filepath in filepaths:
        # Read csv
        df = pd.read_csv(filepath)

        # If the first column is an index column, remove it.
        if df.iloc[:, 0].is_monotonic:
            df = df.iloc[:, 1:]

        for column in removable_variables:
            del df[column]

        df.dropna(inplace=True)

        dfs.append(df)

    combined_df = pd.concat(dfs, ignore_index=True)

    if classification:
        if onehot_encode_target and len(np.unique(combined_df[target])) > 2:
            encoder = LabelBinarizer()
        else:
            if onehot_encode_target:
                raise ValueError(
                    "Parameter 'onehot_encode_target' is set to True, but "
                    "the target is binary. Change the parameter to False in "
                    "order to use this pipeline."
                )
            encoder = LabelEncoder()

        target_col = np.array(combined_df[target]).reshape(-1)
        encoder.fit(target_col)
        # print(f"Classes: {encoder.classes_}")
        # print(f"Encoded classes: {encoder.transform(encoder.classes_)}")

        combined_df, output_columns = encode_target(encoder, combined_df,
                                                    target)

        for i in range(len(dfs)):
            dfs[i], _ = encode_target(encoder, dfs[i], target)

    else:
        output_columns = [target]

    if combine_files:
        combined_df.to_csv(DATA_CLEANED_PATH / "data-cleaned.csv")
    else:
        for filepath, df in zip(filepaths, dfs):
            df.to_csv(
                DATA_CLEANED_PATH /
                (os.path.basename(filepath).replace(".", "-cleaned.")))

    pd.DataFrame(output_columns).to_csv(DATA_PATH / OUTPUT_FEATURES_PATH)

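# Illustrative only: encode_target() is a project helper used above. The
# sketch below shows one way it could behave, assuming it replaces the target
# column with its encoded form and returns the resulting output column names.
# The helper name, column naming scheme, and exact behaviour are assumptions.
def _example_encode_target(encoder, df, target):
    """Encode `df[target]` with a fitted encoder; return (df, output_columns)."""
    encoded = encoder.transform(np.array(df[target]).reshape(-1))

    if encoded.ndim > 1 and encoded.shape[1] > 1:
        # One-hot case (e.g. LabelBinarizer with more than two classes).
        df = df.drop(columns=[target])
        output_columns = [f"{target}_{c}" for c in encoder.classes_]
        for i, name in enumerate(output_columns):
            df[name] = encoded[:, i]
    else:
        # Label-encoded case: the target stays a single column.
        df[target] = np.asarray(encoded).reshape(-1)
        output_columns = [target]

    return df, output_columns
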
def featurize(dir_path):
    """Clean up inputs and add features to data set.

    Args:
        dir_path (str): Path to directory containing files.

    """

    # Load parameters
    params = yaml.safe_load(open("params.yaml"))["featurize"]

    features = params["features"]
    """Features to include in data set."""

    target = yaml.safe_load(open("params.yaml"))["clean"]["target"]
    """Variable to use as target."""

    filepaths = find_files(dir_path, file_extension=".csv")

    DATA_FEATURIZED_PATH.mkdir(parents=True, exist_ok=True)

    # Read all data to fit one-hot encoder
    dfs = []

    for filepath in filepaths:
        df = pd.read_csv(filepath)
        dfs.append(df)

    combined_df = pd.concat(dfs, ignore_index=True)

    categorical_variables = find_categorical_variables()
    print(f"Columns: {combined_df.columns}")
    print(f"Cat: {categorical_variables}")

    # Check if some categorical variables have been removed in the cleaning
    # process, and if so, remove them from the list.
    # removables = []
    # for v in categorical_variables:
    #     if v not in combined_df.columns:
    #         removables.append(v)
    #         # categorical_variables.remove(v)
    # print(removables)
    # categorical_variables.remove(removables)
    # print(f"Cat: {categorical_variables}")
    # print(combined_df[categorical_variables])
    # categorical_encoder = OneHotEncoder()
    # categorical_encoder.fit(combined_df)

    for filepath in filepaths:
        # Read csv
        df = pd.read_csv(filepath)

        # Move target column to the beginning of dataframe
        df = move_column(df, column_name=target, new_idx=0)

        # If no features are specified, use all columns as features
        # TODO: Maybe not the most robust way to test this
        if not isinstance(params["features"], list):
            features = df.columns

        # Check if wanted features from params.yaml exist in the data
        for feature in features:
            if feature not in df.columns:
                print(f"Feature {feature} not found!")

        # TODO: Engineer features. At the moment no engineered features exist!
        df = add_features(df, features)

        for col in df.columns:
            # Remove feature from input. This is useful in the case that a raw
            # feature is used to engineer a feature, but the raw feature
            # itself should not be a part of the input.
            if col not in features and col != target:
                del df[col]
            # Remove feature if it is non-numeric
            elif not is_numeric_dtype(df[col]):
                del df[col]

        # Save data
        df.to_csv(
            DATA_FEATURIZED_PATH /
            (os.path.basename(filepath).replace(".", "-featurized.")))

    # Save list of features used
    pd.DataFrame(df.columns).to_csv(DATA_PATH / "input_columns.csv")

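# Illustrative only: move_column() is a project helper used above to put the
# target column first. A minimal sketch of that behaviour could look like the
# following; the name and signature mirror the call site, but the body is an
# assumption, not the project's actual implementation.
def _example_move_column(df, column_name, new_idx):
    """Return `df` with `column_name` relocated to position `new_idx`."""
    columns = list(df.columns)
    columns.insert(new_idx, columns.pop(columns.index(column_name)))
    return df[columns]
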
def scale(dir_path):
    """Scale training and test data.

    Args:
        dir_path (str): Path to directory containing files.

    """

    filepaths = find_files(dir_path, file_extension=".csv")

    DATA_SCALED_PATH.mkdir(parents=True, exist_ok=True)

    params = yaml.safe_load(open("params.yaml"))["scale"]
    input_method = params["input"]
    output_method = params["output"]

    # When the method is "none", a StandardScaler is still created (and
    # fitted below), but its transform is never applied to the data.
    if input_method == "standard":
        scaler = StandardScaler()
    elif input_method == "minmax":
        scaler = MinMaxScaler()
    elif input_method == "robust":
        scaler = RobustScaler()
    elif input_method == "none":
        scaler = StandardScaler()
    else:
        raise NotImplementedError(f"{input_method} not implemented.")

    if output_method == "standard":
        output_scaler = StandardScaler()
    elif output_method == "minmax":
        output_scaler = MinMaxScaler()
    elif output_method == "robust":
        output_scaler = RobustScaler()
    elif output_method == "none":
        output_scaler = StandardScaler()
    else:
        raise NotImplementedError(f"{output_method} not implemented.")

    train_inputs = []
    train_outputs = []

    data_overview = {}

    for filepath in filepaths:
        df = pd.read_csv(filepath, index_col=0)

        # Convert to numpy
        data = df.to_numpy()

        # Split into input (X) and output/target (y)
        X = data[:, 1:].copy()
        y = data[:, 0].copy().reshape(-1, 1)

        if "train" in filepath:
            train_inputs.append(X)
            train_outputs.append(y)
            category = "train"
        elif "test" in filepath:
            category = "test"
        elif "calibrate" in filepath:
            category = "calibrate"

        data_overview[filepath] = {"X": X, "y": y, "category": category}

    X_train = np.concatenate(train_inputs)
    y_train = np.concatenate(train_outputs)

    # Fit the scalers to the training data only
    scaler = scaler.fit(X_train)
    output_scaler = output_scaler.fit(y_train)

    for filepath in data_overview:

        # Scale inputs
        if input_method == "none":
            X = data_overview[filepath]["X"]
        else:
            X = scaler.transform(data_overview[filepath]["X"])

        # Scale outputs
        if output_method == "none":
            y = data_overview[filepath]["y"]
        else:
            y = output_scaler.transform(data_overview[filepath]["y"])

        # Save X and y into a binary file
        np.savez(
            DATA_SCALED_PATH /
            (os.path.basename(filepath).replace(
                data_overview[filepath]["category"] + ".csv",
                data_overview[filepath]["category"] + "-scaled.npz")),
            X=X,
            y=y,
        )

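# For reference, scale() expects a params.yaml section of roughly this shape
# (the values are examples only):
#
#   scale:
#     input: standard    # one of: standard, minmax, robust, none
#     output: minmax     # one of: standard, minmax, robust, none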